In [1]:
# Google Drive 마운트하기
from google.colab import drive
drive.mount('/content/drive')

# 특정 디렉토리로 이동하기
import os

# 예: My Drive 내의 'project' 폴더로 이동

project_path = '/content/drive/MyDrive/GPU-MODE/lectures/lecture_001'
# 디렉토리 존재 여부 확인 후 이동
if os.path.exists(project_path):
    os.chdir(project_path)
    print(f"현재 작업 디렉토리: {os.getcwd()}")
else:
    print(f"디렉토리가 존재하지 않습니다: {project_path}")

# 현재 디렉토리 내용 확인
print("\n현재 디렉토리 내용:")
print(os.listdir())

Mounted at /content/drive
현재 작업 디렉토리: /content/drive/MyDrive/GPU-MODE/lectures/lecture_001

현재 디렉토리 내용:
['pytorch_square.py', 'triton_profile', 'nsys_square.py', 'test.py', 'ncu_logs', 'hello_load_inline.py', 'main.py', 'numba_square.py', 'load_inline.py', 'CUDA MODE_ Lecture 1.pdf', 'README.md', 'tmp', 'load_inline_cuda', 'build', 'setup.py', 'my_module.cpython-311-x86_64-linux-gnu.so', 'pt_profiler.py', '.ipynb_checkpoints', 'breakpoint_square.py', 'triton_square.py', 'square() performance.png', 'square() performance.csv', 'results.html', 'triton-square-profile.py']


In [None]:
!pip install ninja

In [None]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001
!python pt_profiler.py

/content
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     226.081ms       125.60%     226.081ms     113.041ms             2  
                                            aten::copy_         0.01%     122.261us         9.77%     174.209ms      87.104ms     173.478ms        96.38%     173.478ms      86.739ms             2  


# STEP1 #
python, C++ linking을 이용한 컴파일 실행 예제

1. python inline source code 정의
2. source code 분리, python Compiler & Run
3. .so (Library) build & Run

\# Reference.

GPU-MODE/lectures/lecture_001/tmp

GPU-MODE/lectures/lecture_001/hello_load_inline.py


In [None]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001
!python hello_load_inline.py

/content/drive/MyDrive/GPU-MODE/lectures/lecture_001
Emitting ninja build file ./tmp/build.ninja...
Building extension module my_module...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=my_module -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.11/dist-packages/torch/include/THC -isystem /usr/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/main.cpp -o main.o 
[2/2] c++ main.o -shared -L/usr/local/lib/python3.11/dist-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_

In [None]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp

/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp


In [None]:
from torch.utils.cpp_extension import load

#Using JIT build
# 이 한 줄로 컴파일과 로딩을 동시에 수행
my_module = load(
    name="my_module",
    sources=["main.cpp"],
    verbose=True  # 컴파일 과정 출력
)

# 바로 사용 가능
result = my_module.hello_world()
print(result)

Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py311_cu124/my_module...
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/my_module/build.ninja...
Building extension module my_module...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Hello World!


Loading extension module my_module...


# Optional Build Code (Using LD_LABRARY_PATH) #


In [None]:
import torch
import os

# PyTorch 라이브러리 경로 찾기
torch_lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib')
print(f"PyTorch 라이브러리 경로: {torch_lib_path}")

# 환경 변수 설정
os.environ['LD_LIBRARY_PATH'] = torch_lib_path + ':' + os.environ.get('LD_LIBRARY_PATH', '')

# 확인
print(f"설정된 LD_LIBRARY_PATH: {os.environ['LD_LIBRARY_PATH']}")

PyTorch 라이브러리 경로: /usr/local/lib/python3.11/dist-packages/torch/lib
설정된 LD_LIBRARY_PATH: /usr/local/lib/python3.11/dist-packages/torch/lib:/usr/local/lib/python3.11/dist-packages/torch/lib:/usr/local/lib/python3.11/dist-packages/torch/lib:/usr/local/lib/python3.11/dist-packages/torch/lib:/usr/lib64-nvidia


In [None]:
%%writefile setup.py
from setuptools import setup
from torch.utils.cpp_extension import CppExtension, BuildExtension
import torch
import os

include_dirs = [
    os.path.join(os.path.dirname(torch.__file__), 'include'),
    os.path.join(os.path.dirname(torch.__file__), 'include/torch/csrc/api/include')
]

setup(
    name="my_module",
    ext_modules=[
        CppExtension(
            name="my_module",
            sources=["/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/main.cpp"],
            include_dirs=include_dirs,
        )
    ],
    cmdclass={
        "build_ext": BuildExtension
    }
)

Overwriting setup.py


In [None]:
!python setup.py build_ext --inplace

running build_ext
building 'my_module' extension
Emitting ninja build file /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/build/temp.linux-x86_64-cpython-311/build.ninja...
Compiling objects...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
x86_64-linux-gnu-g++ -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/build/temp.linux-x86_64-cpython-311/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/main.o -L/usr/local/lib/python3.11/dist-packages/torch/lib -L/usr/lib/x86_64-linux-gnu -lc10 -ltorch -ltorch_cpu -ltorch_python -o build/lib.linux-x86_64-cpython-311/my_module.cpython-311-x86_64-linux-gnu.so
copying build/lib.linux-x86_64-cpython-311/my_module.cpython-311-x86_64-linux-gnu.so -> 


In [None]:
#To check my_module.so file
!ls -al

total 10607
drwx------ 4 root root    4096 Mar 14 01:54 build
-rw------- 1 root root    1133 Mar 14 01:50 build.ninja
-rw------- 1 root root     209 Mar 11 13:42 main.cpp
-rw------- 1 root root 1777808 Mar 11 13:42 main.o
-rwx------ 1 root root 9074240 Mar 14 04:02 my_module.cpython-311-x86_64-linux-gnu.so
-rw------- 1 root root     112 Mar 11 13:42 .ninja_deps
-rw------- 1 root root     129 Mar 11 13:42 .ninja_log
-rw------- 1 root root     591 Mar 14 04:01 setup.py


In [None]:
import my_module

# hello_world 함수 호출
result = my_module.hello_world()
print(result)

Hello World!


# STEP2 #
**Code Description**

main.cpp:
- 역할: PyTorch C++ 확장 모듈의 진입점
- 주요 구성:
  - #include <torch/extension.h>: PyTorch 확장 모듈 헤더 포함
  - torch::Tensor square_matrix(torch::Tensor matrix): 함수 선언
  - PYBIND11_MODULE: Python으로 C++ 함수를 노출시키는 매크로
  - m.def("square_matrix", ...): Python에서 호출할 함수명 정의

cuda.cu:
- 역할: CUDA 커널 및 C++ 인터페이스 구현
- 주요 구성:
  - __global__ void square_matrix_kernel: CUDA 커널 정의
      - blockIdx, threadIdx: CUDA 스레드 좌표
    - 행렬의 각 요소를 제곱하는 로직
  - torch::Tensor square_matrix: 파이썬에서 호출할 C++ 함수 구현
      - 스레드/블록 구성 설정
    - CUDA 커널 실행
    - 결과 반환
  
**Build & Run**
  1. C++ 컴파일:
    - main.cpp 파일이 C++ 컴파일러로 컴파일됨
    - PyTorch 헤더 포함하여 오브젝트 파일 생성
  2. CUDA 컴파일:
    - cuda.cu 파일이 NVCC로 컴파일됨
    - CUDA 커널 코드를 실행 가능한 형태로 변환
  3. 링킹:
    - 두 오브젝트 파일을 PyTorch, CUDA 라이브러리와 링크
    - 동적 라이브러리(.so) 생성
  4. Python에서 로드:
    - 생성된 .so 파일을 Python 모듈로 로드
    - Python에서 C++/CUDA 함수 호출 가능

In [None]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001

/content/drive/MyDrive/GPU-MODE/lectures/lecture_001


In [None]:
#using python file, python 파일 내 모든 소스코드 정의. 파일 내부에서 C++/CUDA 코드 컴파일 및 정의
!python load_inline.py

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')


In [None]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda

/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda


In [None]:
import torch
from torch.utils.cpp_extension import load

# JIT 컴파일 실행 - 외부 소스 파일 사용
module = load(
    name="square_matrix_extension",
    sources=["/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda/main.cpp",
             "/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda/cuda.cu"],
    verbose=True,
    with_cuda=True
)

# 모듈 테스트 실행
a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device='cuda')
result = module.square_matrix(a)
print(result)

Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/square_matrix_extension/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module square_matrix_extension...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module square_matrix_extension...


tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')


In [None]:
%%writefile setup.py
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension
import torch
import os

# 필요한 include 디렉토리 경로
include_dirs = [
    os.path.join(os.path.dirname(torch.__file__), 'include'),
    os.path.join(os.path.dirname(torch.__file__), 'include/torch/csrc/api/include')
]

setup(
    name="square_matrix_extension",
    ext_modules=[
        CUDAExtension(
            name="square_matrix_extension",
            sources=[
                "/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda/main.cpp",
                "/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda/cuda.cu"
            ],
            include_dirs=include_dirs,
            # 필요한 경우 CUDA 아키텍처 지정
            extra_compile_args={
                'cxx': ['-D_GLIBCXX_USE_CXX11_ABI=0'],
                'nvcc': ['-D_GLIBCXX_USE_CXX11_ABI=0']
            }
        )
    ],
    cmdclass={
        "build_ext": BuildExtension
    }
)

Overwriting setup.py


In [None]:
!python setup.py build_ext --inplace
#To check my_module.so file
!ls -al

running build_ext
building 'square_matrix_extension' extension
creating /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/build/temp.linux-x86_64-cpython-311/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Emitting ninja build file /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/build/temp.linux-x86_64-cpython-311/build.ninja...
Compiling objects...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output /content/drive/MyDrive/GPU-MODE/lectures/lecture_001/tmp/build/temp.linux-x86_64-cpython-311/content/drive/MyDrive/GPU-MODE/lectures/lecture_001/load_inline_cuda/cuda.o.d -I/usr/local/lib/python3.11/dist-packages/torch/include -I/usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.11/dist

In [None]:
import square_matrix_extension

a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device='cuda')
result = square_matrix_extension.square_matrix(a)
print(result)

total 19828
drwx------ 4 root root    4096 Mar 14 01:54 build
-rw------- 1 root root    1357 Mar 14 04:38 build.ninja
-rw------- 1 root root     209 Mar 11 13:42 main.cpp
-rw------- 1 root root 1777808 Mar 11 13:42 main.o
-rwx------ 1 root root 9074240 Mar 14 04:53 my_module.cpython-311-x86_64-linux-gnu.so
-rw------- 1 root root     112 Mar 11 13:42 .ninja_deps
-rw------- 1 root root     129 Mar 11 13:42 .ninja_log
-rw------- 1 root root    1014 Mar 14 04:57 setup.py
-rwx------ 1 root root 9442232 Mar 14 04:58 square_matrix_extension.cpython-311-x86_64-linux-gnu.so
tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')


# Triton Optimization #
Triton을 이용하여 CUDA GPU, Square 연산 최적화
1. Triton Kernel Code 정의
  - @triton.jit : CUDA Code 컴파일
  - row_idx = tl.program_id(0) : 인스턴스 ID load, 행처리 최적화
    GPU에서는 1개의 block이 1개의 행을 담당하며 병령성을 확보
  - row 시작 주소, 인덱스 배열 생성, 메모리 포인터 계산
  - tl.load() : GPU에서 SRAM으로 로드 & Masking
  - Operation
  - Save : 출력 위치 계산, 및 저장
2. Python Interface ( square 함수 ) 정의
  - 함수 정의
  - BLOCK_SIZE = triton.next_power_of_2(n_cols): 입력 크기에 맞는 최적화된 연산 단위 계산
  - num_warps : 32개의 스레드로 구성된 GPU 실행 단위
    - 워프수를 늘리면 많은 스레드가 한개의 행을 처리
    - 큰 사이즈 행렬일수록 많은 병렬처리가 필요
  - 커널 실행 및 결과 반환
3. 검증 테스트
  - UT
  - benchmark test

In [9]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001
!python triton_square.py

/content
Figure(640x480)
square() performance:
        N       Triton  Torch (native)  Torch (compiled)
0  6400.0  1428.266277      246.691253        251.577740
1  6528.0  1430.794464      240.110348        242.884678
2  6656.0  1467.171136      237.391379        243.063518
3  6784.0  1471.003849      237.303274        242.895666
4  6912.0  1418.840958      237.744381        243.059340


### NCU를 이용한 square 성능 Profile ###
- Block Size를 변경하며 square 연산을 단순 반복
- nuc를 붙혀서 각 인스턴스에 대한 성능 분석
- Memory TP 향상을 위한 간단한 최적화

In [11]:
%cd /content/drive/MyDrive/GPU-MODE/lectures/lecture_001
print("### ORG Square ###")
!python triton-square-profile.py

print("### OPT Square ###")
!python triton-square-opt.py

/content
### ORG Square ###
Triton Square 연산 프로파일링 시작

크기: 1024 요소
평균 실행 시간: 0.0404 ms
처리량: 0.0254 십억 요소/초
최대 오차: 0.0

크기: 8192 요소
평균 실행 시간: 0.0373 ms
처리량: 0.2195 십억 요소/초
최대 오차: 0.0

크기: 65536 요소
평균 실행 시간: 0.0372 ms
처리량: 1.7636 십억 요소/초
최대 오차: 0.0

크기: 524288 요소
평균 실행 시간: 0.0384 ms
처리량: 13.6688 십억 요소/초
최대 오차: 0.0

크기: 4194304 요소
평균 실행 시간: 0.1406 ms
처리량: 29.8390 십억 요소/초
최대 오차: 0.0

Triton Square 연산 프로파일링 완료
### OPT Square ###
Triton Square 연산 프로파일링 시작

크기: 1024 요소
평균 실행 시간: 0.0396 ms
처리량: 0.0259 십억 요소/초
최대 오차: 0.0

크기: 8192 요소
평균 실행 시간: 0.0383 ms
처리량: 0.2141 십억 요소/초
최대 오차: 0.0

크기: 65536 요소
평균 실행 시간: 0.0371 ms
처리량: 1.7674 십억 요소/초
최대 오차: 0.0

크기: 524288 요소
평균 실행 시간: 0.0381 ms
처리량: 13.7644 십억 요소/초
최대 오차: 0.0

크기: 4194304 요소
평균 실행 시간: 0.1404 ms
처리량: 29.8686 십억 요소/초
최대 오차: 0.0

Triton Square 연산 프로파일링 완료


In [5]:
!ncu -o profile_result --set full python triton-square-profile.py

Triton Square 연산 프로파일링 시작

크기: 1024 요소
==PROF== Connected to process 1570 (/usr/bin/python3.11)
==PROF== Profiling "distribution_elementwise_grid..." - 0: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 1: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 2: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 3: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 4: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 5: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 6: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 7: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 8: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 9: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 10: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 11: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 12: 0%

In [7]:
!ncu -o profile_result --set full python triton-square-opt.py

Triton Square 연산 프로파일링 시작

크기: 1024 요소
==PROF== Connected to process 25497 (/usr/bin/python3.11)
==PROF== Profiling "distribution_elementwise_grid..." - 0: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 1: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 2: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 3: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 4: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 5: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 6: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 7: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 8: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 9: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 10: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 11: 0%....50%....100% - 30 passes
==PROF== Profiling "square_kernel" - 12: 0