https://www.youtube.com/watch?v=4sgKnKbR-WE

In [12]:
%load_ext wurlitzer

The wurlitzer extension is already loaded. To reload it, use:
  %reload_ext wurlitzer


In [None]:
import torch
from torch.utils.cpp_extension import load_inline

In [16]:

def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False, name="inline_ext"):
    """Compile CUDA/C++ source strings into an extension module via ``load_inline``.

    Args:
        cuda_src: CUDA source code as a single string (wrapped in a one-element list).
        cpp_src: C++ source as a single string (wrapped in a one-element list).
        funcs: Function names forwarded to ``load_inline`` as ``functions``.
        opt: When true, ``-O2`` is passed in ``extra_cuda_cflags``; otherwise
            no extra CUDA flags are passed.
        verbose: Forwarded unchanged to ``load_inline``.
        name: Extension name forwarded to ``load_inline``. Defaults to
            ``"inline_ext"`` (the value that used to be hard-coded), so
            existing callers are unaffected; pass a distinct name to keep
            several inline extensions apart.

    Returns:
        Whatever ``load_inline`` returns for the given sources.
    """
    extra_cuda_cflags = ["-O2"] if opt else []
    return load_inline(
        cuda_sources=[cuda_src],
        cpp_sources=[cpp_src],
        functions=funcs,
        extra_cuda_cflags=extra_cuda_cflags,
        verbose=verbose,
        name=name,
    )

In [17]:
# Shared C++/CUDA prelude that is prepended to each CUDA source string
# (e.g. `cuda_src = cuda_begin + ...`). It pulls in the torch extension
# headers and defines input-checking macros plus a ceiling-division helper.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Wrapped in do/while(0) so both checks stay together when the macro is the
// body of an unbraced if/else; `CHECK_INPUT(t);` is still used the same way.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Ceiling division. Written as quotient + remainder test so that it cannot
// wrap around the way (a + b - 1) does when a is close to UINT_MAX.
inline unsigned int cdiv(unsigned int a, unsigned int b) { return a / b + (a % b != 0); }
'''

In [18]:
# CUDA source for the hello-world extension, appended to the shared
# `cuda_begin` prelude.
#   * `hello_cuda` is a kernel in which every thread prints its threadIdx.x
#     and blockIdx.x.
#   * `hello` launches that kernel as <<<1, 5>>> (one block of five threads),
#     runs C10_CUDA_KERNEL_LAUNCH_CHECK() right after the launch, and returns
#     a default-constructed torch::Tensor.
# NOTE(review): `hello` contains no explicit device synchronization after the
# launch, so the printf output presumably appears only once the device-side
# buffer is flushed — confirm this is the intended demonstration.
cuda_src = cuda_begin + r'''
__global__ void hello_cuda() {
    printf("Hello, World from GPU! (thread %d, block %d)\n", threadIdx.x, blockIdx.x);
}

torch::Tensor hello() {
    hello_cuda<<<1, 5>>>();
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return torch::Tensor();
}'''

In [19]:
# C++-side declaration of `hello`; this string is what gets handed to
# `load_cuda` as its `cpp_src` argument.
cpp_src = 'torch::Tensor hello();'


In [20]:
# Build the inline extension from the two source strings and expose `hello`;
# verbose=True is forwarded to the build step.
module = load_cuda(
    cuda_src=cuda_src,
    cpp_src=cpp_src,
    funcs=["hello"],
    verbose=True,
)


In [21]:
# Inspect the attributes of the compiled module; the exported `hello`
# function is listed alongside the usual module dunder attributes.
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'hello']

In [None]:
%%time

# Call the compiled binding, which launches the hello-world kernel.
# In the recorded output the %%time report is printed before the five
# "Hello, World from GPU!" lines.
# NOTE(review): `hello` does not synchronize explicitly, so the reported
# time presumably covers only the launch, not kernel execution — confirm.
module.hello()

CPU times: user 34 μs, sys: 17 μs, total: 51 μs
Wall time: 55.3 μs


Hello, World from GPU! (thread 0, block 0)
Hello, World from GPU! (thread 1, block 0)
Hello, World from GPU! (thread 2, block 0)
Hello, World from GPU! (thread 3, block 0)
Hello, World from GPU! (thread 4, block 0)
