In [40]:
# Ninja is the build tool used when torch.utils.cpp_extension.load JIT-compiles
# kernel.cu further down in this notebook.
%pip install ninja



In [41]:
#@markdown ### Код
%%writefile kernel.cu

#include <limits>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/extension.h>


// Element-wise addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
//
// Each thread handles at most one element, so the launch has to provide at
// least n threads in total along the x dimension. Threads whose global index
// is >= n do nothing.
//
//   a, b  read-only input arrays holding at least n floats each
//   c     output array holding at least n floats
//   n     number of elements to process
__global__ void d_add(const float *a, const float *b, float *c, int n) {
    // Build the global index in 64 bits. The 32-bit form
    // `blockDim.x * blockIdx.x + threadIdx.x` is evaluated as unsigned and
    // then narrowed to int: when n is within one block of INT_MAX, the padded
    // tail of the grid yields values above INT_MAX that turn negative, pass
    // the `i < n` test and index out of bounds.
    const long long i =
        static_cast<long long>(blockDim.x) * blockIdx.x + threadIdx.x;

    if (i < n) {
        c[i] = a[i] + b[i];
    }
}


// Input-validation helpers. Each one expands to a TORCH_CHECK whose message
// names the offending argument. Macro arguments are parenthesised so that any
// expression can be passed safely.
#define CHECK_CUDA(x) TORCH_CHECK((x).device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
// Wrapped in do { } while (0) so both checks form one statement and stay
// together when the macro is used under an unbraced `if`/`else`.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)
#define CHECK_SIZE(x, y) TORCH_CHECK((x).is_same_size(y), #y " must be the same size as " #x)

// Threads per block used for the kernel launch.
#define BLOCK_SIZE 128
// Number of BLOCK_SIZE-wide blocks needed to cover m elements (ceiling division).
#define CALC_GRID_SIZE(m) (((m) + BLOCK_SIZE - 1) / BLOCK_SIZE)


// Adds two float32 CUDA tensors element-wise and returns the sum as a new
// tensor with the same shape as `a`.
//
// Requirements (violations raise through TORCH_CHECK):
//   - a and b are contiguous CUDA tensors on the same device
//   - a and b have identical sizes and dtype float32
//   - the element count fits in an int, because d_add takes its length as int
//
// The kernel is enqueued on PyTorch's current stream for the tensors' device;
// like other CUDA work it runs asynchronously with respect to the host.
torch::Tensor my_add(torch::Tensor a, torch::Tensor b) {
    CHECK_INPUT(a);
    CHECK_INPUT(b);
    CHECK_SIZE(a, b);
    TORCH_CHECK(a.device() == b.device(),
                "a and b must be on the same CUDA device");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32, "a must be a float32 tensor");
    TORCH_CHECK(b.scalar_type() == torch::kFloat32, "b must be a float32 tensor");

    auto c = torch::empty_like(a);
    const int64_t n = a.numel();

    // A launch with zero blocks is an invalid configuration, so an empty
    // input is answered directly with the (empty) result tensor.
    if (n == 0) {
        return c;
    }

    TORCH_CHECK(n <= std::numeric_limits<int>::max(),
                "my_add supports at most ", std::numeric_limits<int>::max(),
                " elements, got ", n);

    // Run on the device that owns the data and on PyTorch's current stream,
    // not on whatever device/stream happens to be the process default.
    const c10::cuda::CUDAGuard device_guard(a.device());
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // n is int64_t here, so the ceiling division cannot overflow.
    const unsigned int grid = static_cast<unsigned int>(CALC_GRID_SIZE(n));

    d_add<<<grid, BLOCK_SIZE, 0, stream>>>(
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        static_cast<int>(n)
    );

    // Report launch-time failures here instead of letting them surface from
    // some unrelated later CUDA call.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "d_add kernel launch failed: ", cudaGetErrorString(launch_status));

    return c;
}


// Python bindings: registers my_add on the extension module, whose name is
// supplied at build time through TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "my_add",
        &my_add,
        "Custom vector addition"
    );
}

Overwriting kernel.cu


In [42]:
import torch
from torch.utils.cpp_extension import load


# JIT-compile kernel.cu (written by the %%writefile cell above) and import the
# resulting module; -O3 is passed to both the host and the CUDA compiler.
my_ext = load(
    name="my_extension",
    sources=["kernel.cu"],
    extra_cflags=["-O3"],
    extra_cuda_cflags=["-O3"],
)

In [43]:
import torch.utils.benchmark as benchmark
from tqdm.autonotebook import tqdm


def add_benchmark(low, high, step, add_impl, dtype=torch.float32,
                  device='cuda:0', bar_label='', r=101):
    """Time ``add_impl`` on vector prefixes of increasing length.

    For every length ``m`` in ``range(low, high + 1, step)`` the call
    ``add_impl(a[:m], b[:m])`` is measured with
    ``benchmark.Timer(...).blocked_autorange()`` and its median is recorded.

    The operands are filled uniformly between ``-r`` and ``r`` and cached on
    the function object (``add_benchmark.a`` / ``add_benchmark.b``) so that
    consecutive runs can reuse them. The cache is rebuilt whenever it is too
    short for the requested ``high`` or was created for a different
    ``device``, ``dtype`` or ``r``.

    Args:
        low: Smallest vector length to benchmark.
        high: Largest vector length to benchmark (inclusive).
        step: Increment between consecutive lengths; must be positive and
            smaller than ``high - low``.
        add_impl: Callable taking two equally sized tensors.
        dtype: Element type of the operand tensors.
        device: Device on which the operand tensors are allocated.
        bar_label: Progress-bar caption; a default caption is used if empty.
        r: Bound of the uniform range used to fill the operands.

    Returns:
        dict mapping each benchmarked length ``m`` to the median time of one
        call, in seconds.

    Raises:
        AssertionError: If the ``low`` / ``high`` / ``step`` constraints are
            violated.
    """
    assert low < high
    assert 0 < step < high - low
    high += 1  # make the upper bound inclusive for range()

    # Not used for timing any more; kept so that code which calls
    # ``add_benchmark.add_m`` directly continues to work.
    add_benchmark.add_m = lambda a, b, m: add_impl(a[:m], b[:m])

    # Decide whether the cached operands can be reused. A cache that is merely
    # longer than needed is fine because every run slices a prefix from it.
    cache_key = (torch.device(device), dtype, r)
    cached = getattr(add_benchmark, 'a', None)
    needs_alloc = (
        cached is None
        or getattr(add_benchmark, '_cache_key', None) != cache_key
        or cached.numel() < high
    )
    del cached  # do not keep the old tensor alive through this local

    if needs_alloc:
        # Release the previous operands first so old and new buffers never
        # have to coexist in device memory.
        add_benchmark.a = add_benchmark.b = None

        add_benchmark.a = torch.empty(
            (high, ), device=device, dtype=dtype).uniform_(-r, r)

        add_benchmark.b = torch.empty(
            (high, ), device=device, dtype=dtype).uniform_(-r, r)

        add_benchmark._cache_key = cache_key

    stamps = {}
    lengths = range(low, high, step)

    if not bar_label:
        bar_label = 'Vector Addition Benchmark'

    # len(range) is the exact number of iterations; (high - low) // step
    # undercounts by one whenever step does not divide the span evenly.
    with tqdm(lengths, desc=bar_label, total=len(lengths)) as test:
        for m in test:
            # Everything the statement needs is passed through ``globals``,
            # so the timer does not depend on this function being importable
            # from ``__main__``. The prefixes are sliced up front so that only
            # the addition itself is timed.
            t = benchmark.Timer(
                    stmt='add_impl(a, b)',
                    globals={
                        'add_impl': add_impl,
                        'a': add_benchmark.a[:m],
                        'b': add_benchmark.b[:m]
                    }
                )

            timing = t.blocked_autorange().median
            stamps[m] = timing

            test.set_postfix(timing=f'{timing:.3e}s')

            # Drain outstanding GPU work before the next measurement; there is
            # nothing to synchronise when the operands live on the CPU.
            if add_benchmark.a.is_cuda:
                torch.cuda.synchronize(add_benchmark.a.device)

    return stamps

  from tqdm.autonotebook import tqdm


In [None]:
# Sweep bounds for the benchmark, expressed as vector lengths in elements.
LOW = 5120000 #@param {type:"number"}
HIGH = 512000000 #@param {type:"number"}
STEP = 5120000 #@param {type:"number"}

# Median timing per vector length for the custom CUDA addition.
stamps = add_benchmark(
    low=LOW,
    high=HIGH,
    step=STEP,
    add_impl=my_ext.my_add,
)

In [None]:
# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))