<a href="https://colab.research.google.com/github/ickma2311/mycolab/blob/main/cuda/vector_add.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install cupy-cuda12x




In [2]:
# CUDA C source for the element-wise vector addition kernel `vector_add`.
# Each thread derives one global index `i` from its block and thread indices
# and, when `i < n`, writes c[i] = a[i] + b[i]; the bounds check guards threads
# whose index falls past the end of the arrays.
# `extern "C"` keeps the symbol name unmangled (presumably so it can be looked
# up by the plain name 'vector_add' — see the CuPy RawModule docs).
kernal_code="""
extern "C"
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

"""

In [3]:
import cupy as cp
import numpy as np


In [4]:
def calculated_expected_resultes(n, seed=None):
  """Create two random float32 vectors and their element-wise sum.

  Args:
    n: Number of elements in each vector.
    seed: Optional seed for the random generator. Pass an int to get the
      same vectors on every run; ``None`` (the default) gives different
      values on each call.

  Returns:
    Tuple ``(h_a, h_b, h_c)`` of 1-D ``np.float32`` arrays of length ``n``
    with values drawn uniformly from [0, 1) for ``h_a`` and ``h_b``, and
    ``h_c = h_a + h_b`` as the CPU reference result.
  """
  rng = np.random.default_rng(seed)
  # Draw float32 directly: rand(n).astype(np.float32) would first build a
  # float64 array twice the size of the result.
  h_a = rng.random(n, dtype=np.float32)
  h_b = rng.random(n, dtype=np.float32)
  h_c = h_a + h_b
  return h_a, h_b, h_c

In [5]:
# Vector length: 100 million elements.
n = 100_000_000
# Host inputs plus the reference sum that the GPU result is compared against.
h_a, h_b, expected_c = calculated_expected_resultes(n)

In [6]:
%%timeit
h_a+h_b
# Baseline: how long the same element-wise addition takes on the CPU with NumPy.

145 ms ± 751 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Device copies of the two host inputs.
d_a, d_b = cp.asarray(h_a), cp.asarray(h_b)
# Zero-filled device output with the same shape and dtype as the inputs.
d_c = cp.zeros_like(d_a)

In [8]:
# Block until all GPU work queued so far has completed
# (presumably so earlier transfers do not leak into the kernel timing — confirm).
cp.cuda.runtime.deviceSynchronize()

In [9]:
# Build a CuPy module from the CUDA source string, then fetch the kernel by name.
# NOTE(review): `moudle` is a typo for `module`; the name is kept in case other
# cells reference it.
moudle = cp.RawModule(code=kernal_code)
vector_add_kernel = moudle.get_function('vector_add')

In [10]:
import math

# Launch configuration: 256 threads per block, with the block count rounded up
# so that every one of the n elements is covered by a thread.
threads_per_block = 256
blocks_per_grid = math.ceil(n / threads_per_block)

In [11]:
%%time
vector_add_kernel(
        (blocks_per_grid,),          # Grid dimensions (tuple)
        (threads_per_block,),       # Block dimensions (tuple)
        (d_a, d_b, d_c, np.int32(n)) # Arguments (tuple)
    )
cp.cuda.runtime.deviceSynchronize()

CPU times: user 4.96 ms, sys: 0 ns, total: 4.96 ms
Wall time: 4.72 ms


In [12]:
# Copy the GPU result back to host memory as a NumPy array.
h_c=d_c.get()

In [15]:
# Raises AssertionError if any element differs from the CPU reference by more
# than the absolute tolerance. assert_allclose returns None on success, so its
# result is not printed; an explicit message reports the pass instead.
np.testing.assert_allclose(h_c, expected_c, atol=1e-5, rtol=0)
print("GPU result matches the CPU reference")


None
