In [48]:
import numpy as np
import numpy.typing as npt
from numba import cuda
import time

@cuda.jit
def kernel(a: npt.NDArray[np.float32], b: npt.NDArray[np.float32], result: npt.NDArray[np.float32]) -> None:
    """Element-wise product of two 1-D arrays: ``result[i] = a[i] * b[i]``.

    Each thread handles the single element at its absolute position in the
    1-D launch grid.

    Args:
        a: First input array.
        b: Second input array.
        result: Output array, written in place.

    Elements are only processed where the index is valid for all three
    arrays, so a launch with mismatched lengths leaves the tail of the
    longer arrays untouched instead of accessing memory out of bounds.
    """
    idx = cuda.grid(1)

    # The grid is usually rounded up to a whole number of blocks, so surplus
    # threads must do nothing. Guard against every array that is indexed,
    # not just `a`, so a shorter `b` or `result` cannot be overrun.
    if idx < a.size and idx < b.size and idx < result.size:
        result[idx] = a[idx] * b[idx]

In [50]:
# Benchmark: element-wise float32 multiply, NumPy on the CPU vs. the CUDA kernel.
size = 100_000_000

# Seeded generator so every run multiplies the same inputs. Drawing float32
# directly avoids building a float64 temporary for each 100M-element array.
rng = np.random.default_rng(0)
a = rng.random(size, dtype=np.float32)
b = rng.random(size, dtype=np.float32)

# perf_counter is the clock intended for measuring short intervals; time.time
# is a wall clock that can be adjusted while the measurement is running.
start_cpu = time.perf_counter()
result_cpu = a * b
end_cpu = time.perf_counter()
cpu_time = end_cpu - start_cpu
print(f"CPU {cpu_time}")


d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array(size, dtype=np.float32)

threads_per_block = 256
# Ceiling division: enough blocks to cover every element.
num_blocks = (size + (threads_per_block - 1)) // threads_per_block

# Warm-up launch. The kernel is compiled on its first call, so without this
# the timed launch below would include compilation on a fresh notebook kernel.
kernel[num_blocks, threads_per_block](d_a, d_b, d_c)
cuda.synchronize()

# NOTE: only the kernel execution is timed. The host<->device copies
# (to_device above, copy_to_host below) are outside the measured interval.
start_gpu = time.perf_counter()
kernel[num_blocks, threads_per_block](d_a, d_b, d_c)
cuda.synchronize()  # wait for the launch to finish before stopping the clock
end_gpu = time.perf_counter()

result_gpu = d_c.copy_to_host()
gpu_time = end_gpu - start_gpu
print(f"GPU {gpu_time}")

# A timing comparison is only meaningful if both paths computed the same thing.
np.testing.assert_allclose(result_gpu, result_cpu, rtol=1e-6)

CPU 0.09381270408630371
GPU 0.01540231704711914
