In [1]:
import numpy as np
from numba import jit
import time

@jit(nopython=True)  # Optimize for CPU
def numba_vector_add(a, b):
    return a + b

# Generate data
n = 10000000
a = np.random.rand(n)
b = np.random.rand(n)

# Benchmark
start = time.time()
result = numba_vector_add(a, b)
print("Numba (CPU):", time.time() - start, "seconds")

Numba (CPU): 0.5326104164123535 seconds


In [2]:
from numba import cuda

@cuda.jit
def numba_gpu_add(a, b, out):
    i = cuda.grid(1)
    if i < len(a):
        out[i] = a[i] + b[i]

# Copy data to GPU
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
out_gpu = cuda.device_array_like(a_gpu)

# Benchmark
start = time.time()
numba_gpu_add[128, 64](a_gpu, b_gpu, out_gpu)  # Grid/block tuning
result = out_gpu.copy_to_host()
print("Numba (GPU):", time.time() - start, "seconds")

Numba (GPU): 0.15823602676391602 seconds


In [3]:
import cupy as cp

# Transfer data to GPU
a_cp = cp.array(a)
b_cp = cp.array(b)

# Benchmark
start = time.time()
result = a_cp + b_cp  # CuPy automatically runs on GPU
cp.cuda.Stream.null.synchronize()  # Ensure GPU finishes
print("CuPy (GPU):", time.time() - start, "seconds")

CuPy (GPU): 0.035204410552978516 seconds


In [4]:
@jit(nopython=True)
def numba_matmul(a, b):
    return a @ b

a = np.random.rand(1000, 1000)
b = np.random.rand(1000, 1000)

start = time.time()
numba_matmul(a, b)
print("Numba (CPU) MatMul:", time.time() - start)

Numba (CPU) MatMul: 0.992711067199707


In [6]:
a_cp = cp.array(a)
b_cp = cp.array(b)

start = time.time()
result = a_cp @ b_cp
cp.cuda.Stream.null.synchronize()
print("CuPy (GPU) MatMul:", time.time() - start)

CuPy (GPU) MatMul: 0.12929010391235352
