# Adding 10 million Vectors with Numba

In [12]:
import numpy as np
from numba import jit, cuda
import time as time_module

# 1. Pure Python version (baseline - slowest)
def vector_add_python(a, b, c):
    for i in range(len(a)):
        c[i] = a[i] + b[i]

# 2. Numba CPU version (JIT compiled)
@jit(nopython=True)
def vector_add_numba_cpu(a, b, c):
    for i in range(len(a)):
        c[i] = a[i] + b[i]

# 3. Numba CUDA version (GPU kernel)
@cuda.jit
def vector_add_cuda_kernel(a, b, c):
    idx = cuda.grid(1)
    if idx < c.size:
        c[idx] = a[idx] + b[idx]

def vector_add_cuda(a, b):
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    
    threads_per_block = 256
    blocks_per_grid = (a.size + threads_per_block - 1) // threads_per_block
    
    vector_add_cuda_kernel[blocks_per_grid, threads_per_block](d_a, d_b, d_c)
    
    c = d_c.copy_to_host()
    return c

def vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid):
    """GPU kernel without memory transfers - data already on GPU"""
    vector_add_cuda_kernel[blocks_per_grid, threads_per_block](d_a, d_b, d_c)
    cuda.synchronize()

def benchmark(func, *args, warmup=True):
    if warmup:
        func(*args)
    
    times = []
    for _ in range(5):
        start = time_module.perf_counter()
        result = func(*args)
        end = time_module.perf_counter()
        times.append(end - start)
    
    return np.mean(times), result

def check_cuda():
    try:
        cuda.select_device(0)
        cuda.current_context()
        return True
    except Exception as e:
        return False

def format_time(ms):
    """Format time with appropriate units"""
    if ms >= 1000:
        return f"{ms/1000:.2f}s"
    elif ms >= 1:
        return f"{ms:.2f}ms"
    else:
        return f"{ms*1000:.2f}μs"

In [15]:
N = 10_000_000
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)

print(f"\nVector Addition Benchmark: {N:,} elements\n")

# Benchmark
c_python = np.zeros_like(a)
time_python, _ = benchmark(vector_add_python, a, b, c_python, warmup=False)

c_cpu = np.zeros_like(a)
time_cpu, _ = benchmark(vector_add_numba_cpu, a, b, c_cpu)

cuda_available = check_cuda()

if cuda_available:
    # GPU with transfer
    time_gpu, c_gpu = benchmark(vector_add_cuda, a, b)
    
    # GPU kernel only
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    threads_per_block = 256
    blocks_per_grid = (a.size + threads_per_block - 1) // threads_per_block
    
    vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid)
    times = []
    for _ in range(5):
        start = time_module.perf_counter()
        vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid)
        times.append(time_module.perf_counter() - start)
    time_kernel = np.mean(times)
    
    print("┌─────────────────────────────┬──────────┬──────────────┬─────────────┐")
    print("│ Method                      │ Time     │ vs Python    │ vs CPU      │")
    print("├─────────────────────────────┼──────────┼──────────────┼─────────────┤")
    print(f"│ Pure Python                 │ {format_time(time_python*1000):>8} │ 1.0x         │ -           │")
    print(f"│ Numba CPU                   │ {format_time(time_cpu*1000):>8} │ {time_python/time_cpu:>5.0f}x faster │ -           │")
    print(f"│ GPU (with transfer)         │ {format_time(time_gpu*1000):>8} │ {time_python/time_gpu:>5.0f}x faster │ {time_cpu/time_gpu:>5.1f}x slower │")
    
    # Format kernel time to show both ms and μs for consistency
    kernel_ms = time_kernel * 1000
    if kernel_ms < 1:
        kernel_display = f"{kernel_ms:.2f}ms"
    else:
        kernel_display = format_time(kernel_ms)
    
    print(f"│ GPU (kernel only)           │ {kernel_display:>8} │ {time_python/time_kernel:>5.0f}x faster │ {time_cpu/time_kernel:>5.0f}x faster │")
    print("└─────────────────────────────┴──────────┴──────────────┴─────────────┘")
else:
    # CPU only table
    print("┌─────────────────────────────┬──────────┬──────────────┐")
    print("│ Method                      │ Time     │ vs Python    │")
    print("├─────────────────────────────┼──────────┼──────────────┤")
    print(f"│ Pure Python                 │ {format_time(time_python*1000):>8} │ 1.0x         │")
    print(f"│ Numba CPU                   │ {format_time(time_cpu*1000):>8} │ {time_python/time_cpu:>5.0f}x faster │")
    print("└─────────────────────────────┴──────────┴──────────────┘")
    print("\n(GPU not available)\n")


Vector Addition Benchmark: 10,000,000 elements

┌─────────────────────────────┬──────────┬──────────────┬─────────────┐
│ Method                      │ Time     │ vs Python    │ vs CPU      │
├─────────────────────────────┼──────────┼──────────────┼─────────────┤
│ Pure Python                 │ 684.34ms │ 1.0x         │ -           │
│ Numba CPU                   │   2.80ms │   245x faster │ -           │
│ GPU (with transfer)         │  10.95ms │    63x faster │   0.3x slower │
│ GPU (kernel only)           │   0.08ms │  8788x faster │    36x faster │
└─────────────────────────────┴──────────┴──────────────┴─────────────┘


# Scaling the operations

In [16]:
import numpy as np
from numba import jit, cuda
import time as time_module

@jit(nopython=True)
def vector_add_cpu(a, b, c):
    for i in range(len(a)):
        c[i] = a[i] + b[i]

@cuda.jit
def vector_add_gpu_kernel(a, b, c):
    idx = cuda.grid(1)
    if idx < c.size:
        c[idx] = a[idx] + b[idx]

def benchmark_cpu_n_ops(a, b, n_ops):
    """Benchmark CPU for n operations"""
    c = np.zeros_like(a)
    
    # Warmup
    vector_add_cpu(a, b, c)
    
    # Measure
    start = time_module.perf_counter()
    for _ in range(n_ops):
        vector_add_cpu(a, b, c)
    end = time_module.perf_counter()
    
    return end - start

def benchmark_gpu_n_ops(a, b, n_ops):
    """Benchmark GPU for n operations (data stays on GPU)"""
    # Transfer to GPU once
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    
    threads_per_block = 256
    blocks_per_grid = (a.size + threads_per_block - 1) // threads_per_block
    
    # Warmup
    vector_add_gpu_kernel[blocks_per_grid, threads_per_block](d_a, d_b, d_c)
    cuda.synchronize()
    
    # Measure (including final transfer back)
    start = time_module.perf_counter()
    for _ in range(n_ops):
        vector_add_gpu_kernel[blocks_per_grid, threads_per_block](d_a, d_b, d_c)
    cuda.synchronize()
    result = d_c.copy_to_host()  # Transfer back
    end = time_module.perf_counter()
    
    return end - start

def check_cuda():
    try:
        cuda.select_device(0)
        cuda.current_context()
        return True
    except:
        return False

In [20]:
N = 10_000_000
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)

print(f"\nScaling Benchmark: CPU vs GPU ({N:,} elements)\n")

cuda_available = check_cuda()

if not cuda_available:
    print("GPU not available - only showing CPU results\n")

# Test different numbers of operations
operation_counts = [1, 2, 5, 10, 20, 50, 100, 200, 500]

print("┌──────────────┬────────────┬────────────┬──────────────┬──────────────┐")
print("│ # Operations │  CPU Time  │  GPU Time  │   Speedup    │    Winner    │")
print("├──────────────┼────────────┼────────────┼──────────────┼──────────────┤")

for n_ops in operation_counts:
    # CPU benchmark
    time_cpu = benchmark_cpu_n_ops(a, b, n_ops)
    
    if cuda_available:
        # GPU benchmark
        time_gpu = benchmark_gpu_n_ops(a, b, n_ops)
        
        # Calculate speedup
        if time_gpu < time_cpu:
            speedup = time_cpu / time_gpu
            winner = "GPU"
            speedup_str = f"{speedup:>5.2f}x faster"
        else:
            speedup = time_gpu / time_cpu
            winner = "CPU"
            speedup_str = f"{speedup:>5.2f}x slower"
        
        # Format times
        cpu_time_str = f"{time_cpu*1000:>7.2f} ms"
        gpu_time_str = f"{time_gpu*1000:>7.2f} ms"
        
        print(f"│ {n_ops:>12} │ {cpu_time_str} │ {gpu_time_str} │ {speedup_str} │ {winner:^12} │")
    else:
        cpu_time_str = f"{time_cpu*1000:>7.2f} ms"
        print(f"│ {n_ops:>12} │ {cpu_time_str} │     N/A    │      N/A     │     N/A      │")

print("└──────────────┴────────────┴────────────┴──────────────┴──────────────┘")




Scaling Benchmark: CPU vs GPU (10,000,000 elements)

┌──────────────┬────────────┬────────────┬──────────────┬──────────────┐
│ # Operations │  CPU Time  │  GPU Time  │   Speedup    │    Winner    │
├──────────────┼────────────┼────────────┼──────────────┼──────────────┤
│            1 │    2.93 ms │    7.12 ms │  2.43x slower │     CPU      │
│            2 │    6.68 ms │    6.97 ms │  1.04x slower │     CPU      │
│            5 │   15.69 ms │    7.18 ms │  2.18x faster │     GPU      │
│           10 │   32.47 ms │    6.93 ms │  4.69x faster │     GPU      │
│           20 │   62.38 ms │    7.95 ms │  7.85x faster │     GPU      │
│           50 │  152.07 ms │    9.55 ms │ 15.93x faster │     GPU      │
│          100 │  319.62 ms │   12.99 ms │ 24.61x faster │     GPU      │
│          200 │  631.49 ms │   19.58 ms │ 32.26x faster │     GPU      │
│          500 │ 1536.38 ms │   39.60 ms │ 38.80x faster │     GPU      │
└──────────────┴────────────┴────────────┴──────────────┴────