In [15]:
import numpy as np
from numba import jit, cuda
import time as time_module

# 1. Pure Python version (baseline - slowest)
def vector_add_python(a, b, c):
    """Element-wise ``c[i] = a[i] + b[i]`` using a plain interpreted loop.

    The loop runs over ``len(a)`` positions and writes into ``c`` in place;
    nothing is returned. ``b`` and ``c`` must be indexable at every one of
    those positions.
    """
    n = len(a)
    for idx in range(n):
        c[idx] = a[idx] + b[idx]

# 2. Numba CPU version (JIT compiled)
@jit(nopython=True)
def vector_add_numba_cpu(a, b, c):
    """Same element-wise add as the pure-Python version, compiled by Numba.

    Writes ``a[k] + b[k]`` into ``c[k]`` for every ``k`` in ``range(len(a))``;
    the output buffer ``c`` is modified in place and nothing is returned.
    """
    size = len(a)
    for k in range(size):
        c[k] = a[k] + b[k]

# 3. Numba CUDA version (GPU kernel)
@cuda.jit
def vector_add_cuda_kernel(a, b, c):
    """CUDA kernel: each thread writes one element of ``c = a + b``.

    The thread's position comes from ``cuda.grid(1)``. Threads whose position
    is at or past ``c.size`` do nothing, so the launch grid may be larger than
    the array.
    """
    pos = cuda.grid(1)
    if pos >= c.size:
        # Surplus thread from a rounded-up grid: nothing to write.
        return
    c[pos] = a[pos] + b[pos]

def vector_add_cuda(a, b):
    """Add two host arrays on the GPU, including all host<->device copies.

    Copies ``a`` and ``b`` to the device, allocates an output buffer shaped
    like ``a``, launches ``vector_add_cuda_kernel`` with 256 threads per block
    and enough blocks to cover ``a.size``, then copies the result back.

    Returns:
        The sum as a host array obtained from ``copy_to_host()``.
    """
    dev_a = cuda.to_device(a)
    dev_b = cuda.to_device(b)
    dev_out = cuda.device_array_like(a)

    block_size = 256
    # Ceiling division: one extra block when the size is not a multiple of
    # the block size.
    full_blocks, leftover = divmod(a.size, block_size)
    grid_size = full_blocks + (1 if leftover else 0)

    vector_add_cuda_kernel[grid_size, block_size](dev_a, dev_b, dev_out)

    return dev_out.copy_to_host()

def vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid):
    """Launch the add kernel on buffers that are already on the device.

    No host<->device copies happen here. The call ends with
    ``cuda.synchronize()``, so it returns only after the synchronize call
    completes; the result is left in ``d_c``.
    """
    configured_kernel = vector_add_cuda_kernel[blocks_per_grid, threads_per_block]
    configured_kernel(d_a, d_b, d_c)
    cuda.synchronize()

def multiple_operations_gpu(a, b, n_ops=10):
    """Upload once, launch the add kernel ``n_ops`` times, download once.

    Every launch receives the same three device buffers, so each one
    recomputes ``a + b`` into the same output; results are not accumulated
    across launches. The host only pays for two uploads and one download,
    whatever ``n_ops`` is.

    Returns:
        The output buffer copied back to the host after ``cuda.synchronize()``.
    """
    dev_a = cuda.to_device(a)
    dev_b = cuda.to_device(b)
    dev_out = cuda.device_array_like(a)

    block_size = 256
    # Ceiling division so a partially filled final block is still launched.
    full_blocks, leftover = divmod(a.size, block_size)
    grid_size = full_blocks + (1 if leftover else 0)

    launches_left = n_ops
    while launches_left > 0:
        vector_add_cuda_kernel[grid_size, block_size](dev_a, dev_b, dev_out)
        launches_left -= 1

    cuda.synchronize()
    host_result = dev_out.copy_to_host()
    return host_result

def benchmark(func, *args, name="Function", warmup=True, repeats=5):
    """Time ``func(*args)`` and return ``(mean_seconds, last_result)``.

    Args:
        func: Callable to measure.
        *args: Positional arguments forwarded unchanged to every call.
        name: Label for the call site's benefit only; it does not influence
            the measurement and is not printed.
        warmup: When true, ``func(*args)`` is called once, untimed, before
            the timed calls.
        repeats: Number of timed calls to average over. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A tuple of the mean wall-clock duration in seconds (measured with
        ``time.perf_counter``) and the value returned by the last timed call.

    Raises:
        ValueError: If ``repeats`` is smaller than 1, since there would be
            nothing to average and no result to return.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    if warmup:
        func(*args)

    times = []
    result = None
    for _ in range(repeats):
        start = time_module.perf_counter()
        result = func(*args)
        times.append(time_module.perf_counter() - start)

    avg_time = np.mean(times)
    return avg_time, result

def check_cuda():
    """Report whether CUDA device 0 can be selected and given a context.

    Returns:
        True when both ``cuda.select_device(0)`` and
        ``cuda.current_context()`` succeed. On any exception a warning with
        the exception text is printed and False is returned.
    """
    try:
        cuda.select_device(0)
        cuda.current_context()
    except Exception as e:
        print(f"⚠ CUDA not available: {e}\n")
        return False
    else:
        return True

def format_time(ms):
    """Render a duration given in milliseconds with a readable unit.

    Values of 1000 ms and above are shown in seconds, values of 1 ms and
    above in milliseconds, and anything smaller in microseconds. The number
    always has two decimals; the seconds unit carries a trailing space so all
    three unit suffixes are two characters wide.
    """
    if ms >= 1000:
        value, unit = ms / 1000, "s "
    elif ms >= 1:
        value, unit = ms, "ms"
    else:
        value, unit = ms * 1000, "μs"
    return f"{value:.2f} {unit}"

def print_header(title, char="=", width=80):
    """Print a banner: blank line, rule, centred title, rule.

    The rule is ``char`` repeated ``width`` times and the title is centred
    within ``width`` columns.
    """
    rule = char * width
    centred_title = format(title, f"^{width}")
    print("", rule, centred_title, rule, sep="\n")

def print_section(title, width=80):
    """Print a blank line, the section title, then a dashed underline of ``width`` characters."""
    underline = "-" * width
    print(f"\n{title}", underline, sep="\n")

def print_result(label, time_ms, width=50):
    """Print one indented result line: label on the left, formatted time on the right.

    The label is left-aligned in ``width - 15`` columns; the time, rendered
    by ``format_time`` from a value in milliseconds, is right-aligned in 12.
    """
    label_columns = width - 15
    print(f"  {label:<{label_columns}} {format_time(time_ms):>12}")

def print_speedup(label, speedup, width=50):
    """Print a speed ratio as 'N.Nx faster' or, below 1, its reciprocal as 'N.Nx slower'.

    The label is left-aligned in ``width - 15`` columns and the factor is
    right-aligned in 10 columns with one decimal.
    """
    label_columns = width - 15
    if speedup >= 1:
        factor, verdict = speedup, "faster"
    else:
        # Show how many times slower instead of a fraction below 1.
        factor, verdict = 1 / speedup, "slower"
    print(f"  {label:<{label_columns}} {factor:>10.1f}x {verdict}")

In [17]:
# Problem size and a fixed seed so every Restart & Run All sees the same inputs.
N = 10_000_000
SEED = 42
rng = np.random.default_rng(SEED)

# Draw float32 samples directly rather than generating float64 and casting,
# which avoids a temporary array twice the size of the final one.
a = rng.random(N, dtype=np.float32)
b = rng.random(N, dtype=np.float32)

print_header("VECTOR ADDITION BENCHMARK")
# Report the real buffer size instead of assuming 4 bytes per element.
print(f"Array size: {N:,} elements ({a.nbytes/1e6:.1f} MB per array)")

# GPU sections further down only run when this probe succeeds.
cuda_available = check_cuda()

# Store results: mean seconds per measurement, keyed by a short name.
results = {}

# ========================================================================
# 1. Pure Python
# ========================================================================
# Time the interpreted loop. It writes its sums into c_python, which stays
# around as the reference result for the correctness check.
print_section("1. Pure Python (baseline)")
c_python = np.zeros_like(a)
time_python, _ = benchmark(
    vector_add_python, a, b, c_python,
    warmup=False,  # no untimed warm-up call for this variant
    name="Pure Python",
)
results.update(python=time_python)
print_result("Execution time", 1000 * time_python)  # seconds -> milliseconds

# ========================================================================
# 2. Numba CPU
# ========================================================================
# Time the Numba-compiled loop. benchmark() is left at its default
# warmup=True, so one untimed call happens before the timed ones
# (presumably this keeps JIT compilation out of the measurement).
print_section("2. Numba CPU (JIT compiled)")
c_cpu = np.zeros_like(a)
time_cpu, _ = benchmark(
    vector_add_numba_cpu, a, b, c_cpu,
    name="Numba CPU",
)
results.update(cpu=time_cpu)
print_result("Execution time", 1000 * time_cpu)  # seconds -> milliseconds
print_speedup("vs Pure Python", time_python / time_cpu)

# ========================================================================
# 3. GPU Benchmarks
# ========================================================================
if cuda_available:
    # End-to-end GPU call: uploads, kernel launch and download are all timed.
    print_section("3. Numba CUDA (with memory transfers)")
    time_cuda, c_cuda = benchmark(vector_add_cuda, a, b, name="GPU")
    results['gpu_total'] = time_cuda
    print_result("Total time (with transfers)", time_cuda * 1000)
    print_speedup("vs Pure Python", time_python / time_cuda)
    print_speedup("vs Numba CPU", time_cpu / time_cuda)

    # Break down GPU time
    print_section("   GPU Time Breakdown")

    # Transfer TO GPU (one un-averaged sample covering both input arrays)
    start = time_module.perf_counter()
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    transfer_to = time_module.perf_counter() - start
    results['transfer_to'] = transfer_to

    # Kernel execution
    d_c = cuda.device_array_like(a)
    threads_per_block = 256
    # Ceiling division so a partially filled last block is still launched.
    blocks_per_grid = (a.size + threads_per_block - 1) // threads_per_block

    # One untimed launch before sampling.
    vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid)

    times = []
    for _ in range(5):
        start = time_module.perf_counter()
        vector_add_cuda_no_transfer(d_a, d_b, d_c, threads_per_block, blocks_per_grid)
        times.append(time_module.perf_counter() - start)
    kernel_time = np.mean(times)
    results['kernel'] = kernel_time

    # Transfer FROM GPU (one un-averaged sample)
    start = time_module.perf_counter()
    result = d_c.copy_to_host()
    transfer_from = time_module.perf_counter() - start
    results['transfer_from'] = transfer_from

    total_breakdown = transfer_to + kernel_time + transfer_from

    print_result("Transfer to GPU", transfer_to * 1000)
    print_result("Kernel execution", kernel_time * 1000)
    print_result("Transfer from GPU", transfer_from * 1000)
    print("-" * 80)
    print_result("TOTAL", total_breakdown * 1000)

    print(f"\n  💡 Memory transfer takes {(transfer_to + transfer_from)/kernel_time:.0f}x longer than computation!")

    # ====================================================================
    # 4. Multiple Operations
    # ====================================================================
    print_section("4. Multiple Operations (data stays on GPU)")
    n_ops = 10

    # CPU: measured through the same benchmark() harness as the GPU side
    # (one untimed warm-up call, then the mean of repeated timed calls), so
    # the two numbers being compared are produced the same way.
    c_cpu = np.zeros_like(a)

    def _cpu_multiple_operations():
        """Run the Numba CPU add n_ops times into the shared c_cpu buffer."""
        for _ in range(n_ops):
            vector_add_numba_cpu(a, b, c_cpu)

    time_cpu_multi, _ = benchmark(_cpu_multiple_operations, name="CPU multi")
    results['cpu_multi'] = time_cpu_multi

    # GPU
    time_gpu_multi, _ = benchmark(multiple_operations_gpu, a, b, n_ops, warmup=True)
    results['gpu_multi'] = time_gpu_multi

    print_result(f"CPU ({n_ops} operations)", time_cpu_multi * 1000)
    print_result(f"GPU ({n_ops} operations)", time_gpu_multi * 1000)
    print_speedup(f"GPU vs CPU ({n_ops} ops)", time_cpu_multi / time_gpu_multi)

# ========================================================================
# Verification
# ========================================================================
# Runs with or without a GPU: the CPU check never needed CUDA, and the GPU
# check is only attempted when the GPU section above produced c_cuda.
print_section("VERIFICATION")
cpu_match = np.allclose(c_python, c_cpu)
# The tick/cross now follows the outcome instead of always showing a tick.
print(f"  {'✓' if cpu_match else '✗'} CPU results match Python:  {'PASS' if cpu_match else 'FAIL'}")
if cuda_available:
    gpu_match = np.allclose(c_python, c_cuda)
    print(f"  {'✓' if gpu_match else '✗'} GPU results match Python:  {'PASS' if gpu_match else 'FAIL'}")

# ========================================================================
# Summary Table
# ========================================================================
print_header("PERFORMANCE SUMMARY", "=")


def _describe_ratio(ratio):
    """Render a baseline/candidate time ratio as 'N.Nx faster' or 'N.Nx slower'."""
    if ratio >= 1:
        return f"{ratio:.1f}x faster"
    return f"{1 / ratio:.1f}x slower"


# Each row: (implementation, mean seconds, speedup text, what it is compared to).
# The baseline is spelled out because the rows do not share one.
summary_rows = [
    ("Pure Python (baseline)", results['python'], "1.0x", "-"),
    ("Numba CPU", results['cpu'],
     _describe_ratio(results['python'] / results['cpu']), "Pure Python"),
]
if cuda_available:
    summary_rows += [
        ("GPU (with transfers)", results['gpu_total'],
         _describe_ratio(results['python'] / results['gpu_total']), "Pure Python"),
        ("GPU (kernel only)", results['kernel'],
         _describe_ratio(results['cpu'] / results['kernel']), "Numba CPU"),
        (f"GPU ({n_ops} ops, keeps data)", results['gpu_multi'],
         _describe_ratio(results['cpu_multi'] / results['gpu_multi']), f"CPU, {n_ops} ops"),
    ]

# Borders and rows are generated from one set of widths so they always line up.
col_widths = (35, 12, 15, 12)
col_aligns = "<><<"
rule_segments = ["─" * (w + 2) for w in col_widths]
row_fmt = "│ " + " │ ".join(
    f"{{:{align}{w}}}" for align, w in zip(col_aligns, col_widths)
) + " │"

print("\n┌" + "┬".join(rule_segments) + "┐")
print(row_fmt.format("Implementation", "Time", "Speedup", "Compared to"))
print("├" + "┼".join(rule_segments) + "┤")
for impl_name, seconds, speedup_text, baseline in summary_rows:
    print(row_fmt.format(impl_name, format_time(seconds * 1000), speedup_text, baseline))
print("└" + "┴".join(rule_segments) + "┘")
    


                           VECTOR ADDITION BENCHMARK                            
Array size: 10,000,000 elements (40.0 MB per array)

1. Pure Python (baseline)
--------------------------------------------------------------------------------
  Execution time                         695.77 ms

2. Numba CPU (JIT compiled)
--------------------------------------------------------------------------------
  Execution time                           3.06 ms
  vs Pure Python                           227.0x faster

3. Numba CUDA (with memory transfers)
--------------------------------------------------------------------------------
  Total time (with transfers)             11.80 ms
  vs Pure Python                            59.0x faster
  vs Numba CPU                               3.8x slower

   GPU Time Breakdown
--------------------------------------------------------------------------------
  Transfer to GPU                          4.87 ms
  Kernel execution                        79.75 μ