# GPU Matrix Multiplication Performance Test

This notebook is designed to run in Google Colab and benchmarks the GPU performance of our CUDA matrix multiplication implementation against its CPU path. It uses two matrix sizes and two GPU block sizes (16 and 32) to keep the run compact and efficient.

**Note:** This notebook is intended for Linux environments (Google Colab) only and requires a GPU runtime; it stops with an error if no CUDA device is found.

In [None]:
# Install required packages
!pip install numpy pytest matplotlib

In [None]:
# Setup repository directory: start from a clean checkout on every run so the
# cell can be re-executed without tripping over a previous clone.
# NOTE(review): `os` is not referenced in the cells visible here — confirm
# whether it is still needed.
import os
# Go to /content directory (the working directory used for the clone below)
%cd /content
# Remove any existing cuda-matmul directory; `git clone` refuses to clone
# into a non-empty directory
!rm -rf cuda-matmul

# Clone the repository
!git clone https://github.com/konkolchin/cuda-matmul.git
# Enter the fresh checkout; the editable install in the next cell runs from here
%cd cuda-matmul

In [None]:
# Install the package in development mode
!pip install -e .

In [None]:
import numpy as np
import time
import cuda_ops
import cuda_runtime
import logging

# Configure logging.
# `basicConfig` silently does nothing when the root logger already has
# handlers, which is common in hosted notebook kernels; `force=True`
# (Python 3.8+) replaces them so the INFO messages below are actually shown.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

def check_gpu():
    """Check CUDA availability and log basic device information.

    Queries ``cuda_runtime`` for the device count and, when at least one
    device is reported, logs the name of device 0.

    Returns:
        bool: True when at least one CUDA device is reported; False when no
        device is found or when querying the runtime raises.
    """
    try:
        device_count = cuda_runtime.cudaGetDeviceCount()
        logger.info(f"Found {device_count} CUDA device(s)")

        if device_count <= 0:
            logger.warning("No CUDA devices found")
            return False

        props = cuda_runtime.cudaGetDeviceProperties(0)
        name = getattr(props, 'name', 'Unknown')
        # The binding may expose the name as bytes or as str. Decode only
        # bytes-like values so a str name does not raise and get misreported
        # as "no GPU" by the handler below.
        if isinstance(name, (bytes, bytearray)):
            name = name.decode(errors='replace')
        logger.info(f"Device 0: {name}")
        return True
    except Exception as e:
        # Best-effort probe: any runtime failure is treated as "no usable GPU".
        logger.error(f"Error checking CUDA availability: {e}")
        return False

In [None]:
# Fail fast: the benchmark cells below need a working CUDA device.
has_gpu = check_gpu()
if not has_gpu:
    raise RuntimeError(
        "No GPU available. Please ensure you're running this notebook "
        "in Google Colab with GPU runtime."
    )

In [None]:
def run_performance_test(m, n, k, block_sizes=(16, 32), warmup_runs=2, timed_runs=3):
    """Time CPU and GPU matrix multiplication for one problem size.

    Multiplies a random ``m x k`` float32 matrix by a random ``k x n`` float32
    matrix through ``cuda_ops.matrix_multiply``, once on the CPU path and once
    per GPU block size, and reports the mean wall-clock time of each.

    Args:
        m: Number of rows of the left matrix.
        n: Number of columns of the right matrix.
        k: Shared inner dimension.
        block_sizes: GPU block sizes to benchmark, in reporting order.
        warmup_runs: Untimed rounds executed first (each round runs the CPU
            path once and every GPU block size once).
        timed_runs: Number of timed repetitions averaged per configuration.

    Returns:
        dict: ``'size'`` (``"{m}x{n}"``), ``'cpu_time'`` (mean seconds),
        ``'gpu_times'`` (mean seconds per block size, same order as
        ``block_sizes``) and ``'speedups'`` (``cpu_time / gpu_time`` per
        block size).

    Raises:
        ValueError: If ``timed_runs`` is smaller than 1.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")

    logger.info(f"\nTesting {m}x{n} matrices...")

    # Generate test matrices
    a = np.random.rand(m, k).astype(np.float32)
    b = np.random.rand(k, n).astype(np.float32)

    # Warm-up runs so one-time setup costs do not land in the timed runs
    for _ in range(warmup_runs):
        cuda_ops.matrix_multiply(a, b, use_gpu=False)
        for block_size in block_sizes:
            cuda_ops.matrix_multiply(a, b, use_gpu=True, block_size=block_size)

    def _mean_seconds(**kwargs):
        """Return the mean duration of `timed_runs` multiplications."""
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        # NOTE(review): assumes matrix_multiply returns only after the GPU
        # work has completed — confirm, otherwise this measures launch time.
        start = time.perf_counter()
        for _ in range(timed_runs):
            cuda_ops.matrix_multiply(a, b, **kwargs)
        return (time.perf_counter() - start) / timed_runs

    cpu_time = _mean_seconds(use_gpu=False)
    gpu_times = [
        _mean_seconds(use_gpu=True, block_size=block_size)
        for block_size in block_sizes
    ]
    # Guard against a zero elapsed time on very small inputs.
    speedups = [
        cpu_time / gpu_time if gpu_time > 0 else float('inf')
        for gpu_time in gpu_times
    ]

    return {
        'size': f"{m}x{n}",
        'cpu_time': cpu_time,
        'gpu_times': gpu_times,
        'speedups': speedups
    }

In [None]:
# Run performance tests with two matrix sizes
sizes = [(1024, 1024), (2048, 2048)]  # Two representative sizes
block_sizes = [16, 32]

# Build the header from block_sizes so the columns always match the block
# sizes that are actually benchmarked (previously hard-coded to 16 and 32).
header = "\t".join(
    ["Matrix Size", "CPU Time (s)"]
    + [f"GPU {bs}x{bs} (s)" for bs in block_sizes]
    + [f"Speedup {bs}x{bs}" for bs in block_sizes]
)
print("\nPerformance Test Results:")
print(header)
print("-" * 80)

# Run tests and collect results; `results` is consumed by the plotting cell
results = []
for m, n in sizes:
    k = m  # Square matrices
    result = run_performance_test(m, n, k, block_sizes)
    results.append(result)

    # One row per matrix size: CPU time, then every GPU time, then every speedup
    row = (
        [result['size'], f"{result['cpu_time']:.4f}"]
        + [f"{gpu_time:.4f}" for gpu_time in result['gpu_times']]
        + [f"{speedup:.2f}x" for speedup in result['speedups']]
    )
    print("\t\t".join(row))

In [None]:
# Visualize results
import matplotlib.pyplot as plt

def plot_results(results, block_sizes=(16, 32)):
    """Draw a grouped bar chart of CPU and GPU times per matrix size.

    Args:
        results: List of dicts as returned by ``run_performance_test``; each
            needs the keys ``'size'``, ``'cpu_time'`` and ``'gpu_times'``.
        block_sizes: Block sizes matching the order of each ``'gpu_times'``
            list; one bar series is drawn per entry. Defaults to the two
            block sizes this notebook benchmarks.
    """
    sizes = [r['size'] for r in results]

    # One (label, values) series for the CPU plus one per GPU block size.
    series = [('CPU', [r['cpu_time'] for r in results])]
    for idx, block_size in enumerate(block_sizes):
        series.append(
            (f'GPU ({block_size}x{block_size})',
             [r['gpu_times'][idx] for r in results])
        )

    x = list(range(len(sizes)))
    # Each group spans 0.75 of the unit slot, i.e. 0.25-wide bars for the
    # default three series, centred on the tick.
    width = 0.75 / len(series)

    fig, ax = plt.subplots(figsize=(10, 6))
    for pos, (label, times) in enumerate(series):
        offset = (pos - (len(series) - 1) / 2) * width
        ax.bar([i + offset for i in x], times, width, label=label)

    ax.set_xlabel('Matrix Size')
    ax.set_ylabel('Time (seconds)')
    ax.set_title('Matrix Multiplication Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(sizes)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

# Pass the benchmarked block sizes so the legend matches the measured data.
plot_results(results, block_sizes)