# Custom CUDA Roll Operation

This notebook tests a custom CUDA-optimized implementation of `torch.roll`, comparing it against the built-in PyTorch version for correctness and performance.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kobejean/torch-ops/blob/main/test_roll_colab_clean.ipynb)

## Setup

In [None]:
import torch
# Report the installed PyTorch version and whether a CUDA device is usable;
# the device name is only queried when CUDA is actually available.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"Device: {gpu_name}")

In [None]:
# Fetch the torch-ops sources, make the checkout the working directory, and
# install it in editable mode (-e) with quiet pip output (-q).
# NOTE(review): presumably this install is what provides the `custom_ops`
# module imported later — confirm against the repository's packaging.
# NOTE(review): this cell is not re-runnable as written: a second `git clone`
# into the existing directory fails, and `%cd torch-ops` is then resolved
# relative to the checkout itself. Restart the runtime before re-running.
!git clone https://github.com/kobejean/torch-ops.git
%cd torch-ops
!pip install -e . -q

In [None]:
import custom_ops
import time

## Correctness test

In [None]:
# Correctness check: roll one random 100x100 CUDA tensor along both axes with
# the custom op and with torch.roll, then compare the two outputs.
x = torch.randn(100, 100, device='cuda')
shifts = [10, 20]
dims = [0, 1]

result_custom = custom_ops.roll(x, shifts, dims)
result_pytorch = torch.roll(x, shifts, dims)

# Agreement within allclose's default tolerances, plus the largest
# element-wise absolute deviation between the two results.
outputs_agree = torch.allclose(result_custom, result_pytorch)
largest_gap = (result_custom - result_pytorch).abs().max().item()

print(f"Results match: {outputs_agree}")
print(f"Max difference: {largest_gap:.2e}")

## Performance comparison

In [None]:
# Benchmark custom_ops.roll against torch.roll on square CUDA tensors.
# Each implementation is timed with CUDA events over TIMED_ITERS calls and the
# mean per-call time in milliseconds is reported.
WARMUP_ITERS = 5
TIMED_ITERS = 20


def _mean_gpu_ms(op, iters):
    """Return the mean time in milliseconds of one call to ``op``.

    ``op`` is a zero-argument callable. It is invoked ``iters`` times between
    two timing-enabled CUDA events, and the elapsed time between the events is
    divided by ``iters``.
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iters):
        op()
    end.record()
    # Wait for the queued work (and the end event) to finish before reading
    # the elapsed time.
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters


sizes = [(500, 500), (1000, 1000), (2000, 2000)]

print(f"{'Size':<12} {'Custom (ms)':<12} {'PyTorch (ms)':<13} {'Speedup':<8}")
print("-" * 50)

for size in sizes:
    x = torch.randn(size, device='cuda')
    # Shift each axis by a tenth of its length.
    shifts = [size[0]//10, size[1]//10]
    dims = [0, 1]

    # Warmup: run both implementations a few times, then drain the GPU so
    # none of this work is counted in the timed sections below.
    for _ in range(WARMUP_ITERS):
        custom_ops.roll(x, shifts, dims)
        torch.roll(x, shifts, dims)
    torch.cuda.synchronize()

    custom_time = _mean_gpu_ms(lambda: custom_ops.roll(x, shifts, dims), TIMED_ITERS)
    pytorch_time = _mean_gpu_ms(lambda: torch.roll(x, shifts, dims), TIMED_ITERS)

    # Values above 1 mean the custom op was faster than torch.roll.
    speedup = pytorch_time / custom_time
    # The "x" suffix is attached directly to the number; padding the number
    # first would print e.g. "1.23    x".
    print(f"{str(size):<12} {custom_time:<12.3f} {pytorch_time:<13.3f} {speedup:.2f}x")

## Kernel analysis

In [None]:
# Get GPU properties
props = torch.cuda.get_device_properties(0)
print(f"GPU: {props.name}")
print(f"Compute Capability: {props.major}.{props.minor}")
print(f"Multiprocessors: {props.multi_processor_count}")
print(f"Max threads per MP: {props.max_threads_per_multi_processor}")

In [None]:
# Calculate occupancy for our kernel
threads_per_block = 256
warps_per_block = threads_per_block // 32
max_blocks = props.max_threads_per_multi_processor // threads_per_block
max_warps = props.max_threads_per_multi_processor // 32
occupancy = (max_blocks * warps_per_block) / max_warps * 100

print(f"Threads per block: {threads_per_block}")
print(f"Warps per block: {warps_per_block}")
print(f"Max active blocks: {max_blocks}")
print(f"Theoretical occupancy: {occupancy:.1f}%")

## Memory bandwidth

In [None]:
# Measure memory bandwidth
x = torch.randn(1000, 1000, device='cuda')
bytes_transferred = x.numel() * 4 * 2  # read + write float32

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
for _ in range(10):
    custom_ops.roll(x, [100, 200], [0, 1])
end.record()
torch.cuda.synchronize()

time_ms = start.elapsed_time(end) / 10
bandwidth = (bytes_transferred / 1e9) / (time_ms / 1000)
print(f"Kernel time: {time_ms:.3f} ms")
print(f"Memory bandwidth: {bandwidth:.1f} GB/s")

## Profiler output

In [None]:
import torch.profiler

# Profile five calls of the custom roll on a 1000x1000 CUDA tensor, recording
# CUDA activity only, then print the five entries with the largest total CUDA
# time.
x = torch.randn(1000, 1000, device='cuda')

cuda_only = [torch.profiler.ProfilerActivity.CUDA]
with torch.profiler.profile(activities=cuda_only, record_shapes=True) as prof:
    for _ in range(5):
        custom_ops.roll(x, [100, 200], [0, 1])

# Show kernel details
averaged_events = prof.key_averages()
print(averaged_events.table(sort_by="cuda_time_total", row_limit=5))