# Custom CUDA Roll Operation

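This notebook tests a custom CUDA implementation of `torch.roll` (`custom_ops.roll`): it checks correctness against PyTorch's built-in `torch.roll` and compares their performance.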

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kobejean/torch-ops/blob/main/test_roll_colab.ipynb)

## Setup

In [1]:
import torch
# Report the PyTorch build and, when a GPU is visible, which device index 0 is,
# so the timings further down can be tied to a concrete setup.
cuda_available = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_available}")
if cuda_available:
    print(f"Device: {torch.cuda.get_device_name(0)}")

PyTorch: 2.8.0+cu126
CUDA: True
Device: Tesla T4


In [2]:
!git clone https://github.com/kobejean/torch-ops.git
%cd torch-ops
!pip install -e . -q -v

Cloning into 'torch-ops'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 77 (delta 33), reused 66 (delta 22), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 33.67 KiB | 4.21 MiB/s, done.
Resolving deltas: 100% (33/33), done.
/content/torch-ops
Obtaining file:///content/torch-ops
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: custom_ops
  Running setup.py develop for custom_ops
Successfully installed custom_ops-0.0.0


In [3]:
import custom_ops
import time

## Correctness test

In [4]:
# Basic correctness check: roll a random 100x100 CUDA tensor along both
# dimensions and compare the custom op against torch.roll as the reference.
shifts = [10, 20]
dims = [0, 1]
x = torch.randn(100, 100, device='cuda')

result_pytorch = torch.roll(x, shifts, dims)
result_custom = custom_ops.roll(x, shifts, dims)

results_match = torch.allclose(result_custom, result_pytorch)
max_abs_diff = (result_custom - result_pytorch).abs().max().item()

print(f"Results match: {results_match}")
print(f"Max difference: {max_abs_diff:.2e}")

Results match: True
Max difference: 0.00e+00


## Performance comparison

In [5]:
def _time_cuda_ms(fn, iters=20):
    """Return the mean time of ``fn()`` in milliseconds over ``iters`` calls.

    Timing uses CUDA events recorded around the whole loop, followed by a
    device synchronize so the elapsed time is valid when it is read.

    Args:
        fn: Zero-argument callable that enqueues the GPU work to measure.
        iters: Number of back-to-back calls to average over.

    Returns:
        Elapsed milliseconds between the two events divided by ``iters``.
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters


sizes = [(500, 500), (1000, 1000), (5000, 5000), (10000, 10000)]

# The size column is 16 wide so the longest label, "(10000, 10000)", still fits
# and the table stays aligned.
print(f"{'Size':<16} {'Custom (ms)':<12} {'PyTorch (ms)':<13} {'Speedup':<8}")
print("-" * 52)

for size in sizes:
    x = torch.randn(size, device='cuda')
    # Shift each dimension by a tenth of its length.
    shifts = [size[0] // 10, size[1] // 10]
    dims = [0, 1]

    # Warmup both implementations so one-time costs are not part of the timings.
    for _ in range(5):
        custom_ops.roll(x, shifts, dims)
        torch.roll(x, shifts, dims)
    torch.cuda.synchronize()

    custom_time = _time_cuda_ms(lambda: custom_ops.roll(x, shifts, dims))
    pytorch_time = _time_cuda_ms(lambda: torch.roll(x, shifts, dims))

    # Values above 1 mean the custom op is faster than torch.roll.
    speedup = pytorch_time / custom_time
    # Attach the "x" suffix before padding so it sits next to the number.
    speedup_label = f"{speedup:.2f}x"
    print(f"{str(size):<16} {custom_time:<12.3f} {pytorch_time:<13.3f} {speedup_label:<8}")

Size         Custom (ms)  PyTorch (ms)  Speedup 
--------------------------------------------------
(500, 500)   0.064        0.037         0.57    x
(1000, 1000) 0.139        0.132         0.95    x
(5000, 5000) 2.588        3.021         1.17    x
(10000, 10000) 5.521        7.123         1.29    x


## Kernel analysis

In [6]:
# Query device 0 once and print the properties used by the occupancy estimate.
props = torch.cuda.get_device_properties(0)

gpu_summary = (
    ("GPU", props.name),
    ("Compute Capability", f"{props.major}.{props.minor}"),
    ("Multiprocessors", props.multi_processor_count),
    ("Max threads per MP", props.max_threads_per_multi_processor),
)
for label, value in gpu_summary:
    print(f"{label}: {value}")

GPU: Tesla T4
Compute Capability: 7.5
Multiprocessors: 40
Max threads per MP: 1024


In [7]:
# Thread-count-based occupancy estimate for a 256-thread block.
# NOTE(review): 256 is presumably the block size the extension launches with -
# confirm against the kernel source. Only the per-multiprocessor thread limit
# from `props` is considered here.
threads_per_block = 256
threads_per_warp = 32
threads_per_mp = props.max_threads_per_multi_processor

warps_per_block = threads_per_block // threads_per_warp
max_blocks = threads_per_mp // threads_per_block
max_warps = threads_per_mp // threads_per_warp

# Share of the multiprocessor's warp slots filled by `max_blocks` resident blocks.
resident_warps = max_blocks * warps_per_block
occupancy = resident_warps / max_warps * 100

for label, value in (
    ("Threads per block", threads_per_block),
    ("Warps per block", warps_per_block),
    ("Max active blocks", max_blocks),
):
    print(f"{label}: {value}")
print(f"Theoretical occupancy: {occupancy:.1f}%")

Threads per block: 256
Warps per block: 8
Max active blocks: 4
Theoretical occupancy: 100.0%


## Memory bandwidth

In [8]:
# Measure the effective memory bandwidth of the custom roll on a 1000x1000 tensor.
x = torch.randn(1000, 1000, device='cuda')
# Count one read and one write per element; element_size() keeps the byte
# count correct if the dtype is ever changed from float32.
bytes_transferred = x.numel() * x.element_size() * 2

# Warm up and drain the queue first so one-time costs and previously enqueued
# work are not attributed to the timed region.
for _ in range(5):
    custom_ops.roll(x, [100, 200], [0, 1])
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
for _ in range(10):
    custom_ops.roll(x, [100, 200], [0, 1])
end.record()
# elapsed_time() is only valid once both events have completed.
torch.cuda.synchronize()

time_ms = start.elapsed_time(end) / 10
# bytes -> GB divided by ms -> s gives GB/s.
bandwidth = (bytes_transferred / 1e9) / (time_ms / 1000)
print(f"Kernel time: {time_ms:.3f} ms")
print(f"Memory bandwidth: {bandwidth:.1f} GB/s")

Kernel time: 0.194 ms
Memory bandwidth: 41.2 GB/s


## Profiler output

In [9]:
import torch.profiler

x = torch.randn(1000, 1000, device='cuda')

# Record only CUDA-side activity (with input shapes) for five calls of the custom op.
profiler_options = dict(
    activities=[torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
)
with torch.profiler.profile(**profiler_options) as prof:
    for _ in range(5):
        custom_ops.roll(x, [100, 200], [0, 1])

# Show the five entries with the largest total CUDA time.
kernel_table = prof.key_averages().table(sort_by="cuda_time_total", row_limit=5)
print(kernel_table)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void custom_ops::tensor::roll_kernel<float>(float co...         0.00%       0.000us         0.00%       0.000us       0.000us     543.070us        98.18%     543.070us     108.614us             5  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.047us         1.82%      10.047us       0.670us            15  
         