In [1]:
# Clear all outputs and restart
from IPython.display import clear_output
clear_output(wait=True)
print("=== FRESH START ===")

=== FRESH START ===


In [5]:
# Multi-block kernel concurrency test
# Tests the new multi_ensemble_kernel which launches one block per ensemble

import time
import math
import numpy as np
import cupy as cp
import sys

# Clear and set path
if '/mnt/d/packing/code/core/' in sys.path:
    sys.path.remove('/mnt/d/packing/code/core/')
sys.path.insert(0, '/mnt/d/packing/code/core/')

import pack_cuda
pack_cuda.USE_FLOAT32 = True

def make_input(N):
    # small random poses in sensible range
    rng = np.random.RandomState(123)
    x = rng.uniform(-5.0, 5.0, size=N)
    y = rng.uniform(-5.0, 5.0, size=N)
    t = rng.uniform(-math.pi, math.pi, size=N)
    xyt = np.stack([x, y, t], axis=1).astype(np.float64)
    return xyt

def run_trial(num_blocks, iters_per_call, xyt1_np, xyt2_np):
    pack_cuda._ensure_initialized()
    
    # Convert to GPU once to avoid repeated pinned memory allocations
    xyt1_gpu = cp.asarray(xyt1_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    xyt2_gpu = cp.asarray(xyt2_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    
    # Create list of ensembles (all identical for this test)
    xyt1_list = [xyt1_gpu for _ in range(num_blocks)]
    xyt2_list = [xyt2_gpu for _ in range(num_blocks)]
    
    # Warmup with full synchronization
    totals, grads = pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True)
    cp.cuda.Device().synchronize()
    
    # Timed run with proper synchronization
    cp.cuda.Device().synchronize()
    start = time.perf_counter()
    for k in range(iters_per_call):
        totals, grads = pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True)
    cp.cuda.Device().synchronize()
    end = time.perf_counter()
    
    elapsed = end - start
    total_calls = iters_per_call
    calls_per_sec = total_calls / elapsed
    return calls_per_sec, elapsed


# Initialize everything first
pack_cuda._ensure_initialized()

# Parameters to tune
N = 20
iters = 10
xyt = make_input(N)

cp.cuda.Device().synchronize()

packs = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
print("Blocks\tCalls/sec\tElapsed(s)")
for b in packs:
    cps, t = run_trial(b, iters, xyt, xyt)
    cp.cuda.Device().synchronize()
    print(f"{b}\t{int(cps)}\t\t{t:.3f}")


Blocks	Calls/sec	Elapsed(s)
1	312		0.032
2	351		0.028
4	251		0.040
8	294		0.034
16	277		0.036
32	235		0.043
64	383		0.026
128	202		0.049
32	235		0.043
64	383		0.026
128	202		0.049
256	70		0.142
256	70		0.142
512	28		0.352
512	28		0.352


In [3]:
# Check GPU properties
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"GPU: {props['name'].decode()}")
print(f"Number of SMs: {props['multiProcessorCount']}")
print(f"Max threads per SM: {props['maxThreadsPerMultiProcessor']}")
print(f"Max threads per block: {props['maxThreadsPerBlock']}")
print(f"Max blocks per SM: {props['maxThreadsPerMultiProcessor'] // props['maxThreadsPerBlock']}")

GPU: NVIDIA GeForce RTX 4070 Ti
Number of SMs: 60
Max threads per SM: 1536
Max threads per block: 1024
Max blocks per SM: 1


In [4]:
print(f"CUDA version: {cp.cuda.runtime.runtimeGetVersion()}")

CUDA version: 12080


In [6]:
# Test with smaller N to see if occupancy improves
N_small = 50
iters_small = 40
xyt_small = make_input(N_small)

packs_test = [1, 2, 4, 8, 16, 32, 64, 128]
print("\n=== Testing with N=50 (50 threads/block) ===")
print("Blocks\tCalls/sec\tElapsed(s)")
for b in packs_test:
    cps, t = run_trial(b, iters_small, xyt_small, xyt_small)
    cp.cuda.Device().synchronize()
    print(f"{b}\t{int(cps)}\t\t{t:.3f}")


=== Testing with N=50 (50 threads/block) ===
Blocks	Calls/sec	Elapsed(s)
1	128		0.311
2	788		0.051
4	770		0.052
8	759		0.053
1	128		0.311
2	788		0.051
4	770		0.052
8	759		0.053
16	730		0.055
32	605		0.066
64	375		0.106
16	730		0.055
32	605		0.066
64	375		0.106
128	181		0.221
128	181		0.221


In [7]:
# Test: Compare multi_ensemble vs launching separate kernels
# To see if the issue is with the multi-ensemble kernel itself

def run_separate_kernels(num_blocks, iters_per_call, xyt1_np, xyt2_np):
    """Launch num_blocks separate single-block kernels instead of one multi-block kernel"""
    pack_cuda._ensure_initialized()
    
    xyt1_gpu = cp.asarray(xyt1_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    xyt2_gpu = cp.asarray(xyt2_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    
    # Warmup
    for _ in range(num_blocks):
        totals, grads = pack_cuda.overlap_list_total(xyt1_gpu, xyt2_gpu, compute_grad=True)
    cp.cuda.Device().synchronize()
    
    # Timed run
    cp.cuda.Device().synchronize()
    start = time.perf_counter()
    for k in range(iters_per_call):
        for _ in range(num_blocks):
            totals, grads = pack_cuda.overlap_list_total(xyt1_gpu, xyt2_gpu, compute_grad=True)
    cp.cuda.Device().synchronize()
    end = time.perf_counter()
    
    elapsed = end - start
    return iters_per_call / elapsed, elapsed

print("\n=== Comparing multi-ensemble vs separate kernel launches (N=20) ===")
print("Blocks\tMulti\tSeparate")
for b in [1, 2, 4, 8, 16, 32, 64]:
    cps_multi, _ = run_trial(b, iters, xyt, xyt)
    cps_sep, _ = run_separate_kernels(b, iters, xyt, xyt)
    print(f"{b}\t{int(cps_multi)}\t{int(cps_sep)}")


=== Comparing multi-ensemble vs separate kernel launches (N=20) ===
Blocks	Multi	Separate
1	2064	5483
2	2973	3530
4	1577	1681
8	933	893
16	1304	444
32	436	220
64	356	110
64	356	110


In [8]:
# Check actual kernel occupancy
# Try to determine how many blocks can run per SM

# Get the kernel function
pack_cuda._ensure_initialized()
kernel = pack_cuda._multi_ensemble_kernel

# Check kernel attributes
print("Kernel attributes:")
print(f"  Max threads per block: {kernel.max_threads_per_block}")
print(f"  Num regs: {kernel.num_regs}")
print(f"  Shared size bytes: {kernel.shared_size_bytes}")
print(f"  Const size bytes: {kernel.const_size_bytes}")
print(f"  Local size bytes: {kernel.local_size_bytes}")

# Calculate theoretical occupancy
threads_per_block = 20  # Our N
max_threads_per_sm = props['maxThreadsPerMultiProcessor']
max_blocks_per_sm_thread = max_threads_per_sm // threads_per_block

print(f"\nWith {threads_per_block} threads/block:")
print(f"  Max blocks/SM (thread limit): {max_blocks_per_sm_thread}")
print(f"  Max active threads/SM: {min(max_threads_per_sm, max_blocks_per_sm_thread * threads_per_block)}")

Kernel attributes:
  Max threads per block: 896
  Num regs: 72
  Shared size bytes: 0
  Const size bytes: 144
  Local size bytes: 1520

With 20 threads/block:
  Max blocks/SM (thread limit): 76
  Max active threads/SM: 1520


In [9]:
# Calculate register-limited occupancy
# RTX 4070 Ti (Ada Lovelace): 65,536 registers per SM

regs_per_sm = 65536
regs_per_thread = kernel.num_regs
threads_per_block = 20

# How many blocks can fit based on registers?
regs_per_block = regs_per_thread * threads_per_block
max_blocks_per_sm_regs = regs_per_sm // regs_per_block

print(f"Register analysis:")
print(f"  Registers per SM: {regs_per_sm}")
print(f"  Registers per thread: {regs_per_thread}")
print(f"  Registers per block (N={threads_per_block}): {regs_per_block}")
print(f"  Max blocks/SM (register limit): {max_blocks_per_sm_regs}")
print(f"  Max blocks/SM (thread limit): {max_blocks_per_sm_thread}")
print(f"  Actual max blocks/SM: {min(max_blocks_per_sm_regs, max_blocks_per_sm_thread)}")
print(f"\nWith 60 SMs:")
print(f"  Max concurrent blocks: {60 * min(max_blocks_per_sm_regs, max_blocks_per_sm_thread)}")
print(f"  Max concurrent threads: {60 * min(max_blocks_per_sm_regs, max_blocks_per_sm_thread) * threads_per_block}")

Register analysis:
  Registers per SM: 65536
  Registers per thread: 72
  Registers per block (N=20): 1440
  Max blocks/SM (register limit): 45
  Max blocks/SM (thread limit): 76
  Actual max blocks/SM: 45

With 60 SMs:
  Max concurrent blocks: 2700
  Max concurrent threads: 54000


In [11]:
# Check if this is actually compute-bound or something else
# Let's measure kernel execution time vs total time

import cupy as cp

def run_with_events(num_blocks, iters_per_call, xyt1_np, xyt2_np):
    """Measure GPU execution time using CUDA events"""
    pack_cuda._ensure_initialized()
    
    xyt1_gpu = cp.asarray(xyt1_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    xyt2_gpu = cp.asarray(xyt2_np, dtype=cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64)
    
    xyt1_list = [xyt1_gpu for _ in range(num_blocks)]
    xyt2_list = [xyt2_gpu for _ in range(num_blocks)]
    
    # Warmup
    totals, grads = pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True)
    cp.cuda.Device().synchronize()
    
    # Measure with events
    start_event = cp.cuda.Event()
    end_event = cp.cuda.Event()
    
    start_event.record()
    for k in range(iters_per_call):
        totals, grads = pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True)
    end_event.record()
    end_event.synchronize()
    
    gpu_time_ms = cp.cuda.get_elapsed_time(start_event, end_event)
    return gpu_time_ms / 1000.0  # Convert to seconds

print("\n=== GPU kernel execution time (N=20) ===")
print("Blocks\tGPU time(s)\tCalls/sec")
for b in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
    gpu_time = run_with_events(b, iters, xyt, xyt)
    cps = iters / gpu_time
    print(f"{b}\t{gpu_time:.3f}\t\t{int(cps)}")


=== GPU kernel execution time (N=20) ===
Blocks	GPU time(s)	Calls/sec
1	0.033		303
2	0.032		310
4	0.042		239
8	0.043		234
16	0.042		237
32	0.052		191
64	0.035		288
128	0.080		125
256	0.122		81
512	0.338		29
1024	0.559		17
