In [1]:
# Clear this cell's displayed output and print a marker line.
# NOTE(review): clear_output() only clears displayed output; it does not
# restart the kernel or reset any variables. If a real fresh start is needed,
# restart the kernel manually before running the cells below.
from IPython.display import clear_output
clear_output(wait=True)  # wait=True defers the clear until new output is available
print("=== FRESH START ===")

=== FRESH START ===


In [None]:
# Multi-block kernel concurrency test
# Tests both block size scaling and parallel kernel execution in multiple streams

import time
import math
import numpy as np
import cupy as cp
import sys

# Put the sibling ``../core`` directory (relative to the current working
# directory) at the front of sys.path so ``pack_cuda`` is imported from there.
import os

# Hard-coded absolute core path that must not shadow the local one
# (presumably a leftover from another machine/setup -- confirm).
_stale_core_dir = '/mnt/d/packing/code/core/'
_core_dir = os.path.join(os.getcwd(), '../core')

# Re-running this cell must not grow sys.path: drop every existing occurrence
# of both entries first, then insert the local core dir exactly once at the
# front. Slice assignment keeps the same list object that Python already uses.
sys.path[:] = [entry for entry in sys.path if entry not in (_stale_core_dir, _core_dir)]
sys.path.insert(0, _core_dir)

import pack_cuda
# Module-level switch; the trial functions below read it to choose between
# cp.float32 and cp.float64 for the uploaded inputs.
pack_cuda.USE_FLOAT32 = True

def make_input(N, seed=123):
    """Build a reproducible array of ``N`` random poses.

    Args:
        N: Number of poses (rows) to generate.
        seed: Seed for the legacy ``np.random.RandomState`` generator. The
            default (123) reproduces the values this function has always
            returned; pass another seed to get a different, still
            deterministic, sample.

    Returns:
        A ``float64`` NumPy array of shape ``(N, 3)`` whose columns are
        ``x`` and ``y`` drawn uniformly from [-5, 5) and an angle ``t`` drawn
        uniformly from [-pi, pi).
    """
    rng = np.random.RandomState(seed)
    # Keep the draw order x -> y -> t: all three share one generator, so
    # reordering would change the values produced for a given seed.
    x = rng.uniform(-5.0, 5.0, size=N)
    y = rng.uniform(-5.0, 5.0, size=N)
    t = rng.uniform(-math.pi, math.pi, size=N)
    return np.stack([x, y, t], axis=1).astype(np.float64)

def run_trial_single_stream(num_blocks, iters_per_call, xyt1_np, xyt2_np):
    """Time repeated ``overlap_multi_ensemble`` calls for a given ensemble count.

    Both inputs are uploaded to the GPU once and the same device array is
    listed ``num_blocks`` times, so every ensemble in a call is identical.

    Args:
        num_blocks: Number of ensembles passed per call.
        iters_per_call: Number of timed calls.
        xyt1_np: Host array for the first input list.
        xyt2_np: Host array for the second input list.

    Returns:
        ``(calls_per_sec, elapsed)`` where ``elapsed`` is the wall-clock time
        in seconds for all timed calls, including the final device sync.
    """
    def _sync_device():
        # Block the host until all work queued on the current device is done.
        cp.cuda.Device().synchronize()

    pack_cuda._ensure_initialized()

    # Upload once, outside the timed region, in the precision pack_cuda uses.
    gpu_dtype = cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64
    poses_a = cp.asarray(xyt1_np, dtype=gpu_dtype)
    poses_b = cp.asarray(xyt2_np, dtype=gpu_dtype)

    # Every ensemble refers to the same device array.
    ensembles_a = [poses_a] * num_blocks
    ensembles_b = [poses_b] * num_blocks

    # Untimed warmup call, fully drained before measuring.
    _totals, _grads = pack_cuda.overlap_multi_ensemble(ensembles_a, ensembles_b, compute_grad=True)
    _sync_device()

    # Timed region: queue all calls, then wait for the device to finish.
    _sync_device()
    t_begin = time.perf_counter()
    for _ in range(iters_per_call):
        _totals, _grads = pack_cuda.overlap_multi_ensemble(ensembles_a, ensembles_b, compute_grad=True)
    _sync_device()
    t_finish = time.perf_counter()

    elapsed = t_finish - t_begin
    return iters_per_call / elapsed, elapsed

def run_trial_multi_stream(num_streams, iters_per_stream, num_blocks_per_stream, xyt1_np, xyt2_np):
    """Time ``overlap_multi_ensemble`` calls issued round-robin on several streams.

    Args:
        num_streams: Number of non-blocking CUDA streams to create.
        iters_per_stream: Number of timed calls issued on each stream.
        num_blocks_per_stream: Number of ensembles passed per call.
        xyt1_np: Host array for the first input list.
        xyt2_np: Host array for the second input list.

    Returns:
        ``(calls_per_sec, elapsed)`` where ``calls_per_sec`` counts calls
        across all streams (``iters_per_stream * num_streams``) and
        ``elapsed`` is the wall-clock time in seconds, including the final
        device sync.
    """
    pack_cuda._ensure_initialized()

    # Convert to GPU once, in the precision pack_cuda is configured for.
    gpu_dtype = cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64
    xyt1_gpu = cp.asarray(xyt1_np, dtype=gpu_dtype)
    xyt2_gpu = cp.asarray(xyt2_np, dtype=gpu_dtype)

    # The uploads above are issued on the current stream, while the streams
    # created below are non-blocking and therefore do not implicitly order
    # against the default stream. Wait for the uploads here so no call on a
    # worker stream can read an input that has not finished copying.
    cp.cuda.Device().synchronize()

    # Every ensemble refers to the same device array.
    xyt1_list = [xyt1_gpu for _ in range(num_blocks_per_stream)]
    xyt2_list = [xyt2_gpu for _ in range(num_blocks_per_stream)]

    # Create multiple streams
    streams = [cp.cuda.Stream(non_blocking=True) for _ in range(num_streams)]

    # Warmup: one untimed call per stream. The stream is both made current
    # (``with stream``) and passed explicitly to pack_cuda.
    for stream in streams:
        with stream:
            pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True, stream=stream)
    cp.cuda.Device().synchronize()

    # Timed run: issue calls on the streams back-to-back, round-robin, then
    # wait for the whole device to drain.
    cp.cuda.Device().synchronize()
    start = time.perf_counter()
    for _ in range(iters_per_stream):
        for stream in streams:
            with stream:
                pack_cuda.overlap_multi_ensemble(xyt1_list, xyt2_list, compute_grad=True, stream=stream)
    cp.cuda.Device().synchronize()
    end = time.perf_counter()

    elapsed = end - start
    total_kernel_calls = iters_per_stream * num_streams
    calls_per_sec = total_kernel_calls / elapsed
    return calls_per_sec, elapsed


# Benchmark driver: runs both experiments and prints one tab-separated row
# per configuration.
# Initialize pack_cuda up front, before any trial runs.
pack_cuda._ensure_initialized()

# Parameters to tune
N = 20        # number of poses (rows) generated by make_input
iters = 100   # timed calls per trial (per stream in TEST 2)
xyt = make_input(N)

# Drain any pending device work before the first trial.
cp.cuda.Device().synchronize()

print("=" * 60)
print("TEST 1: Block Size Scaling (Single Stream)")
print("=" * 60)
print("Blocks\tCalls/sec\tElapsed(s)")
# Ensemble counts to sweep; each trial passes this many ensembles per call.
packs = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
for b in packs:
    # The same pose array is used for both input lists.
    cps, t = run_trial_single_stream(b, iters, xyt, xyt)
    cp.cuda.Device().synchronize()
    print(f"{b}\t{int(cps)}\t\t{t:.3f}")

print("\n" + "=" * 60)
print("TEST 2: Parallel Kernel Execution (Multiple Streams)")
print("=" * 60)
print("Streams\tBlocks/Stream\tCalls/sec\tElapsed(s)")
num_blocks_per_stream = 1024  # Fixed block count
# Stream counts to sweep; Calls/sec counts calls summed over all streams.
stream_counts = [1,2,4,8,16,32]
for num_streams in stream_counts:
    cps, t = run_trial_multi_stream(num_streams, iters, num_blocks_per_stream, xyt, xyt)
    cp.cuda.Device().synchronize()
    print(f"{num_streams}\t{num_blocks_per_stream}\t\t{int(cps)}\t\t{t:.3f}")


local
TEST 1: Block Size Scaling (Single Stream)
Blocks	Calls/sec	Elapsed(s)
1	4361		0.023
2	3476		0.029
4	3121		0.032
8	1788		0.056
16	1280		0.078
32	716		0.140
64	400		0.250
128	203		0.492
256	95		1.052
512	46		2.171
1024	22		4.380

TEST 2: Parallel Kernel Execution (Multiple Streams)
Streams	Blocks/Stream	Calls/sec	Elapsed(s)
1	1024		23		4.341
2	1024		20		9.616
