In [1]:
# Simple kernel concurrency test - block size scaling and multi-stream parallelism
# Supports both simple kernel and overlap kernel from pack_cuda

import time
import math
import numpy as np
import cupy as cp
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), '../core'))

import pack_cuda
import kaggle_support as kgs

# ============================================================================
# KERNEL SETUP
# ============================================================================

# Define simple work kernel with block indexing for multi-block execution
simple_kernel_code = r'''
extern "C" __global__
void simple_work(const float* input, float* output, int n, int work_factor) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        float val = input[idx];
        // Do some arithmetic work to make it non-trivial
        for (int i = 0; i < work_factor; i++) {
            val = val * 1.001f + 0.001f;
            val = sqrtf(val * val + 1.0f);
        }
        output[idx] = val;
    }
}
'''

# Compile the simple kernel
simple_kernel = cp.RawKernel(simple_kernel_code, 'simple_work')

# Initialize pack_cuda and get the overlap kernel
pack_cuda.USE_FLOAT32=True
pack_cuda._ensure_initialized()
overlap_kernel = pack_cuda._multi_overlap_list_total_kernel

# ============================================================================
# HELPER FUNCTIONS FOR DATA PREPARATION
# ============================================================================

def prepare_simple_data(num_blocks, n_threads):
    """Prepare data for simple kernel: single contiguous array"""
    total_size = num_blocks * n_threads
    input_data = cp.random.randn(total_size, dtype=cp.float32)
    output_data = cp.zeros(total_size, dtype=cp.float32)
    return input_data, output_data, total_size

def prepare_overlap_data(num_ensembles, n_trees):
    """Prepare data for overlap kernel: arrays of pointers and metadata"""
    xyt1_arrays = []
    xyt2_arrays = []
    
    # Use float32 if USE_FLOAT32 is set, otherwise float64
    dtype = cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64
    
    for _ in range(num_ensembles):
        xyt1 = cp.random.randn(n_trees, 3, dtype=dtype)
        xyt2 = cp.random.randn(n_trees, 3, dtype=dtype)
        
        xyt1_3xN = cp.ascontiguousarray(xyt1.T).ravel()
        xyt2_3xN = cp.ascontiguousarray(xyt2.T).ravel()
        xyt1_arrays.append(xyt1_3xN)
        xyt2_arrays.append(xyt2_3xN)
    
    xyt1_ptrs = cp.array([arr.data.ptr for arr in xyt1_arrays], dtype=cp.float32)
    xyt2_ptrs = cp.array([arr.data.ptr for arr in xyt2_arrays], dtype=cp.float32)
    n_array = cp.array([n_trees] * num_ensembles, dtype=cp.int32)
    out_totals = cp.zeros(num_ensembles, dtype=dtype)
    
    return xyt1_arrays, xyt2_arrays, xyt1_ptrs, xyt2_ptrs, n_array, out_totals

def prepare_simple_data_multi_stream(num_streams, num_blocks_per_stream, n_threads):
    """Prepare data for simple kernel with multiple streams"""
    total_size_per_stream = num_blocks_per_stream * n_threads
    inputs = [cp.random.randn(total_size_per_stream, dtype=cp.float32) for _ in range(num_streams)]
    outputs = [cp.zeros(total_size_per_stream, dtype=cp.float32) for _ in range(num_streams)]
    return inputs, outputs, total_size_per_stream

# ============================================================================
# TEST FUNCTIONS
# ============================================================================

def run_trial_single_stream(num_blocks, iters_per_call, n_threads, work_factor, kernel_type):
    """Test: vary number of blocks (grid size) with same thread count per block"""
    
    if kernel_type == 'simple':
        input_data, output_data, total_size = prepare_simple_data(num_blocks, n_threads)
        
        # Warmup
        simple_kernel(
            (num_blocks,), (n_threads,),
            (input_data, output_data, total_size, work_factor)
        )
        cp.cuda.Device().synchronize()
        
        # Timed run
        cp.cuda.Device().synchronize()
        start = time.perf_counter()
        for k in range(iters_per_call):
            simple_kernel(
                (num_blocks,), (n_threads,),
                (input_data, output_data, total_size, work_factor)
            )
        cp.cuda.Device().synchronize()
        end = time.perf_counter()
        
    elif kernel_type == 'overlap':
        num_ensembles = num_blocks
        # For overlap kernel, use n_threads as number of trees per ensemble
        # Each tree requires 4 threads (one per polygon piece)
        n_trees = n_threads//4
        xyt1_arrays, xyt2_arrays, xyt1_ptrs, xyt2_ptrs, n_array, out_totals = prepare_overlap_data(num_ensembles, n_trees)
        
        # Warmup
        overlap_kernel(
            (num_ensembles,), (n_threads,),
            (xyt1_ptrs, n_array, xyt2_ptrs, n_array, out_totals,
             cp.array([0], dtype=cp.float32), np.int32(num_ensembles))
        )
        cp.cuda.Device().synchronize()
        
        # Timed run
        cp.cuda.Device().synchronize()
        start = time.perf_counter()
        for k in range(iters_per_call):
            overlap_kernel(
                (num_ensembles,), (n_threads,),
                (xyt1_ptrs, n_array, xyt2_ptrs, n_array, out_totals,
                 cp.array([0], dtype=cp.float32), np.int32(num_ensembles))
            )
        cp.cuda.Device().synchronize()
        end = time.perf_counter()
    
    else:
        raise ValueError(f"Unknown kernel_type: {kernel_type}")
    
    elapsed = end - start
    total_kernel_calls = iters_per_call
    calls_per_sec = total_kernel_calls / elapsed
    return calls_per_sec, elapsed

def run_trial_multi_stream(num_streams, iters_per_stream, num_blocks_per_stream, n_threads, work_factor, kernel_type):
    """Test: launch multiple kernels in parallel via streams (each kernel has multiple blocks)"""
    
    if kernel_type == 'simple':
        inputs, outputs, total_size_per_stream = prepare_simple_data_multi_stream(num_streams, num_blocks_per_stream, n_threads)
        streams = [cp.cuda.Stream(non_blocking=True) for _ in range(num_streams)]
        
        # Warmup
        for s_idx, stream in enumerate(streams):
            simple_kernel(
                (num_blocks_per_stream,), (n_threads,),
                (inputs[s_idx], outputs[s_idx], total_size_per_stream, work_factor),
                stream=stream
            )
        cp.cuda.Device().synchronize()
        
        # Timed run
        cp.cuda.Device().synchronize()
        start = time.perf_counter()
        for i in range(iters_per_stream):
            for s_idx, stream in enumerate(streams):
                simple_kernel(
                    (num_blocks_per_stream,), (n_threads,),
                    (inputs[s_idx], outputs[s_idx], total_size_per_stream, work_factor),
                    stream=stream
                )
        cp.cuda.Device().synchronize()
        end = time.perf_counter()
        
    elif kernel_type == 'overlap':
        # For overlap kernel, use n_threads as number of trees per ensemble
        # Each tree requires 4 threads (one per polygon piece)
        num_trees = n_threads
        dtype = cp.float32 if pack_cuda.USE_FLOAT32 else cp.float64
        
        # Total ensembles = num_streams * num_blocks_per_stream
        total_ensembles = num_streams * num_blocks_per_stream
        
        # Create data for all ensembles
        xyt1_arrays = []
        xyt2_arrays = []
        for _ in range(total_ensembles):
            xyt1 = cp.random.randn(num_trees, 3, dtype=dtype)
            xyt2 = cp.random.randn(num_trees, 3, dtype=dtype)
            
            xyt1_3xN = cp.ascontiguousarray(xyt1.T).ravel()
            xyt2_3xN = cp.ascontiguousarray(xyt2.T).ravel()
            xyt1_arrays.append(xyt1_3xN)
            xyt2_arrays.append(xyt2_3xN)
        
        xyt1_ptrs = cp.array([arr.data.ptr for arr in xyt1_arrays], dtype=cp.uint64)
        xyt2_ptrs = cp.array([arr.data.ptr for arr in xyt2_arrays], dtype=cp.uint64)
        n_array = cp.array([num_trees] * total_ensembles, dtype=cp.int32)
        out_totals = cp.zeros(total_ensembles, dtype=dtype)
        
        # Warmup
        overlap_kernel(
            (num_blocks_per_stream,), (num_trees * 4,),
            (xyt1_ptrs, n_array, xyt2_ptrs, n_array, out_totals,
             cp.array([0], dtype=cp.uint64), np.int32(num_blocks_per_stream))
        )
        cp.cuda.Device().synchronize()
        
        # Timed run
        cp.cuda.Device().synchronize()
        start = time.perf_counter()
        for i in range(iters_per_stream):
            overlap_kernel(
                (num_blocks_per_stream,), (num_trees * 4,),
                (xyt1_ptrs, n_array, xyt2_ptrs, n_array, out_totals,
                 cp.array([0], dtype=cp.uint64), np.int32(num_blocks_per_stream))
            )
        cp.cuda.Device().synchronize()
        end = time.perf_counter()
    
    else:
        raise ValueError(f"Unknown kernel_type: {kernel_type}")
    
    elapsed = end - start
    total_kernel_calls = iters_per_stream * num_streams
    calls_per_sec = total_kernel_calls / elapsed
    return calls_per_sec, elapsed

# ============================================================================
# TEST EXECUTION
# ============================================================================

# Test parameters
# For simple kernel: n_threads is threads per block
# For overlap kernel: n_threads is trees per ensemble (kernel uses n_threads * 4 actual threads)
n_threads = 20  # Thread count per block (simple) or trees per ensemble (overlap)
work_factor = 1000000  # Work iterations per thread
iters = 1

# KERNEL TYPE FLAG: 'simple' or 'overlap'
KERNEL_TYPE = 'overlap'

print("=" * 60)
print(f"KERNEL TYPE: {KERNEL_TYPE.upper()}")
print("=" * 60)

print("\n" + "=" * 60)
print("TEST 1: Block Count Scaling (Single Stream, Multiple Blocks)")
print(f"Config: {n_threads} threads/block, {work_factor} work iterations")
print("=" * 60)
print("Blocks\tKernels/sec\tElapsed(s)")
block_counts = [1, 2, 4]
for b in block_counts:
    kps, t = run_trial_single_stream(b, iters, n_threads, work_factor, kernel_type=KERNEL_TYPE)
    cp.cuda.Device().synchronize()
    print(f"{b}\t{int(kps)}\t\t{t:.3f}")

print("\n" + "=" * 60)
print("TEST 2: Parallel Kernel Execution (Multiple Streams, Each with Multiple Blocks)")
print(f"Config: {n_threads} threads/block, {work_factor} work iterations")
if KERNEL_TYPE == 'overlap':
    print("Note: num_streams must be a multiple of 4")
print("=" * 60)
print("Streams\tBlocks/Stream\tKernels/sec\tElapsed(s)")
num_blocks_per_stream = 64  # Fixed block count per kernel
stream_counts = [1, 2, 4, 8, 16, 32, 64, 128, 256]
for num_streams in stream_counts:
    kps, t = run_trial_multi_stream(num_streams, iters, num_blocks_per_stream, n_threads, work_factor, kernel_type=KERNEL_TYPE)
    cp.cuda.Device().synchronize()
    print(f"{num_streams}\t{num_blocks_per_stream}\t\t{int(kps)}\t\t{t:.3f}")

print("\n" + "=" * 60)
print("GPU Info")
print("=" * 60)
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"GPU: {props['name'].decode()}")
print(f"Number of SMs: {props['multiProcessorCount']}")


local
KERNEL TYPE: OVERLAP

TEST 1: Block Count Scaling (Single Stream, Multiple Blocks)
Config: 20 threads/block, 1000000 work iterations
Blocks	Kernels/sec	Elapsed(s)
KERNEL TYPE: OVERLAP

TEST 1: Block Count Scaling (Single Stream, Multiple Blocks)
Config: 20 threads/block, 1000000 work iterations
Blocks	Kernels/sec	Elapsed(s)


local
KERNEL TYPE: OVERLAP

TEST 1: Block Count Scaling (Single Stream, Multiple Blocks)
Config: 20 threads/block, 1000000 work iterations
Blocks	Kernels/sec	Elapsed(s)
KERNEL TYPE: OVERLAP

TEST 1: Block Count Scaling (Single Stream, Multiple Blocks)
Config: 20 threads/block, 1000000 work iterations
Blocks	Kernels/sec	Elapsed(s)


CUDARuntimeError: cudaErrorIllegalAddress: an illegal memory access was encountered