In [1]:
import time
import numpy as np
import cupy as cp

# Benchmark kernel: each thread runs a fixed-length arithmetic loop on one
# element and stores the result, so kernel duration scales with work_factor.
simple_kernel_code = r'''
extern "C" __global__
void simple_work(const float* input, float* output, int n, int work_factor) {
    // Global element index: stays correct for multi-block launches and
    // reduces to threadIdx.x when the grid has a single block.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float val = input[idx];
        // Do some arithmetic work to make it non-trivial
        for (int i = 0; i < work_factor; i++) {
            val = val * 1.001f + 0.001f;
            val = sqrtf(val * val + 1.0f);
        }
        output[idx] = val;
    }
}
'''

# Wrap the source as a RawKernel. CuPy compiles it lazily on the first launch
# (and caches the result), so benchmarks should do a warm-up launch first.
simple_kernel = cp.RawKernel(simple_kernel_code, 'simple_work')

In [2]:
def run_simple_trial(num_streams, iters_per_stream, n_threads=200, work_factor=1000):
    """
    Run a simple kernel concurrency test.

    Launches `simple_kernel` repeatedly on `num_streams` non-blocking streams
    (one single-block launch per stream per iteration) and measures the
    wall-clock time until every stream has drained.

    Args:
        num_streams: Number of concurrent streams
        iters_per_stream: Number of kernel launches per stream
        n_threads: Number of threads per block (and array size)
        work_factor: Amount of work each thread does (loop iterations)

    Returns:
        Tuple `(kernels_per_sec, elapsed)`: launch throughput over the timed
        section and its duration in seconds. `kernels_per_sec` is 0.0 if the
        measured duration is zero.
    """
    # Create input/output arrays for each stream
    streams = [cp.cuda.Stream(non_blocking=True) for _ in range(num_streams)]
    inputs = [cp.random.randn(n_threads, dtype=cp.float32) for _ in range(num_streams)]
    outputs = [cp.zeros(n_threads, dtype=cp.float32) for _ in range(num_streams)]

    # The arrays above are filled on the current stream. Non-blocking streams
    # do not implicitly wait for it, so finish the initialisation before any
    # kernel on those streams can touch the buffers.
    cp.cuda.get_current_stream().synchronize()

    # The kernel signature declares `int n, int work_factor`; pass explicit
    # 32-bit scalars rather than Python ints (which CuPy passes as 64-bit).
    n_arg = np.int32(n_threads)
    work_arg = np.int32(work_factor)

    def launch_round():
        # One launch on every stream, in stream order.
        for stream, src, dst in zip(streams, inputs, outputs):
            with stream:
                simple_kernel(
                    (1,), (n_threads,),
                    (src, dst, n_arg, work_arg),
                    stream=stream
                )

    def wait_all():
        # Block the host until every stream has finished its queued work.
        for stream in streams:
            stream.synchronize()

    # Warmup - one launch per stream, so one-time costs (e.g. kernel
    # compilation) stay out of the timed section.
    launch_round()
    wait_all()

    # Timed run: launch iters_per_stream kernels on each stream.
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(iters_per_stream):
        launch_round()
    wait_all()
    elapsed = time.perf_counter() - start

    total_kernels = num_streams * iters_per_stream
    # Guard the division in case the timer reports a zero interval
    # (e.g. when there is nothing to launch).
    kernels_per_sec = total_kernels / elapsed if elapsed > 0 else 0.0
    return kernels_per_sec, elapsed

In [6]:
# Sweep the stream count upward and report throughput for each setting
n_threads = 200
work_factor = 100000  # Raise this if individual kernels finish too quickly
iters = 10

print(f"Testing simple kernel concurrency")
print(f"Config: {n_threads} threads/block, {work_factor} work iterations, {iters} launches/stream")
print()
print("Streams\tKernels/sec\tElapsed(s)")

# Powers of two: 1, 2, 4, ..., 64 streams
packs = [2 ** exponent for exponent in range(7)]
for s in packs:
    # One table row per stream count: throughput (truncated) and elapsed time
    kps, t = run_simple_trial(s, iters, n_threads, work_factor)
    print(f"{s}\t{int(kps)}\t\t{t:.3f}")

Testing simple kernel concurrency
Config: 200 threads/block, 100000 work iterations, 10 launches/stream

Streams	Kernels/sec	Elapsed(s)
1	157		0.063
2	314		0.064
4	629		0.064
8	1254		0.064
16	1265		0.126
32	1264		0.253
64	1264		0.506


In [4]:
# Check GPU info for reference.
# Query the *current* device rather than a hard-coded device 0, so the
# reported properties describe the GPU the benchmark actually ran on.
props = cp.cuda.runtime.getDeviceProperties(cp.cuda.runtime.getDevice())
print(f"\nGPU: {props['name'].decode()}")
print(f"Number of SMs: {props['multiProcessorCount']}")
print(f"CUDA version: {cp.cuda.runtime.runtimeGetVersion()}")


GPU: NVIDIA GeForce RTX 4070 Ti
Number of SMs: 60
CUDA version: 12080
