In [1]:
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), '../core'))
import kaggle_support as kgs
import cupy as cp
handle = kgs.dill_load(kgs.temp_dir + '/test_cp_handle.pickle')
cp.cuda.runtime.ipcOpenMemHandle(handle)

local


CUDARuntimeError: cudaErrorInvalidResourceHandle: invalid resource handle

CUDARuntimeError: cudaErrorInvalidResourceHandle: invalid resource handle

In [None]:
## Why IPC Handles Don't Work This Way

**The problem:** IPC handles are only valid while:
1. The original process that created them is **still running**
2. The GPU memory allocation still exists
3. You open the handle in a **different process** (not the same one)

**What's happening:**
- You created the handle in one Python session
- Saved it to a file
- That Python session ended → GPU memory was freed
- Now the handle points to freed memory → invalid handle error

**For actual IPC, you need:**
1. Process A: Creates array, gets IPC handle, **stays running**
2. Process A: Shares handle (via file, socket, shared memory, etc.)
3. Process B: Opens the handle **while Process A is still running**
4. Both processes can now access the same GPU memory

## Alternative: Multi-processing Demo

If you want to test IPC, here's a proper setup using Python's multiprocessing:

In [5]:
import multiprocess as mp
import cupy as cp

def child_process(handle_bytes, shape, dtype, nbytes):
    """Child process that receives and opens an IPC handle"""
    import cupy as cp
    
    # CRITICAL: Initialize CUDA in this process first
    cp.cuda.Device(0).use()
    
    print(f"Child: Received handle")
    
    try:
        # Open it in this process - handle_bytes is the raw bytes
        device_ptr = cp.cuda.runtime.ipcOpenMemHandle(handle_bytes)
        
        # Reconstruct the array
        mem = cp.cuda.UnownedMemory(device_ptr, nbytes, owner=None)
        memptr = cp.cuda.MemoryPointer(mem, 0)
        arr_child = cp.ndarray(shape=shape, dtype=dtype, memptr=memptr)
        
        print(f"Child: Array contents: {arr_child[:5]}")
        
        # Modify it to prove it's shared
        arr_child[:] = 42.0
        print(f"Child: Modified array to all 42s")
        
        # Clean up
        cp.cuda.runtime.ipcCloseMemHandle(device_ptr)
        return True
    except Exception as e:
        print(f"Child: Error: {e}")
        return False

# Parent process - allocate with IPC flag
# Use cudaMalloc which supports IPC by default
arr = cp.arange(100, dtype=cp.float32)
print(f"Parent: Original array: {arr[:5]}")

try:
    # Get IPC handle - this returns a cudaIpcMemHandle_t (64 bytes)
    handle = cp.cuda.runtime.ipcGetMemHandle(arr.data.ptr)
    print(f"Parent: Got IPC handle, type: {type(handle)}, len: {len(handle) if hasattr(handle, '__len__') else 'N/A'}")
    
    # Start child process - pass handle as bytes
    result_queue = mp.Queue()
    p = mp.Process(target=child_process, args=(handle, arr.shape, arr.dtype, arr.nbytes))
    p.start()
    p.join()
    
    if p.exitcode == 0:
        print(f"Parent: Array after child ran: {arr[:5]}")
    else:
        print(f"Parent: Child process failed with exit code {p.exitcode}")
except Exception as e:
    print(f"Parent: Error - {e}")
    print("Note: IPC may not be supported on all GPU configurations")

Parent: Original array: [0. 1. 2. 3. 4.]
Parent: Got IPC handle, type: <class 'bytes'>, len: 64
Child: Received handle
Child: Error: cudaErrorInvalidResourceHandle: invalid resource handle
Parent: Array after child ran: [0. 1. 2. 3. 4.]


## Important Notes on IPC

**Why IPC often fails:**
- Not all CUDA memory allocations support IPC
- IPC requires specific GPU hardware support (works on most modern NVIDIA GPUs)
- WSL2 and some virtualized environments don't support CUDA IPC
- IPC only works between processes on the **same physical machine**

**When does your GPU support IPC?**
Check with: `nvidia-smi -q | grep "IPC"` or run the cell below:

In [6]:
# Check if IPC is supported on your GPU
import subprocess
result = subprocess.run(['nvidia-smi', '-q'], capture_output=True, text=True)
if 'IPC' in result.stdout:
    for line in result.stdout.split('\n'):
        if 'IPC' in line or 'Compute Mode' in line:
            print(line)
else:
    print("IPC information not found in nvidia-smi output")
    print("This doesn't necessarily mean IPC is unsupported - try the test above")

IPC information not found in nvidia-smi output
This doesn't necessarily mean IPC is unsupported - try the test above


In [None]:
# Check if we're in WSL2 (where IPC often doesn't work)
import os
is_wsl = os.path.exists('/proc/version') and 'microsoft' in open('/proc/version').read().lower()
print(f"Running in WSL: {is_wsl}")

# Check CUDA IPC properties
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"Device name: {props['name'].decode()}")
print(f"Compute capability: {props['major']}.{props['minor']}")

# Try a simple IPC test
try:
    test_arr = cp.zeros(10, dtype=cp.float32)
    test_handle = cp.cuda.runtime.ipcGetMemHandle(test_arr.data.ptr)
    print("✓ IPC handle creation works")
    
    # The real test is opening it in another process, which we saw fails
    print("✗ IPC handle opening in child process fails (as seen above)")
    print("\nConclusion: CUDA IPC is NOT working in your environment")
except Exception as e:
    print(f"✗ IPC handle creation failed: {e}")

## Practical Alternative: Multi-Process GPU Computing WITHOUT IPC

Since IPC doesn't work in your environment, here's how to share GPU work between processes:

In [None]:
import multiprocess as mp
import numpy as np

def worker_process(input_data, output_queue, worker_id):
    """Worker that does GPU computation"""
    import cupy as cp
    
    # Each process gets its own GPU arrays
    arr = cp.array(input_data)
    
    # Do some GPU computation
    result = arr * worker_id + 100
    
    # Return results via CPU memory
    output_queue.put(result.get())
    print(f"Worker {worker_id}: Processed {len(arr)} elements")

# Parent: Create data
data = np.arange(100, dtype=np.float32)
print(f"Parent: Input data: {data[:5]}")

# Spawn multiple workers
output_queue = mp.Queue()
processes = []

for i in range(3):
    p = mp.Process(target=worker_process, args=(data, output_queue, i))
    p.start()
    processes.append(p)

# Collect results
results = []
for _ in range(3):
    results.append(output_queue.get())

# Wait for all processes
for p in processes:
    p.join()

print(f"\nCollected {len(results)} results:")
for i, r in enumerate(results):
    print(f"  Worker {i}: {r[:5]}")

## Summary: CUDA IPC Limitations

**Your situation:** CUDA IPC is **not working** in your environment (likely WSL2 or virtualization)

**What works instead:**
1. ✓ **Copy data between processes** via NumPy arrays (as shown above)
2. ✓ **Save/load arrays** to disk (fast with NVMe SSDs)
3. ✓ **Use CUDA streams** for async operations within one process
4. ✓ **Multi-GPU with one process** using `cp.cuda.Device(gpu_id)`

**What doesn't work:**
- ✗ Zero-copy sharing of GPU memory between processes via IPC
- ✗ Saving IPC handles to disk for later use

**When would IPC work?**
- Native Linux (not WSL2)
- Bare metal CUDA installation
- Modern NVIDIA GPU (compute capability 2.0+)
- No virtualization layers

In [None]:
# Save GPU array metadata and data
arr = cp.arange(100, dtype=cp.float32) * 2

# Method 1: Save to numpy file
cp.save(kgs.temp_dir + '/my_array.npy', arr)

# Later, load it back
arr_loaded = cp.load(kgs.temp_dir + '/my_array.npy')
print("Loaded array:", arr_loaded[:5])

# Method 2: Use pickle for metadata + data pointer info (within same process)
data_dict = {
    'shape': arr.shape,
    'dtype': arr.dtype,
    'ptr': arr.data.ptr,
    'data': arr.get()  # Copy to CPU for saving
}
kgs.dill_save(kgs.temp_dir + '/array_data.pickle', data_dict)

# Load it back
loaded_dict = kgs.dill_load(kgs.temp_dir + '/array_data.pickle')
arr_restored = cp.array(loaded_dict['data'])
print("Restored array:", arr_restored[:5])