# In [None]:
# ipc_demo.py
import os
import sys
import time
import traceback
import multiprocess as mp

def child(conn, shape, dtype_str, nbytes, dev_id=0):
    """Child-process entry point: map a CUDA IPC handle and scale the data by 10.

    Receives one IPC memory handle from ``conn``, opens it on device
    ``dev_id``, wraps the mapped pointer as a CuPy array, multiplies the
    array by 10 in place and synchronizes the device. The outcome is sent
    back on ``conn`` as ``("ok", None)`` on success or
    ``("err", <formatted traceback>)`` on any exception. ``conn`` is always
    closed before returning.

    Args:
        conn: Connection used to receive the handle and to send the result.
        shape: Shape of the array to build over the mapped memory.
        dtype_str: Dtype name; converted with ``cp.dtype``.
        nbytes: Size in bytes of the mapped region.
        dev_id: CUDA device ordinal to select before opening the handle.
    """
    def log(msg):
        print(f"[child {os.getpid()}] {msg}", flush=True)

    try:
        # Imported inside the try so an import failure is reported to the
        # parent like any other error.
        import cupy as cp
        log("imported cupy")

        handle = conn.recv()
        handle = bytes(handle)  # ensure it's a plain bytes blob
        log(f"received handle len={len(handle)}")

        cp.cuda.runtime.setDevice(dev_id)
        cp.cuda.runtime.memGetInfo()   # force runtime init
        log("cuda runtime initialized")

        # Rely on CuPy's default flags (cudaIpcMemLazyEnablePeerAccess): the
        # CUDA runtime documents that as the value this argument must have.
        ptr = cp.cuda.runtime.ipcOpenMemHandle(handle)
        log(f"ipcOpenMemHandle ok, ptr={ptr}")

        arr = mem = None
        try:
            mem = cp.cuda.UnownedMemory(ptr, nbytes, owner=handle, device_id=dev_id)
            arr = cp.ndarray(shape, dtype=cp.dtype(dtype_str),
                             memptr=cp.cuda.MemoryPointer(mem, 0))
            log(f"wrapped ptr as array, arr[0]={int(arr[0].get())}")

            arr *= 10
            cp.cuda.runtime.deviceSynchronize()
            log("wrote arr *= 10 and synchronized")
        finally:
            # Drop every view of the mapping before unmapping it, so no live
            # object is left pointing at released address space.
            arr = mem = None
            cp.cuda.runtime.ipcCloseMemHandle(ptr)
            log("ipcCloseMemHandle done")

        conn.send(("ok", None))
    except Exception:
        # Process boundary: ship the full traceback to the parent.
        conn.send(("err", traceback.format_exc()))
    finally:
        conn.close()


def main():
    """Share a CuPy array with a spawned child through a CUDA IPC memory handle.

    Creates ``cp.arange(10, dtype=cp.int32)`` on device 0, exports an IPC
    handle for its device pointer, starts ``child`` in a "spawn" process,
    sends it the handle over a duplex pipe and waits up to 30 seconds for
    the child's ``(status, payload)`` reply. On success the array contents
    are printed after the child's update.

    Exits with status 1 (via ``sys.exit``) when the child does not reply in
    time, goes away without replying, or reports an error.
    """
    import cupy as cp

    ctx = mp.get_context("spawn")  # CUDA-safe
    parent, child_end = ctx.Pipe(duplex=True)

    try:
        cp.cuda.runtime.setDevice(0)
        cp.cuda.runtime.memGetInfo()  # force runtime init

        a = cp.arange(10, dtype=cp.int32)
        # NOTE(review): cudaIpcGetMemHandle is documented to take the base
        # pointer of a cudaMalloc allocation, while `a` comes from CuPy's
        # memory pool. Presumably fine here because `a` is the first pool
        # allocation - confirm before reusing this with arrays that may sit
        # at an offset inside a pooled block.
        handle = cp.cuda.runtime.ipcGetMemHandle(a.data.ptr)
        handle = bytes(handle)  # IMPORTANT: make it plain bytes
        print(f"[parent {os.getpid()}] handle len={len(handle)}", flush=True)

        p = ctx.Process(target=child, args=(child_end, a.shape, str(a.dtype), a.nbytes, 0))
        p.start()

        # The child now holds its own copy of this end. Closing ours means a
        # child that dies without replying shows up as EOF right away
        # instead of after the full poll timeout.
        child_end.close()

        try:
            parent.send(handle)
            # Don't block forever
            replied = parent.poll(30)
            reply = parent.recv() if replied else None
        except (EOFError, OSError):
            # The other end was closed without a reply being sent.
            p.join(5)
            if p.is_alive():
                p.terminate()
                p.join()
            print(f"[parent] child exited without replying (exitcode={p.exitcode})",
                  flush=True)
            sys.exit(1)

        if not replied:
            print("[parent] timeout waiting for child reply", flush=True)
            p.terminate()
            p.join()
            sys.exit(1)

        status, err = reply
        p.join()

        if status != "ok":
            print("[parent] child error:\n", err, flush=True)
            sys.exit(1)

        cp.cuda.runtime.deviceSynchronize()
        print("[parent] a after child:", a.get(), flush=True)
    finally:
        # Connection.close() is a no-op on an already-closed connection, so
        # closing both ends here is safe on every exit path.
        parent.close()
        child_end.close()


# Run the demo only when executed as a script. main() uses the "spawn" start
# method, whose child processes import this module again, so the guard keeps
# them from re-running main().
if __name__ == "__main__":
    main()
