In [1]:
import cupy as cp
import numpy as np


def to_cupy(array: np.ndarray) -> cp.ndarray:
    """Hand a host-side NumPy array over to CuPy.

    This is a thin wrapper that delegates to ``cp.asarray``.

    Args:
        array (np.ndarray): The NumPy array to convert.

    Returns:
        cp.ndarray: The CuPy array produced by ``cp.asarray(array)``.
    """
    device_array = cp.asarray(array)
    return device_array



In [2]:
# Smoke test: push the integers 1..5 through to_cupy and print the result.
array = np.arange(1, 6)
cupy_array = to_cupy(array)
print(cupy_array)

[1 2 3 4 5]


In [3]:
import time
import gc

def measure_matrix_mul_times(start=1000, stop=11000, step=1000, dtype=np.float32, run_gc=True):
    """Measure one-shot matrix multiplication times for NumPy and CuPy for sizes n x n.

    For every n in ``range(start, stop + 1, step)`` two random n x n matrices are
    created on the host, multiplied once with ``np.dot`` and then, after being
    handed to ``to_cupy``, multiplied once with ``cp.dot``. The GPU timing is
    bracketed by device synchronizations so it covers the whole multiplication,
    but it excludes the host-to-device transfer.

    Each measurement is a single run with no warm-up, so the first GPU timing
    may include one-time library initialization cost.

    Args:
        start (int): First matrix size n.
        stop (int): Upper bound for n; included when it lies on the step grid.
        step (int): Increment between consecutive sizes.
        dtype: Element type the random matrices are cast to before timing.
        run_gc (bool): If True, call ``gc.collect()`` after dropping the NumPy
            result and again after dropping the host inputs.

    Returns:
        list[tuple]: One ``(n, numpy_time_s, cupy_time_s)`` tuple per measured
        size. ``cupy_time_s`` is ``None`` when any GPU step raised. If the host
        matrices cannot be allocated, the sweep stops and the results gathered
        so far are returned.
    """
    results = []
    for n in range(start, stop + 1, step):
        print(f"Running n={n}...")
        try:
            A = np.random.random((n, n)).astype(dtype)
            B = np.random.random((n, n)).astype(dtype)
        except Exception as e:
            print(f"Failed to allocate NumPy arrays for n={n}: {e}")
            break

        # NumPy matmul timing
        t0 = time.perf_counter()
        C = np.dot(A, B)
        t1 = time.perf_counter()
        numpy_time = t1 - t0
        del C
        if run_gc:
            gc.collect()

        # Move to GPU and time CuPy matmul
        cupy_time = None
        # Pre-bind the device handles so the cleanup in ``finally`` is valid no
        # matter which GPU step fails.
        cpA = cpB = cpC = None
        try:
            cpA = to_cupy(A)
            cpB = to_cupy(B)
            # free host arrays early to reduce peak RAM usage
            A = B = None
            if run_gc:
                gc.collect()

            # Ensure previous GPU work finished
            cp.cuda.Device().synchronize()
            t0 = time.perf_counter()
            cpC = cp.dot(cpA, cpB)
            # wait for GPU to finish
            cp.cuda.Device().synchronize()
            t1 = time.perf_counter()
            cupy_time = t1 - t0
        except Exception as e:
            print(f"GPU operation failed for n={n}: {e}")
            # continue to next size
        finally:
            # Release host and device buffers on failure as well as success, so
            # a failed size does not keep its memory alive while the next
            # (larger) size is being allocated.
            A = B = None
            cpA = cpB = cpC = None
            try:
                cp.get_default_memory_pool().free_all_blocks()
            except Exception as e:
                # Best-effort cleanup: report the problem but keep the sweep going.
                print(f"Could not release CuPy memory pool for n={n}: {e}")

        results.append((n, numpy_time, cupy_time))
        print(f"n={n} -> numpy: {numpy_time:.4f}s, cupy: {('N/A' if cupy_time is None else f'{cupy_time:.4f}s')}")

    return results

# Example usage (left commented out because the default sweep allocates matrices up to 11000 x 11000, which is too heavy to run automatically):
# results = measure_matrix_mul_times()
# print(results)