In [19]:
import numpy as np
import cupy as cp
import time

# Matrix size (adjust for your hardware)
size = 2000
iterations = 1  # Number of iterations

# Create NumPy matrices
a_cpu = np.full((10000, 32*32), 1)
b_cpu = np.full((32*32, 10*10*10*10), 0.5)

# # Create CuPy matrices on the GPU
a_gpu = cp.asarray(a_cpu)
b_gpu = cp.asarray(b_cpu)

# Time CPU calculation
start_cpu = time.time()
for _ in range(iterations):
    c_cpu = a_cpu @ b_cpu

assert c_cpu[0,0] == 32*32*(1*0.5), c_cpu[0]

time_cpu = time.time() - start_cpu

# Time GPU calculation
start_gpu = time.time()
for _ in range(iterations):
    c_gpu = a_gpu @ b_gpu

# # Ensure all GPU calculations have finished
cp.cuda.Stream.null.synchronize()

time_gpu = time.time() - start_gpu

print(f"CPU time (for {iterations} iterations): {time_cpu:.4f} seconds")
print(f"GPU time (for {iterations} iterations): {time_gpu:.4f} seconds")
print(f"Speedup: {time_cpu / time_gpu:.2f}x")

CPU time (for 1 iterations): 1.7099 seconds
GPU time (for 1 iterations): 0.4246 seconds
Speedup: 4.03x


In [3]:
np.random.rand(10000, 32*32).shape

(10000, 1024)

In [12]:
(1*0.5 + 1*0.5 + 1*0.5) / 3

0.5

In [18]:
c_cpu[0,0]

np.float64(512.0)