In [1]:
import numpy as np
from numba import cuda, types, float32
from time import time

import os
os.environ["CUDA_HOME"]="/usr/local/cuda"

@cuda.jit
def matmul(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp
        
@cuda.jit
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    TPB = N

    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp

# This part is for initializing everything
M = 256
N = 32

#a = np.arange(M*N).reshape(M,N).astype(np.float32)
#b = np.arange(M*N).reshape(N,M).astype(np.float32)
a = np.random.randn(M, N).astype(np.float32)
b = np.random.randn(N, M).astype(np.float32)
c = np.zeros((M, M)).astype(np.float32)

t0 = time()
cpu = np.dot(a, b)
print(f'cpu time = {time()-t0} s')

d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)

block_size = (N,N)
grid_size = (int(M/N),int(M/N))

t0 = time()
matmul[grid_size,block_size](d_a, d_b, d_c)
#fast_matmul[grid_size,block_size](d_a, d_b, d_c)
print(f'gpu time = {time()-t0} s')

c = d_c.copy_to_host()
print(f'{cpu=}')
print(f'{c=}')
err = np.linalg.norm(cpu - c)
print(f'{err=}')

cpu time = 0.0007545948028564453 s




gpu time = 0.32591843605041504 s
cpu=array([[-12.32922   ,  10.799813  ,  -1.9149985 , ...,   4.6531787 ,
          6.22411   ,  -0.3889591 ],
       [ -1.4291192 ,   0.7369752 ,   2.2361498 , ...,   3.9228063 ,
         -2.4182982 ,   7.3295965 ],
       [ -2.942463  , -14.044039  ,  -3.5106852 , ...,  -5.270478  ,
          6.2338905 ,   0.46176562],
       ...,
       [  4.2029166 ,  -6.642787  ,   4.9657907 , ...,   1.5351754 ,
          1.206077  ,  -4.886201  ],
       [  2.1515427 ,  -6.754644  ,   2.4469705 , ...,  -4.249156  ,
          0.03139742,  -5.661947  ],
       [ -0.72065705,   0.24282883,  -1.875034  , ...,  -8.139893  ,
          0.6097091 ,  -8.083664  ]], dtype=float32)
c=array([[-12.329219  ,  10.799812  ,  -1.9149987 , ...,   4.653178  ,
          6.2241096 ,  -0.38895994],
       [ -1.429119  ,   0.7369754 ,   2.2361493 , ...,   3.922806  ,
         -2.4182968 ,   7.3295975 ],
       [ -2.942463  , -14.044038  ,  -3.5106854 , ...,  -5.270478  ,
          6.2338