In [1]:
from numba import cuda, vectorize, float32
from scipy import sparse
import random
import math
import string

import time
import numpy as np

In [89]:
np.random.seed(1234)
M = 800
N = 400

a = np.random.randint(5, size=(M, N))
b = np.random.randint(5, size=(M, N))

start_time = time.time()
c_true = np.matmul(a, b.T)
end_time = time.time()

end_time-start_time

0.47931694984436035

In [90]:
a.shape

(800, 400)

In [91]:
c_true.shape

(800, 800)

In [92]:
a.shape, b.shape, c_true.shape

((800, 400), (800, 400), (800, 800))

In [108]:
TPB_x = 8 #<---threads per block
TPB_y = 4 #<---threads per block
@cuda.jit
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB_x, TPB_y), dtype=float32)
    sB = cuda.shared.array(shape=(TPB_x, TPB_y), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB_y]
        sB[tx, ty] = B[tx + i * TPB_x, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB_y):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp

In [109]:
c = np.zeros(shape=(a.shape[0], b.shape[0]))

In [110]:
a.shape, b.shape

((800, 400), (800, 400))

In [111]:
blockspergrid_x = int(np.ceil(c.shape[0]/TPB_x))
blockspergrid_y = int(np.ceil(c.shape[1]/TPB_y))

In [112]:
blockspergrid_x, blockspergrid_y

(100, 200)

In [113]:
blockspergrid_x*TPB_x, blockspergrid_y*TPB_y

(800, 800)

In [119]:
start_time = time.time()
fast_matmul[[blockspergrid_x, blockspergrid_y], [TPB_x, TPB_y]](a, b.T, c)
end_time = time.time()

In [120]:
c

array([[1595., 1582., 1611., ..., 1608., 1636.,  796.],
       [1641., 1663., 1619., ..., 1664., 1638.,  787.],
       [1651., 1550., 1489., ..., 1572., 1566.,  762.],
       ...,
       [1558., 1482., 1540., ..., 1527., 1581.,  771.],
       [1611., 1591., 1597., ..., 1552., 1564.,  811.],
       [1813., 1697., 1736., ..., 1720., 1693.,  813.]])

In [121]:
np.matmul(a, b.T)

array([[1558, 1635, 1548, ..., 1612, 1652, 1506],
       [1625, 1595, 1596, ..., 1662, 1660, 1590],
       [1549, 1609, 1530, ..., 1554, 1600, 1481],
       ...,
       [1551, 1514, 1498, ..., 1593, 1624, 1466],
       [1570, 1674, 1634, ..., 1622, 1577, 1558],
       [1714, 1704, 1645, ..., 1738, 1719, 1682]])

In [28]:
np.array_equal(c, np.matmul(a, b.T))

True

In [91]:
num_of_chars = 208

a = np.array([[random.randrange(0, 2) for i in range(num_of_chars)] for i in range(200)])
b = np.array([[random.randrange(0, 2) for i in range(num_of_chars)] for i in range(400)])

b_t = np.transpose(b)

start_time = time.time()
c_true = np.matmul(a, b_t)
end_time = time.time()

end_time-start_time

0.03090977668762207

In [100]:
a = sparse.csr_matrix(a)
b_t = sparse.csr_matrix(b_t)

In [104]:
a

<200x208 sparse matrix of type '<class 'numpy.int32'>'
	with 20656 stored elements in Compressed Sparse Row format>

In [101]:
TPB = 128 #<---threads per block

blockspergrid_x = int(math.ceil(a.shape[0] / TPB))
blockspergrid_y = int(math.ceil(b.shape[0] / TPB))

blockspergrid = (blockspergrid_x, blockspergrid_y)

TPB, blockspergrid

(128, (2, 2))

In [102]:
start_time  =time.time()
matmul[[TPB, TPB], blockspergrid](a, b_t, c)
end_time  =time.time()

ValueError: cannot determine Numba type of <class 'scipy.sparse.csr.csr_matrix'>

In [95]:
end_time-start_time

0.009992361068725586