In [1]:
import torch
from torch import tensor


Whereas before we vectorized away the innermost loop of matmul, in this version we first focus on parallelizing the outer loops that iterate through the entries of the target array. Each invocation accepts a `grid = (i, j)` parameter to indicate the target coordinate.

In [2]:
def matmul(grid, a, b, c):
    """Compute one entry of the product ``a @ b`` and store it in ``c``.

    Args:
        grid: Pair ``(i, j)`` naming the entry of ``c`` to compute.
        a: Left operand, indexed as ``a[i, k]``.
        b: Right operand, indexed as ``b[k, j]``.
        c: Output, written in place at ``c[i, j]``.

    Coordinates at or beyond the shape of ``c`` are ignored, so the function
    may safely be invoked on a grid larger than the output.
    """
    row, col = grid
    # Nothing to do for coordinates outside the output.
    if row >= c.shape[0] or col >= c.shape[1]:
        return
    acc = 0.
    for k in range(a.shape[1]):
        acc += a[row, k] * b[k, col]
    c[row, col] = acc


In [3]:
from pathlib import Path
from urllib.request import urlretrieve
import gzip, pickle

# Remote location of the pickled MNIST archive and the local path it is cached at.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download only when the archive is not already present on disk.
if not path_gz.exists():
    urlretrieve(MNIST_URL, path_gz)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this is only acceptable if the download source above is trusted.
with gzip.open(path_gz, 'rb') as f:
    # The archive unpickles to a 3-tuple: two (x, y) pairs and a third element
    # that is discarded here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Wrap the four loaded objects as torch tensors.
x_train,y_train,x_valid,y_valid = map(tensor, (x_train,y_train,x_valid,y_valid))

# Fixed seed so the random weights are reproducible across runs.
torch.manual_seed(1)
weights = torch.randn(784, 10)
bias = torch.zeros(10)

# Small operands for the slow Python versions: the first 5 rows of x_valid
# multiplied by the full weight matrix.
m1 = x_valid[:5]
m2 = weights
ar, ac = m1.shape 
br, bc = m2.shape

# Reference result computed with a plain triple loop.
t1 = torch.zeros(ar, bc)

for i in range(ar):         # 5
    for j in range(bc):     # 10
        for k in range(ac): # 784
            t1[i, j] += m1[i, k] * m2[k, j]


In [4]:
# Invoke the kernel for a single grid coordinate: only entry (0, 0) of `res` is written.
res = torch.zeros(ar, bc)
matmul((0, 0), m1, m2, res)
res


tensor([[-10.9417,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000]])

We define a harness for calling `matmul` on each entry.

In [5]:
def launch_kernel(kernel, grid_x, grid_y, *args, **kwargs):
    """Call ``kernel`` once for every coordinate of a ``grid_x`` by ``grid_y`` grid.

    Args:
        kernel: Callable invoked as ``kernel((i, j), *args, **kwargs)``.
        grid_x: Number of values taken by the first coordinate ``i``.
        grid_y: Number of values taken by the second coordinate ``j``.
        *args: Extra positional arguments forwarded to every call.
        **kwargs: Extra keyword arguments forwarded to every call.

    Coordinates are visited sequentially with ``j`` varying fastest.
    """
    coords = ((x, y) for x in range(grid_x) for y in range(grid_y))
    for coord in coords:
        kernel(coord, *args, **kwargs)


We give it a go.

In [6]:
# Fresh output buffer, then one kernel call per (i, j) of an ar x bc grid.
res = torch.zeros(ar, bc)
launch_kernel(matmul, ar, bc, m1, m2, res)
res


tensor([[-10.9417,  -0.6844,  -7.0038,  -4.0066,  -2.0857,  -3.3588,   3.9127,
          -3.4375, -11.4696,  -2.1153],
        [ 14.5430,   5.9977,   2.8914,  -4.0777,   6.5914, -14.7383,  -9.2787,
           2.1577, -15.2772,  -2.6758],
        [  2.2204,  -3.2171,  -4.7988,  -6.0453,  14.1661,  -8.9824,  -4.7922,
          -5.4446, -20.6758,  13.5657],
        [ -6.7097,   8.8998,  -7.4611,  -7.8966,   2.6994,  -4.7260, -11.0278,
         -12.9776,  -6.4443,   3.6376],
        [ -2.4444,  -6.4034,  -2.3984,  -9.0371,  11.1772,  -5.7724,  -8.9214,
          -3.7862,  -8.9827,   5.2797]])

In [7]:
import numpy as np
from numba import cuda
from fastcore.test import *

Using `cuda.jit`, the indices are provided by `cuda.grid(2)`. In general, the GPU may attempt to evaluate the function beyond the shape of `c`, so we guard the body with a bounds check on `i` and `j`.

In [8]:
@cuda.jit
def matmul(a,b,c):
    """CUDA kernel: each thread computes one entry of ``a @ b`` into ``c``.

    The thread's coordinate comes from ``cuda.grid(2)``. Threads whose
    coordinate falls outside the shape of ``c`` return without writing.
    """
    row, col = cuda.grid(2)
    # The launch grid may be larger than the output, so skip extra threads.
    if row >= c.shape[0] or col >= c.shape[1]:
        return
    acc = 0.
    for k in range(a.shape[1]):
        acc += a[row, k] * b[k, col]
    c[row, col] = acc


We set up the testing environment, and send all the relevant tensors to the GPU.

In [9]:
# Reference product via the `@` operator, used later to check the kernel's output.
tr = x_train @ weights
# Output buffer with the same shape as the reference (np.zeros defaults to float64).
r = np.zeros(tr.shape)
# Copy both inputs and the output buffer to the CUDA device.
m1g, m2g, rg = map(cuda.to_device, (x_train, weights, r))


We shall execute 16 x 16 threads for each CUDA block, and compute the number of blocks in each direction that we shall use. If `c` has shape strictly smaller than `blocks * TPB`, then `matmul` will be run on invalid indices, hence the bounds checking in our definition previously.

In [10]:
import math

# Threads per block along each of the two axes.
TPB = 16
rr, rc = r.shape
# Round up so the grid of blocks covers every row and column of the output.
blocks = (math.ceil(rr / TPB), math.ceil(rc / TPB))
blocks


(3125, 1)

Testing correctness.

In [11]:
# Launch the kernel with `blocks` blocks of TPB x TPB threads each.
matmul[blocks, (TPB, TPB)](m1g, m2g, rg)
# Copy the result back to the host and compare it with the reference product.
r = rg.copy_to_host()
test_close(tr, r, eps=1e-3)


Testing performance.

In [12]:
%%timeit -n 10
# The timed region covers both the kernel launch and the device-to-host copy.
matmul[blocks, (TPB, TPB)](m1g, m2g, rg)
r = rg.copy_to_host()


7.85 ms ± 590 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Finally, we evaluate the speed of the inbuilt `matmul` of PyTorch on GPU.

In [13]:
# Move both operands to the GPU as torch tensors.
m1c, m2c = x_train.cuda(), weights.cuda()


In [14]:
# Multiply on the GPU with PyTorch's `@`, then copy the result back to the CPU.
r = (m1c @ m2c).cpu()


Testing performance; we observe a roughly 4x speedup over our kernel (7.85 ms vs. 2.05 ms).

In [15]:
%timeit -n 10 r = (m1c @ m2c).cpu()


2.05 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
