In [None]:
from numba import cuda

In [None]:
import numpy as np

In [None]:
# A kernel launch needs two things: the number of blocks per grid and the number of threads per block

In [None]:
# simple scalar addition

In [None]:
@cuda.jit
def myadd(x, y, z):
    """Store ``x[i] + y[i]`` in ``z[i]``, where ``i`` is the thread's x-index in its block.

    The block index is not used, so threads in different blocks address
    the same elements.
    """
    tid = cuda.threadIdx.x
    z[tid] = x[tid] + y[tid]

In [None]:
# Launch configuration: a single block containing a single thread.
blockspergrid = 1
threadsperblock = 1
X = np.array([1])
Y = np.array([2])
Z = np.array([0])  # output buffer, written by the kernel
# Launch with the named configuration instead of repeating the literals,
# so editing the two variables above actually changes the launch.
myadd[blockspergrid, threadsperblock](X, Y, Z)

In [None]:
# Display the output buffer; with X = [1] and Y = [2] it should read [3].
Z

In [None]:
# Vector addition

In [None]:
@cuda.jit
def add_vectors(x, y, z):
    """Add two vectors element-wise, one element per thread.

    The element is chosen by the thread's x-index within its block only;
    the block index plays no part in the addressing.
    """
    tid = cuda.threadIdx.x
    z[tid] = x[tid] + y[tid]

In [None]:
# Launch configuration for the vector examples: one block of eight threads.
blockspergrid, threadsperblock = 1, 8

In [None]:
# Eight-element inputs (1..8 and all ones) plus a zeroed output buffer.
X = np.arange(8) + 1
Y = np.ones(8)
Z = np.zeros(8)
add_vectors[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# try with a larger vector

In [None]:
# Sixteen-element inputs (1..16 and all ones), launched with the
# configuration that is currently in effect.
X = np.arange(16) + 1
Y = np.ones(16)
Z = np.zeros(16)
add_vectors[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# What went wrong?

In [None]:
# Give the block as many threads as there are elements, then retry
# with a fresh output buffer.
threadsperblock = 16
Z = np.zeros(16)
add_vectors[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# Go back to eight threads per block and use two blocks instead.
blockspergrid, threadsperblock = 2, 8
Z = np.zeros(16)
add_vectors[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# what went wrong?

In [None]:
@cuda.jit
def add_blocked_vector(x, y, z):
    """Add two vectors element-wise using a global (grid-wide) thread index.

    The index combines the block index, the block size and the thread's
    index within the block, so each thread of a 1D launch handles a
    distinct element.

    Threads whose index falls beyond the end of ``z`` do nothing, so the
    grid may contain more threads than there are elements. ``x`` and
    ``y`` are assumed to be at least as long as ``z``.
    """
    ix = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Guard against launches larger than the array: Numba CUDA kernels do
    # not bounds-check, so an unguarded access would touch memory outside z.
    if ix < z.shape[0]:
        z[ix] = x[ix] + y[ix]

In [None]:
# Fresh output buffer, then launch the block-aware kernel with the
# configuration that is currently in effect.
Z = np.zeros(16)
add_blocked_vector[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# Adding 2d matrices!

In [None]:
# Two 8 x 8 matrices of random integers in [0, 10); X is drawn first, then Y.
X, Y = (np.random.randint(0, 10, size=(8, 8)) for _ in range(2))

In [None]:
@cuda.jit
def mat_add(x, y, z):
    """Add two matrices element-wise, one element per thread.

    The thread's x-index within its block selects the first axis and its
    y-index the second; the block index is not used.
    """
    row = cuda.threadIdx.x
    col = cuda.threadIdx.y
    z[row, col] = x[row, col] + y[row, col]

In [None]:
# One block of 8 x 8 threads and an 8 x 8 zeroed output buffer.
blockspergrid, threadsperblock = 1, (8, 8)
Z = np.zeros(shape=(8, 8))
mat_add[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# Host-side reference sum to compare against the kernel output.
print(np.add(X, Y))

In [None]:
# try again with blocks of 4 by 4 threads!
blockspergrid = 4
threadsperblock = (4, 4)
@cuda.jit
def mat_add_blocked(x, y, z):
    """Add two matrices element-wise using global 2D thread indices.

    Each thread derives its (first-axis, second-axis) position from its
    block index, the block dimensions and its index within the block.
    Threads whose position lies outside ``z`` do nothing. ``x`` and ``y``
    are assumed to be at least as large as ``z``.
    """
    ix = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    iy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    # The integer grid size above gives four blocks along x only, so ix
    # runs up to 15 while the matrices have 8 rows. Numba CUDA kernels do
    # not bounds-check, so the guard keeps those threads from accessing
    # memory outside the arrays.
    if ix < z.shape[0] and iy < z.shape[1]:
        z[ix, iy] = x[ix, iy] + y[ix, iy]
Z = np.zeros((8, 8))
mat_add_blocked[blockspergrid, threadsperblock](X, Y, Z)
print(Z.astype(int))

In [None]:
# What went wrong?

In [None]:
# A 2 x 2 grid of 4 x 4 blocks: 8 x 8 threads in total.
blockspergrid = (2, 2)
threadsperblock = (4, 4)
@cuda.jit
def mat_add_blocked(x, y, z):
    """Add two matrices element-wise using global 2D thread indices.

    Each thread derives its (first-axis, second-axis) position from its
    block index, the block dimensions and its index within the block.
    Threads whose position lies outside ``z`` do nothing, so the grid may
    be larger than the matrix. ``x`` and ``y`` are assumed to be at least
    as large as ``z``.
    """
    ix = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    iy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    # Numba CUDA kernels do not bounds-check; skip threads that fall
    # outside the output so other launch configurations stay safe.
    if ix < z.shape[0] and iy < z.shape[1]:
        z[ix, iy] = x[ix, iy] + y[ix, iy]
Z = np.zeros((8, 8))
mat_add_blocked[blockspergrid, threadsperblock](X, Y, Z)
print(Z.astype(int))

In [None]:
# Exercise: Try all this with different block, thread config!

In [None]:
# Dot product of 1D vectors

In [None]:
# Launch configuration for the dot-product attempt: one block of eight threads.
blockspergrid = 1
threadsperblock = 8
@cuda.jit
def mydot(x, y, z):
    """First attempt at a dot product: every thread adds its product into ``z[0]``.

    NOTE(review): all threads perform an unsynchronised read-modify-write
    on the same element, so updates can be lost. This looks intentional
    (a later cell asks "What went wrong?"), so the code is left as is.
    """
    ix = cuda.threadIdx.x
    z[0] += x[ix] * y[ix]  # same element is updated by every thread

In [None]:
# All-ones inputs of length eight and a one-element output buffer.
X = np.ones(8)
Y = np.ones(8)
Z = np.zeros(1)
mydot[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# What went wrong?

In [None]:
# One slot per thread for the partial products.
temp_Z = np.zeros(8)
@cuda.jit
def mydot(x, y, temp_z):
    """Write each thread's product ``x[i] * y[i]`` to its own slot of ``temp_z``.

    No thread touches another thread's slot; summing the slots is left
    to the caller.
    """
    tid = cuda.threadIdx.x
    temp_z[tid] = x[tid] * y[tid]

In [None]:
# Compute the per-thread products on the device, then reduce them on the host.
X = np.ones(8)
Y = np.ones(8)
mydot[blockspergrid, threadsperblock](X, Y, temp_Z)
print(np.sum(temp_Z))

In [None]:
# Single thread execution
@cuda.jit
def single_thread_dot(x, y, z):
    """Compute the dot product of ``x`` and ``y`` serially into ``z[0]``.

    ``z[0]`` is cleared first and then accumulates ``x[k] * y[k]`` over
    the first axis of ``x``. Every thread that runs the kernel performs
    the whole loop, so it is meant to be launched with a single thread.
    """
    length = x.shape[0]
    z[0] = 0.
    for k in range(length):
        z[0] += x[k] * y[k]
# One block containing one thread.
blockspergrid, threadsperblock = 1, 1
Z = np.zeros(1)
single_thread_dot[blockspergrid, threadsperblock](X, Y, Z)
print(Z)

In [None]:
# Matrix multiplication!

In [None]:
# Random 3 x 3 integer matrix and its pseudo-inverse; their product is
# printed as the host-side reference.
X = np.random.randint(0, 10, size=(3, 3))
print("X:\n", X)
Y = np.linalg.pinv(X)
print("Y:\n", Y)
reference_product = X.dot(Y)
print("X * Y:\n", reference_product)
del reference_product  # leave the namespace as it was

In [None]:
# One block of 3 x 3 threads: one thread per element of the 3 x 3 product.
blockspergrid = 1
threadsperblock = (3, 3)
@cuda.jit
def mat_multiply(x, y, z):
    """Matrix product ``z = x . y`` with one thread per output element.

    The thread's x/y indices within its block select the output row and
    column; the block index is not used, so the kernel expects a single
    block whose shape covers ``z``. Threads outside ``z`` do nothing.

    The inner dimension is taken from ``x.shape[1]`` (``y`` is assumed to
    have that many rows), so non-square operands work as well. ``z`` is
    overwritten, so it does not need to be zeroed beforehand.
    """
    ix = cuda.threadIdx.x
    iy = cuda.threadIdx.y
    # Skip threads that do not correspond to an output element; Numba CUDA
    # kernels do not bounds-check array accesses.
    if ix < z.shape[0] and iy < z.shape[1]:
        # Accumulate locally and store once, instead of a read-modify-write
        # on z for every term.
        acc = 0.0
        for k in range(x.shape[1]):
            acc += x[ix, k] * y[k, iy]
        z[ix, iy] = acc

In [None]:
# Zeroed 3 x 3 output buffer, then launch the multiplication kernel.
Z = np.zeros(shape=(3, 3))
mat_multiply[blockspergrid, threadsperblock](X, Y, Z)

In [None]:
# Display the kernel's output; compare it with the "X * Y" product printed earlier.
Z

In [None]:
# Exercise: Multiply two non-square matrices of shapes (3, 4) and (4, 5)