In [1]:
import numpy as np
import math
from numba import cuda

In [2]:
@cuda.jit
def kernel(A, B, C):
    """Element-wise matrix addition on the GPU: C = A + B.

    Each thread handles at most one element, selected by the thread's
    absolute position in the 2-D launch grid.
    """
    # Absolute (row, col) position of this thread within the 2-D grid
    row, col = cuda.grid(2)

    # The grid may be larger than the matrix; surplus threads do nothing
    if row >= A.shape[0] or col >= A.shape[1]:
        return

    # C_ij = A_ij + B_ij
    C[row, col] = A[row, col] + B[row, col]

In [3]:
# Side length of the square test matrices.
N = 10240

# Seed the generator so the inputs are identical on every Restart & Run All.
SEED = 0
rng = np.random.default_rng(SEED)

# Draw the inputs directly as float32; this avoids the temporary float64
# array (twice the size) that randn(...).astype(np.float32) would allocate.
A = rng.standard_normal((N, N), dtype=np.float32)
B = rng.standard_normal((N, N), dtype=np.float32)

# Output buffer, filled in by the GPU kernel.
C = np.zeros((N, N), dtype=np.float32)

In [4]:
# Threads per block along each grid axis (16 x 16 = 256 threads per block).
threads_per_block = (16, 16)

# Blocks per axis, rounded up so the grid covers all N rows/columns even when
# N is not a multiple of the block size. Derived from threads_per_block so the
# two settings cannot drift apart when the block shape is tuned.
number_of_blocks = tuple(math.ceil(N / threads) for threads in threads_per_block)

In [5]:
# Move the inputs to the GPU explicitly. Passing host arrays straight to the
# kernel makes Numba copy every argument to the device and back again, which
# here would needlessly upload C and download A and B.
d_A = cuda.to_device(A)
d_B = cuda.to_device(B)
d_C = cuda.device_array_like(C)  # uninitialised output buffer on the device

kernel[number_of_blocks, threads_per_block](d_A, d_B, d_C)

# Copy only the result back into the existing host array C.
d_C.copy_to_host(C)



In [6]:
# Check the GPU result against the same sum computed on the CPU with NumPy
np.all(np.equal(C, A + B))

True