In [1]:
import numpy as np

from numba import cuda, vectorize

In [2]:
@cuda.jit
def f(a, b, c):
    """Element-wise addition kernel: store ``a[i] + b[i]`` into ``c[i]``.

    Each thread handles at most one element, chosen by its absolute
    position in the 1D launch grid. Nothing is returned; the result is
    written into ``c``.
    """
    # Absolute 1D thread index, i.e. threadIdx.x + (blockIdx.x * blockDim.x)
    idx = cuda.grid(1)

    # Threads positioned past the end of the output have nothing to do
    if idx >= len(c):
        return

    c[idx] = a[idx] + b[idx]

In [3]:
N = 5000000  # number of elements in each input vector
SEED = 0     # fixed seed so a fresh "Restart & Run All" generates the same inputs

# Seed NumPy's global generator before drawing the random inputs
np.random.seed(SEED)

# Build the inputs on the host, then copy them to the GPU
a = cuda.to_device(np.random.random(N))
b = cuda.to_device(np.random.random(N))

# Device-side buffer created from the shape/dtype of `a`; it receives the kernel result
c = cuda.device_array_like(a)

In [4]:
# Ask Numba for a 1D launch configuration sized for len(a) elements, then run the kernel
launch = f.forall(len(a))
launch(a, b, c)

# Copy the result back to host memory; as the last expression it is also displayed
c.copy_to_host()


array([0.71352108, 0.36048853, 1.59320618, ..., 1.00591074, 1.33982384,
       0.66250398])

In [5]:
# Show the Python type of the object that @cuda.jit produced for `f`
type(f)

numba.cuda.compiler.Dispatcher

In [8]:

# Small host-side integer inputs for the ufunc demo
a = np.arange(1, 5)  # 1, 2, 3, 4
b = a * 10           # 10, 20, 30, 40

In [9]:
# A GPU ufunc needs both an explicit type signature and target='cuda'
@vectorize(['int64(int64, int64)'], target='cuda')
def add_ufunc(x, y):
    """Add two scalars; ``vectorize`` turns this into an element-wise ufunc."""
    total = x + y
    return total


In [10]:
# Apply the ufunc element-wise to the two arrays; the result is this cell's output
add_ufunc(a, b)



array([11, 22, 33, 44])