In [None]:
# Grid resolution (rows x columns) and the iteration cap passed to every run below.
height, width = 2_000, 3_000
maxiterations = 20

In [None]:
import numpy as np
import numba
import numba.cuda
import math
import matplotlib.pyplot as plt

This time we can actually check the name of the device using the API:

In [None]:
# Ask Numba for the currently active CUDA device and display its name.
numba.cuda.get_current_device().name

Let's define a helper that regenerates the data before each run (we won't always use the second output, `fractal`):

In [None]:
def prepare(height, width):
    """Build the complex input grid and a matching zero-filled output array.

    The grid has shape ``(height, width)``: rows sweep the imaginary part from
    -1.5 to 1.5 and columns sweep the real part from -2 to 2.

    Returns:
        A ``(c, fractal)`` tuple, where ``c`` is the complex grid and
        ``fractal`` is an ``int32`` array of zeros with the same shape.
    """
    # A complex "step" makes ogrid produce that many evenly spaced points.
    imag_axis, real_axis = np.ogrid[-1.5j:1.5j:height*1j, -2:2:width*1j]
    grid = imag_axis + real_axis  # broadcasts (height, 1) + (1, width)
    return grid, np.zeros(grid.shape, dtype=np.int32)

## Numpy

Let's try a NumPy run. We use `%%time` instead of `%%timeit` because this takes several seconds to run, so we don't need a precise measurement and don't want to waste time repeating it:

In [None]:
def fractal_numpy(c, maxiterations):
    """Whole-array escape-time computation using plain NumPy.

    Starting from ``z = c``, applies ``z -> z**2 + c`` to every element
    ``maxiterations`` times. The input ``c`` is not modified.

    Returns:
        An ``int32`` array shaped like ``c`` holding, per element, the last
        (1-based) iteration at which the point had not yet diverged; 0 if it
        diverged on the very first iteration.
    """
    counts = np.zeros_like(c, dtype=np.int32)
    z = c.copy()
    escape_limit = 2**2

    for step in range(1, maxiterations + 1):
        z = z**2 + c
        escaped = np.abs(z**2) > escape_limit

        # Pin escaped points to a small value so they cannot overflow.
        z[escaped] = 2
        # Every point still bounded has survived `step` iterations.
        counts[np.logical_not(escaped)] = step

    return counts

In [None]:
# Fresh input grid; the zeroed output array that prepare() also returns is not needed here.
c, _ = prepare(height=height, width=width)

In [None]:
%%time
# Single timed run of the NumPy implementation; the result is bound to the throwaway name `_`.
_ = fractal_numpy(c, maxiterations)

## Numba

Let's do a quick check with Numba from the CPU course, just to see how fast we can get on a single CPU:

In [None]:
@numba.vectorize([numba.int32(numba.complex128, numba.int32)])
def fractal_numba_vectorize(cxy, maxiterations):
    """Escape-time count for one complex point, compiled by Numba as a ufunc.

    Starting from ``z = cxy``, repeatedly applies ``z -> z**2 + cxy`` and
    returns the zero-based index of the first iteration at which
    ``abs(z) > 2``, or ``maxiterations`` if that never happens.
    """
    value = cxy
    for iteration in range(maxiterations):
        value = value**2 + cxy
        escaped = abs(value) > 2
        if escaped:
            return iteration
    return maxiterations

In [None]:
# Rebuild the input grid for this section; only the complex grid is used.
c, _ = prepare(height=height, width=width)

In [None]:
%%timeit
# Benchmark the Numba-compiled ufunc applied to the whole grid.
fractal_numba_vectorize(c, maxiterations)

## Numba CUDA: vectorize, host memory

In [None]:
@numba.vectorize([numba.int32(numba.complex128, numba.int32)], target='cuda')
def fractal_cuda_vectorize(cxy, maxiterations):
    """Escape-time count for one complex point, compiled as a CUDA ufunc.

    Starting from ``z = cxy``, repeatedly applies ``z -> z**2 + cxy`` and
    returns the zero-based index of the first iteration at which
    ``abs(z) > 2``, or ``maxiterations`` if that never happens.
    """
    value = cxy
    for iteration in range(maxiterations):
        value = value**2 + cxy
        escaped = abs(value) > 2
        if escaped:
            return iteration
    return maxiterations

In [None]:
# Rebuild the input grid as an ordinary host (NumPy) array for this section.
c, _ = prepare(height=height, width=width)

In [None]:
%%timeit
# `c` is the host array from prepare() here, so this timing presumably also
# covers whatever host<->device transfers Numba performs for the call.
fractal_cuda_vectorize(c, maxiterations);

## Numba CUDA: vectorize, GPU memory

In [None]:
# Regenerate the input grid, then rebind `c` to a copy that lives on the GPU.
c, _ = prepare(height=height, width=width)
c = numba.cuda.to_device(c)

In [None]:
# Display `c` to see what numba.cuda.to_device returned.
c

In [None]:
%%timeit
# The input is already on the device; the result is not copied back.
fractal_cuda_vectorize(c, maxiterations)
# Wait for outstanding GPU work so the timing covers the computation itself.
numba.cuda.synchronize()

Note that we did not copy the memory back to the CPU; it's still on the GPU.

## Numba CUDA: vectorize, skip allocation

In [None]:
# Build fresh host arrays and move both the input grid and the output buffer to the GPU.
c, f = (numba.cuda.to_device(host_array) for host_array in prepare(height, width))

In [None]:
%%timeit
# Write into the preallocated device array `f` rather than a newly created output.
fractal_cuda_vectorize(c, maxiterations, out=f)
# Wait for outstanding GPU work so the timing covers the computation itself.
numba.cuda.synchronize()

## Numba CUDA: custom kernel

In [None]:
@numba.cuda.jit
def fractal_cuda_kernel(c_array, f, maxiterations):
    """CUDA kernel: escape-time count for one grid point per thread.

    Each thread handles the element at its 2-D grid position. Starting from
    ``z = c_array[x, y]`` it applies ``z -> z**2 + c`` and stores in
    ``f[x, y]`` the zero-based index of the first iteration at which
    ``abs(z) > 2``, or ``maxiterations`` if the point never escapes. This is
    the same counting convention as ``fractal_cuda_vectorize``.

    Args:
        c_array: 2-D array of complex input points.
        f: 2-D output array, indexed with the same ``[x, y]`` positions as
            ``c_array`` (so it must be at least as large in both dimensions).
        maxiterations: maximum number of iterations to perform per point.
    """
    x, y = numba.cuda.grid(2)
    # The launch grid is rounded up to whole blocks, so some threads land
    # outside the array and must do nothing.
    if x < c_array.shape[0] and y < c_array.shape[1]:
        # Read the input once and keep the running state in locals, so the
        # loop does not touch device memory on every iteration.
        cxy = c_array[x, y]
        z = cxy
        count = maxiterations
        for i in range(maxiterations):
            z = z**2 + cxy
            if abs(z) > 2:
                count = i
                break
        # Single write of the final result.
        f[x, y] = count

In [None]:
# Fresh input grid and zeroed output buffer, both transferred to the GPU for the kernel.
c, f = (numba.cuda.to_device(host_array) for host_array in prepare(height, width))

Now we have to specify a custom kernel launch, rather than having it automated.

In [None]:
# 8x8 threads per block; round the block count up along each axis so the grid
# covers the whole array even when its shape is not a multiple of the block size.
threadsperblock = (8, 8)
blockspergrid = tuple(
    math.ceil(extent / threads)
    for extent, threads in zip(c.shape, threadsperblock)
)


In [None]:
# Display the computed number of blocks along each axis.
blockspergrid

In [None]:
%%timeit
# Launch the kernel with the explicit grid/block configuration computed above.
fractal_cuda_kernel[blockspergrid, threadsperblock](c, f, maxiterations)
# NOTE(review): presumably this copies the device array `f` back to a host
# array (the result is discarded), which would put the transfer inside the
# measurement, unlike the vectorize timings earlier; confirm this is intended.
np.array(f, dtype=f.dtype)
numba.cuda.synchronize()

We can plot this, just in case we made a mistake (even though we ran timeit above, plotting is valid, since we are reusing the same preallocated memory location):

In [None]:
# Convert the kernel's output buffer `f` to a NumPy array and draw it as an image.
plt.imshow(np.array(f, dtype=f.dtype));