In [1]:
import numba
from numba import jit, int32, prange, vectorize, float64, cuda
import numpy as np
import math

### Offload the computation to a GPU
* Assumptions:
    * N <= 512
* Hints:
    * Launch one block with threads <= 512

In [2]:
@cuda.jit
def native_python_monte_kernel(array_a, array_b, array_c):
    # Thread id in a 1D block
    tx = cuda.threadIdx.x
 
    if tx < array_a.size:  # Check array boundaries
        array_c[tx] = 1 if array_a[tx]**2 + array_b[tx]**2 <= 1.0  else 0

In [4]:
blocks = 2
threads = 256
N = 1000000
array_a = np.random.random(N)
array_b = np.random.random(N)
array_c = np.zeros(N)



In [6]:

native_python_monte_kernel[blocks, threads](array_a, array_b, array_c)
print(4.0 * np.sum(array_c) / N)

0.00082
