In [3]:
from numba import jit
import random

@jit(nopython=True)
def monte_carlo_pi(nsamples):
    """Estimate pi by sampling points uniformly in the unit square.

    Draws ``nsamples`` points ``(px, py)`` with both coordinates taken from
    ``random.random()`` and counts how many satisfy ``px**2 + py**2 < 1``,
    i.e. fall inside the quarter unit circle.  That fraction approximates
    pi / 4, so it is scaled by 4 before being returned.

    Parameters
    ----------
    nsamples : int
        Number of random points to draw.  Zero makes the final division fail.

    Returns
    -------
    float
        The Monte Carlo estimate of pi.
    """
    inside = 0
    for _ in range(nsamples):
        # Abscissa first, ordinate second: the draw order fixes the random stream.
        px = random.random()
        py = random.random()
        if px ** 2 + py ** 2 < 1.0:
            inside += 1
    return 4.0 * inside / nsamples

In [4]:
import numpy as np
# Estimate pi from ten million random points; the value is shown as the cell output.
# NOTE(review): a @jit function without a signature is compiled on its first call,
# so the first run of this cell presumably also pays the compilation cost.
monte_carlo_pi(10000000)

3.141916

In [69]:
from numba import cuda, float32
import time
import math
@cuda.jit
def increment_by_one(an_array):
    """CUDA kernel: repeatedly replace each element with sqrt(abs(element)).

    Despite the name, nothing is incremented.  Every thread owns one element
    of ``an_array`` (selected by its flattened 1D grid index) and overwrites
    it in place one million times with ``math.sqrt(abs(value))``.

    Parameters
    ----------
    an_array : 1D array
        Modified in place.  Threads whose index is past the end do nothing.
    """
    # Flattened index of this thread in a 1D grid:
    # threadIdx.x + blockIdx.x * blockDim.x
    pos = cuda.grid(1)

    # Surplus threads in the last block have no element to work on.
    if pos >= an_array.size:
        return

    for _ in range(1000000):
        an_array[pos] = math.sqrt(abs(an_array[pos]))

# Benchmark input: 2**15 standard-normal samples.
an_array = np.random.normal(0,1,size=2**15)
print(an_array[:10])

# 1D launch configuration: enough 64-thread blocks to cover every element
# (ceiling division).
threadsperblock = 64
blockspergrid = (an_array.size + (threadsperblock - 1)) // threadsperblock

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
# NOTE(review): the measured interval covers the whole call on a host array, so
# it presumably includes host<->device copies and, on a first launch, JIT
# compilation of the kernel.
t1 = time.perf_counter()
increment_by_one[blockspergrid, threadsperblock](an_array)
t2 = time.perf_counter()
print(an_array[:10])
print(t2-t1)

[-0.63200664  0.35868749 -0.88308861  0.85521855  0.40723921 -0.41009166
 -0.04985493  0.0773229   1.69922401  0.79534132]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
14.17832064628601


In [35]:
@jit(nopython=True)
def inc(an_array, n_iter=1000000):
    """Apply ``sqrt(abs(x))`` element-wise ``n_iter`` times and return the result.

    Parameters
    ----------
    an_array : numpy.ndarray
        Input values.  The caller's array is not written to: each pass rebinds
        the local name to the freshly computed array.
    n_iter : int, optional
        Number of passes.  Defaults to 1000000, the count that used to be
        hard-coded, so existing ``inc(arr)`` calls behave as before.

    Returns
    -------
    numpy.ndarray
        The values after ``n_iter`` passes; the input values unchanged when
        ``n_iter`` is zero or negative.
    """
    for _ in range(n_iter):
        an_array = np.sqrt(np.abs(an_array))
    return an_array

# Time inc() with a monotonic high-resolution clock (time.time() can jump if the
# system clock is adjusted).  The random array is created inside the timed
# region, so its generation is part of the measurement.
# NOTE(review): if this is the first call of inc, numba compilation time is
# presumably included as well.
t1 = time.perf_counter()
an_array = inc(np.random.normal(0,1,size=2**15))
t2 = time.perf_counter()
print(t2-t1)

24.22390604019165


In [39]:
# Same measurement as before, but the input is generated before the clock
# starts, so only the inc() call itself is timed.
an_array = np.random.normal(0,1,size=2**15)
# perf_counter() is monotonic and high-resolution, unlike time.time().
t1 = time.perf_counter()
an_array = inc(an_array)
t2 = time.perf_counter()
print(t2-t1)

24.132588863372803


In [84]:
import matplotlib.pyplot as plt
# plt.scatter(x[:,0],x[:,1],alpha=0.02)

@cuda.jit
def mcmc(data):
    """CUDA kernel: per-block sum of Gaussian log-density terms via tree reduction.

    Each thread takes one row ``data[idx]`` (two coordinates) and evaluates
    ``-((theta0 - x0)**2 + (theta1 - x1)**2) / (2 * 0.1)`` with ``theta`` fixed
    at ``(0, 0)``.  The per-thread values are stored in block-shared memory and
    summed with a halving-stride reduction, leaving the block total in
    ``shared[0, :]``.

    Parameters
    ----------
    data : 2D array, one observation per row
        Only columns 0 and 1 are read.  Threads whose index is past the last
        row contribute 0 to the sum.

    Notes
    -----
    * The halving loop only covers every entry when the block size is a power
      of two, and the shared buffer has 2**11 rows, so the block size must not
      exceed that.
    * NOTE(review): the reduced value is never copied out of shared memory, so
      the kernel currently has no result visible to the host.  Exposing it
      needs an output argument, which would change the call signature.
    """
    shared = cuda.shared.array(shape=(2**11, 2), dtype=float32)
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x  # Block ID
    bw = cuda.blockDim.x  # Block Size
    idx = bw * ty + tx

    theta = (0, 0)
    # Guard the global read: the grid may contain more threads than rows.
    # Out-of-range threads still take part in the reduction (with a neutral 0)
    # so that every thread in the block reaches each syncthreads() call.
    if idx < data.shape[0]:
        x = data[idx]
        log_p = -(((theta[0] - x[0]) ** 2) / (2 * 0.1) + ((theta[1] - x[1]) ** 2) / (2 * 0.1))
    else:
        log_p = 0.0

    # Same value in both columns, written element by element.
    shared[tx, 0] = log_p
    shared[tx, 1] = log_p
    # All writes must be visible before any thread reads a neighbour's slot.
    cuda.syncthreads()

    # Tree reduction: at each step the lower half adds the upper half.
    s = bw // 2
    while s > 0:
        if tx < s:
            shared[tx, 0] += shared[tx + s, 0]
            shared[tx, 1] += shared[tx + s, 1]
        cuda.syncthreads()
        s >>= 1
    
    
#     for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
#     if (tid < s) {
#     sdata[tid] += sdata[tid + s];
#     }
#     __syncthreads();
#     }


# Prior distribution: draw the mean theta_0 from N([1, 1], [[1, 0.5], [0.5, 1]]).
theta_0 =  np.random.multivariate_normal([1,1],cov=[[1, 0.5],[0.5, 1]])
# Observations: 2**5 points scattered around theta_0 with covariance 0.1 * I.
data = np.random.multivariate_normal(theta_0,cov=[[0.1, 0],[0, 0.1]],size=2**5)
threadsperblock = 32
# The kernel uses one thread per observation (row).  data.size would count
# every scalar (rows * 2) and launch twice as many threads as there are rows,
# so the grid is sized from the row count instead.
blockspergrid = (data.shape[0] + (threadsperblock - 1)) // threadsperblock
mcmc[blockspergrid, threadsperblock](data)


In [60]:
from scipy.stats import multivariate_normal as mn
# Density at the point [0, 0] of a bivariate normal with mean [1, 1] and
# covariance [[1, 0.5], [0.5, 1]]; the value is shown as the cell output.
mn.pdf([0,0],mean=[1,1],cov=[[1, 0.5],[0.5, 1]])

0.09435389770895924