In [1]:
from numba import cuda
from numba import float32,int32,int64

In [20]:
%pylab inline
N = 1024
gpu_res = 0
a = ones(N)
b = ones(N)
dotKernel(gpu_res,a,b)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [21]:
gpu_res

0

In [19]:

TPB = 32
N = 1024
d_res = 0
@cuda.jit('void(int64,int64[:],int64[:])')
def dotKernel(d_res,d_a,d_b):
    
    
    bw = cuda.blockDim.x
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x

    s_idx = cuda.threadIdx.x
    
    s_prod = cuda.shared.array(shape = TPB,dtype =int64)
    s_prod[s_idx] = d_a[idx] * d_b[idx]
    cuda.syncthreads()
    
    #if (s_idx == 0):
    blockSum = 0
    for j in range(bw):
        blockSum += s_prod[s_idx]

    #cuda.atomic.add(d_res,blockSum)
    d_res += blockSum

    
    

In [2]:
@cuda.jit('int32(float64, float64, int32)', device=True)
def mandel(x, y, max_iters):
    """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.
    """
    c = complex(x, y)
    z = complex(0, 0)
    for i in range(max_iters):
        z = z*z + c
        if z.real * z.real + z.imag * z.imag >= 4:
            return i
    return 255

@cuda.autojit
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    height = image.shape[0]
    width = image.shape[1]
 
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color

  warn('autojit is deprecated and will be removed in a future release. Use jit instead.')


In [3]:
import numpy as np
image = np.zeros((100, 100), dtype=np.uint8)
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 100)

In [63]:
cuda.select_device(0)

<weakproxy at 000000000A05C728 to Device at 000000000A0F2EB8>

In [64]:
@cuda.jit
def matmul(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

In [75]:
A = np.random.rand(10,10)
B = np.random.rand(10,10)
C = np.zeros([10,10])
griddim = 1
blockdim = 10
matmul[griddim,blockdim](A,B,C)

In [77]:
A*B

array([[ 0.05569361,  0.31891184,  0.02119827,  0.34066984,  0.523083  ,
         0.11828131,  0.47566831,  0.45399405,  0.49504407,  0.21590379],
       [ 0.21728661,  0.05547052,  0.09887185,  0.27963839,  0.0054043 ,
         0.75343157,  0.04429046,  0.3586107 ,  0.39104793,  0.4843393 ],
       [ 0.10798866,  0.4079104 ,  0.08335702,  0.59735901,  0.77583604,
         0.65201966,  0.2735375 ,  0.59555094,  0.0214579 ,  0.10448313],
       [ 0.0542797 ,  0.12234489,  0.19965953,  0.16907075,  0.44277902,
         0.67205745,  0.00128287,  0.21935872,  0.09338792,  0.01121087],
       [ 0.22208475,  0.37041696,  0.11514023,  0.41451599,  0.123708  ,
         0.05621587,  0.01277507,  0.42413899,  0.71561049,  0.51176432],
       [ 0.30097146,  0.24390718,  0.13322169,  0.08636955,  0.27358748,
         0.26338679,  0.02575112,  0.51787332,  0.37039584,  0.18364011],
       [ 0.33273699,  0.24454613,  0.64586645,  0.64715779,  0.04990949,
         0.14027507,  0.21066158,  0.05503497

In [76]:
A*B

array([[ 0.05569361,  0.31891184,  0.02119827,  0.34066984,  0.523083  ,
         0.11828131,  0.47566831,  0.45399405,  0.49504407,  0.21590379],
       [ 0.21728661,  0.05547052,  0.09887185,  0.27963839,  0.0054043 ,
         0.75343157,  0.04429046,  0.3586107 ,  0.39104793,  0.4843393 ],
       [ 0.10798866,  0.4079104 ,  0.08335702,  0.59735901,  0.77583604,
         0.65201966,  0.2735375 ,  0.59555094,  0.0214579 ,  0.10448313],
       [ 0.0542797 ,  0.12234489,  0.19965953,  0.16907075,  0.44277902,
         0.67205745,  0.00128287,  0.21935872,  0.09338792,  0.01121087],
       [ 0.22208475,  0.37041696,  0.11514023,  0.41451599,  0.123708  ,
         0.05621587,  0.01277507,  0.42413899,  0.71561049,  0.51176432],
       [ 0.30097146,  0.24390718,  0.13322169,  0.08636955,  0.27358748,
         0.26338679,  0.02575112,  0.51787332,  0.37039584,  0.18364011],
       [ 0.33273699,  0.24454613,  0.64586645,  0.64715779,  0.04990949,
         0.14027507,  0.21066158,  0.05503497

In [70]:
C

array([[ 2.48516786,  3.32890875,  2.55585927,  2.38882994,  3.91430149,
         3.21107991,  2.48379384,  1.87325697,  0.        ,  0.        ],
       [ 2.42383531,  1.90374652,  2.19794027,  1.9517842 ,  2.99950206,
         1.71460413,  1.46773967,  1.37396405,  0.        ,  0.        ],
       [ 2.51525363,  2.48382412,  3.12390388,  2.14617115,  3.9720185 ,
         3.26267949,  1.7242481 ,  2.18649553,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        

In [31]:
TPB = 64
@cuda.jit('void(float32[:],float32[:],float32[:])')
def dot(a,b,c):
    #idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    i = cuda.grid(1)
    c[i] = 10
    

In [32]:
griddim = 1, 2
blockdim = 3, 4
a = np.ones(100)
b = np.ones(100)
c = np.zeros(100)
dot[griddim,blockdim](a,b,c)

In [35]:
c

array([  5.39824125e-315,   5.39824125e-315,   5.39824125e-315,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+

In [34]:
stream = cuda.stream()
devary = cuda.to_device(an_array, stream=stream)
dot[griddim,blockdim](a,b,c)
devary.copy_to_host(an_array, stream=stream)

NameError: name 'an_array' is not defined

In [2]:
from numba import jit
bpg = 50
tpb = 32
n = bpg * tpb
acc = 0
@cuda.jit('void(float32[:,:],float32[:,:],float32[:,:],float32)')
def cu_square_matrix_mul(A, B, C,acc):
    sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    for i in range(bpg):
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc

In [None]:
dot()

In [3]:
import numpy as np
A = np.random.rand(10,10)
B = np.random.rand(10,10)
C = np.zeros([10,10])
cu_square_matrix_mul(A,B,C,0)

In [6]:
C

array([[  1.60683816e-314,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
          0.00000000e+000,   0.00000000e+00

In [None]:
cu_square_matrix_mul(A,B)

In [10]:
gpu_res

0

In [46]:
from numba import float32,int32,int64
TPB = 32
N = 1024
Res = 0
@cuda.jit('void(int64[:],int64[:])')
def dotKernel(d_a,d_b):
    
    
    bw = cuda.blockDim.x
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx>=N: return
    s_idx = cuda.threadIdx.x
    
    s_prod = cuda.shared.array(shape = TPB,dtype =int64)
    s_prod[s_idx] = d_a[idx] * d_b[idx]
    cuda.syncthreads()
    
    if (s_idx == 0):
        blockSum = 0
        for j in range(bw):
            blockSum += s_prod[s_idx]
        
    
    
    

In [8]:
%pylab inline
a = ones(N)
b = ones(N)

dotKernel(c,a,b)


Populating the interactive namespace from numpy and matplotlib


NameError: name 'c' is not defined

In [5]:
bpg = 50
tpb = 32
n = bpg * tpb

@cuda.jit(argtypes=[float32[:,:], float32[:,:], float32[:,:]], target='gpu')
def cu_square_matrix_mul(A, B, C):
    sA = cuda.shared.array(shape=(tpb, tpb), dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    acc = 0.
    for i in range(bpg):
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc

CudaAPIError: Call to cuLinkCreate results in UNKNOWN_CUDA_ERROR