In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from scipy.optimize import minimize


# Transferring Data

In [7]:
import numpy 
# Host side: draw a 4x4 matrix of standard-normal samples and store it as
# float32, matching the `float *` parameter of the kernel used later on.
a = numpy.random.randn(4, 4).astype(numpy.float32)

# Device side: reserve exactly as many bytes as the host array occupies,
# then copy the host data into that allocation (host -> device).
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)


array([[-0.69440347,  0.25208026, -0.7883007 , -0.73613   ],
       [-0.1213868 ,  0.4582234 ,  0.66922635, -1.4988806 ],
       [-0.61748666, -0.53340596,  0.7325721 , -0.09882516],
       [-0.28886718, -0.2389829 ,  0.08443267,  0.8690153 ]],
      dtype=float32)

# Executing a Kernel

In [3]:
# Compile a CUDA kernel that doubles every element of a float buffer in place.
# Each thread handles one element; its flat index is built from the thread's
# (x, y) position using the launch-time block width (blockDim.x) as the row
# stride, so the kernel is not tied to a 4-column array.
mod=SourceModule("""
__global__ void doublify(float *a)
{
    int idx=threadIdx.x+threadIdx.y*blockDim.x;
    a[idx]*=2;
}
""")
func=mod.get_function("doublify")

# The kernel mutates the device buffer in place, so refresh it from the host
# array first; otherwise re-running this cell would double values that were
# already doubled by the previous run.
cuda.memcpy_htod(a_gpu,a)

# Launch one thread per element in a single block: x spans the columns and
# y spans the rows of the row-major host array.
# NOTE(review): a single block only works while rows*cols stays within the
# device's threads-per-block limit -- confirm before using larger arrays.
rows,cols=a.shape
func(a_gpu,block=(cols,rows,1))

# Copy the result back into a fresh host array and show before/after.
a_doubled=numpy.empty_like(a)
print(a)
cuda.memcpy_dtoh(a_doubled,a_gpu)
print(a_doubled)

# Sanity check: the device result must equal doubling the host array.
numpy.testing.assert_allclose(a_doubled,2*a)

[[-0.07021137 -0.86102444 -1.9212255   1.4122069 ]
 [-0.51889646  0.1509971   2.080334    0.03691731]
 [ 0.2966108   1.1588308  -0.5102222  -1.999049  ]
 [-1.1879143  -0.67628145 -0.716728    0.23844439]]
[[-0.14042273 -1.7220489  -3.842451    2.8244138 ]
 [-1.0377929   0.3019942   4.160668    0.07383463]
 [ 0.5932216   2.3176615  -1.0204444  -3.998098  ]
 [-2.3758285  -1.3525629  -1.433456    0.47688878]]


# Bonus: Abstracting Away the Complications

In [4]:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
a=numpy.random.randn(4,4).astype(numpy.float32)
a_gpu=gpuarray.to_gpu(a)
%timeit a_doubled=(2*a_gpu).get()
# print(a_doubled)
# print(a_gpu)

246 µs ± 847 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [5]:
# CPU baseline: time the same element-wise doubling on the host NumPy array `a`.
%timeit a*2

2.96 µs ± 54.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


# Advanced Topics: JIT Compilation with Numba

In [6]:
from numba import jit
import numpy as np
import time

# Sample input: the integers 0..99 laid out row by row in a 10x10 matrix.
x = np.arange(10 * 10).reshape((10, 10))

@jit(nopython=True)
def go_fast(a): # Function is compiled and runs in machine code
    """Return ``a`` shifted by the sum of ``tanh`` over its main diagonal.

    Parameters
    ----------
    a : 2-D array
        Indexed as ``a[i, i]``; it does not have to be square.

    Returns
    -------
    array
        ``a + trace``, where ``trace`` is the sum of ``np.tanh(a[i, i])``
        over every diagonal position present in ``a``.
    """
    # Only walk diagonal entries that actually exist. Looping over the row
    # count alone would index past the last column of a matrix with more rows
    # than columns, and nopython mode does not bounds-check indexing by default.
    diag_len = min(a.shape[0], a.shape[1])
    trace = 0.0
    for i in range(diag_len):
        trace += np.tanh(a[i, i])
    return a + trace

# Time two back-to-back calls of go_fast on the same input.
# The first measurement must not be reported as the function's speed, because
# compilation time is included in it. By the second call the function is
# already compiled, so that measurement reflects execution from cache.
for message in (
    "Elapsed (with compilation) = {}s",
    "Elapsed (after compilation) = {}s",
):
    start = time.perf_counter()
    go_fast(x)
    end = time.perf_counter()
    print(message.format(end - start))

Elapsed (with compilation) = 0.8302448028698564s
Elapsed (after compilation) = 0.0001378101296722889s
