# Install dependency

In [2]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 12.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2021.2.8.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
Collecting mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.8 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=626972 sha256=1d8196a38656b340f48c8e1dfbdc5c16256e52f316a4f5cf4b12ae70c14e244d
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdone
  

# Exercise

In [None]:
# create three 1D integer (int32) arrays with 1M items each
# A array - contains items from 1 to 1M
# B array - contains items from 1 to 1M
# C array - contains random items in range (0-100)
# Operation on the arrays - Output = A * C - B - applied element-wise (items at the same index)
# create a CPU function for this operation called 'cpu_processing'
# create a GPU kernel with a function called 'gpu_processing'
# the block size has to be a power of 2 (e.g. 64, 128, 256)
# measure the computation times of the CPU and GPU functions
# print the last ten items of the CPU and GPU output arrays

In [None]:
import time
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.autoinit import context
from pycuda.compiler import SourceModule
import numpy
import random

from pycuda.compiler import SourceModule
# Exercise skeleton: every `...` below is a placeholder to be replaced with your own code.

# TODO: pass the CUDA C source of the 'gpu_processing' kernel as a string
mod = SourceModule(...)

def cpu_processing(A, B, C):
  # CPU reference implementation: Output = A * C - B, item by item
  # TODO: create the Output array and fill it in
 ...
  return Output

# number of items in each array (the task asks for 1M)
numElements = ...

# TODO: pass the name of the kernel defined in the CUDA source above
gpu_processing = mod.get_function(...)

# TODO: create the three int32 input arrays with numElements items each
a = ...
b = ...
c = ...

# host array that will receive the GPU result (same shape and dtype as a)
dest = numpy.empty_like(a)

for i in range(0, numElements):
  # TODO: generate data for a[i], b[i], c[i]

# time the CPU version
start = time.time()
output = cpu_processing(a, b, c)
stop = time.time()
print("\nCPU Processing time: {0} s".format(round(stop-start, 3)))
# TODO: print the last ten items of the CPU output
print("CPU output: "+str(...))

# TODO: presumably allocate a_gpu, b_gpu, c_gpu, dest_gpu (they are freed at the end) and copy the inputs to the GPU
...

# TODO: choose a power-of-2 block size and a grid with enough blocks to cover numElements
blockSize = ...
gridDim = ...
print("Grid dimension: "+str(gridDim))

# TODO: presumably start the GPU timer here
...

# TODO: launch the kernel with its arguments and the block/grid dimensions
gpu_processing(...)
# wait for the kernel to finish before the result is read or the timer is stopped
context.synchronize()

# TODO: presumably copy the result from dest_gpu back into dest and stop the timer
...

# TODO: print the GPU processing time and the last ten items of the GPU output
...

# release the GPU buffers
a_gpu.free()
b_gpu.free()
c_gpu.free()
dest_gpu.free()

# Solution

In [2]:
import time
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.autoinit import context
from pycuda.compiler import SourceModule
import numpy
import random

from pycuda.compiler import SourceModule
# CUDA kernel: one thread per item, Output[i] = A[i] * C[i] - B[i].
# The launch grid is rounded up to whole blocks, so it can contain more threads
# than there are items. The kernel therefore receives the item count `n` and
# threads whose index is past the end of the arrays do nothing.
mod = SourceModule("""
__global__ void gpu_processing(int *Output, const int *A, const int *B, const int *C, int n)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard the tail of the last block against out-of-bounds reads and writes.
  if (index < n)
  {
    Output[index] = A[index] * C[index] - B[index];
  }
}
""")

def cpu_processing(A, B, C):
  """Compute Output = A * C - B item by item on the CPU.

  A, B and C are 1D arrays of equal length. Returns a new int32 numpy array
  with the same number of items as A. The explicit Python loop is kept on
  purpose: it is the CPU baseline that the GPU kernel is timed against.
  """
  # Size the result from the input, not from a global, so the function works
  # for any array length and does not depend on definition order.
  Output = numpy.zeros(len(A), dtype=numpy.int32)
  for i in range(len(A)):
    Output[i] = A[i]*C[i]-B[i]
  return Output

# number of items in each 1D array
numElements = int(1e6)

gpu_processing = mod.get_function("gpu_processing")

a = numpy.zeros(numElements, dtype=numpy.int32)
b = numpy.zeros(numElements, dtype=numpy.int32)
c = numpy.zeros(numElements, dtype=numpy.int32)

for i in range(0, numElements):
  a[i] = i+1
  b[i] = i+1
  # random integer 0..99 (same range the truncated random.random()*100 produced)
  c[i] = random.randrange(100)

# --- CPU run ---
start = time.time()
output = cpu_processing(a, b, c)
stop = time.time()
print("\nCPU Processing time: {0} s".format(round(stop-start, 3)))
print("CPU output: "+str(output[-10:]))

# --- GPU run ---
# host array that receives the GPU result
dest = numpy.empty_like(a)

a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)
dest_gpu = cuda.mem_alloc(dest.nbytes)

cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)
cuda.memcpy_htod(c_gpu, c)

blockSize = 256
# ceil(numElements / blockSize) in integer arithmetic: enough blocks to cover
# every item without a spare block when numElements is a multiple of blockSize
gridDim = (numElements + blockSize - 1) // blockSize
print("Grid dimension: "+str(gridDim))

# The measured time covers the kernel and the copy of the result back to the
# host; the host-to-device copies above are not included.
start = time.time()

# Scalar kernel arguments must be numpy-typed so PyCUDA knows their size.
gpu_processing(dest_gpu, a_gpu, b_gpu, c_gpu, numpy.int32(numElements),
               block=(blockSize,1,1), grid=(gridDim,1,1))
context.synchronize()

cuda.memcpy_dtoh(dest, dest_gpu)
stop = time.time()
print("\nGPU Processing time: {0} s".format(round(stop-start, 3)))
print("GPU output: "+str(dest[-10:]))

a_gpu.free()
b_gpu.free()
c_gpu.free()
dest_gpu.free()

# Sanity check: both implementations must produce exactly the same result.
assert numpy.array_equal(output, dest), "CPU and GPU results differ"


CPU Processing time: 0.757 s
CPU output: [31999712 53999568 58999587 22999862 68999655 36999852 53999838 54999890
 14999985 35000000]
Grid dimension: 3907

GPU Processing time: 0.002 s
GPU output: [31999712 53999568 58999587 22999862 68999655 36999852 53999838 54999890
 14999985 35000000]
