In [1]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 4.8MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 10.3MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 11.9MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) ..

# Hold connection
While you are not actively using the notebook, run the infinite loop below to keep the session alive. If you want to execute other code, interrupt this cell first.

In [None]:
import time

# Keep-alive cell: sleep in 10-second steps without end so the session stays
# busy. Interrupt the cell manually before running anything else.
while True:
    time.sleep(10)

# Exercise
Create two GPU kernels that use atomic functions and follow the logic of the given CPU code.

The first GPU kernel will use shared memory to store the temporary max and min values.

The second GPU kernel is a version that does not use shared memory.

Compare the computation time and the results (min and max values) of all algorithms.






In [None]:
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import numpy
import time
import math

# Exercise skeleton: the CPU reference below is complete, while every `...`
# marks a part that still has to be written (kernel source, device buffers,
# kernel launches, timing/printing and clean-up).
cuda.init()
dev = cuda.Device(0)
ctx = dev.make_context()

numElements = int(1e6)

BLOCK_SIZE = 256
# Number of blocks to launch; the "+ 1" adds a block for the partial tail.
gridDim = int(numElements/BLOCK_SIZE + 1)
print("Grid dimension: "+str(gridDim))

# TODO: replace the Ellipsis with a string holding the CUDA C source of the
# two kernels that are looked up by name further down ("SharedMem" and
# "WithoutSharedMem"). While this is still `...`, the .replace() call below
# raises AttributeError because Ellipsis has no such method.
kernel_code = ...

# Textual substitution: every occurrence of the token BLOCK_SIZE inside the
# kernel source is replaced by the numeric block size before compilation.
kernel_code = kernel_code.replace("BLOCK_SIZE", str(BLOCK_SIZE))

# Create data
# NOTE(review): `input` (and `max` / `min` below) shadow Python builtins for
# the rest of the session.
input = numpy.random.random(numElements).astype(numpy.float32)

# CPU version
# Reference implementation: convert each sample with arccos, scale to degrees
# using 3.14 as the approximation of pi, truncate to int and track extremes.
start = time.time()
max = -1000
min = 1000

for i in range(numElements):
    val = int(numpy.arccos(input[i])*180/ 3.14)

    if val > max:
      max = val
    if val < min:
      min = val

stop = time.time()
print("\nCPU Processing time: {0} s".format(round(stop-start, 3)))
print("CPU max value: "+str(max))
print("CPU min value: "+str(min))

# Compiles the kernel source; requires kernel_code to be a string by now.
mod = SourceModule(kernel_code)

# with shared memory
SharedMem = mod.get_function("SharedMem")

# TODO: allocate/initialise device buffers, launch SharedMem, time it and
# print the resulting min/max.
...


# without shared memory
WithoutSharedMem = mod.get_function("WithoutSharedMem")

# TODO: reset the result buffers, launch WithoutSharedMem, time it and print
# the resulting min/max.
...

# clean allocated memory
# TODO: free every device allocation made above before the context is popped.
...
ctx.pop()

Grid dimension: 3907

CPU Processing time: 4.792 s
CPU max value: 90
CPU min value: 0

GPU Processing time with shared memory: 0.0006 s
GPU max value: 90
GPU min value: 0

GPU Processing time without shared memory: 0.0376 s
GPU max value: 90
GPU min value: 0


# Solution

In [None]:
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import numpy
import time
import math

# Reference solution: find the min and max of int(acos(x) * 180 / 3.14) over
# one million random samples, once on the CPU and twice on the GPU (atomics
# with a per-block shared-memory stage, and atomics straight on global memory).
cuda.init()
dev = cuda.Device(0)

numElements = int(1e6)

BLOCK_SIZE = 256
# Ceiling division: just enough blocks to cover every element, without a
# spare empty block when numElements is a multiple of the block size.
gridDim = (numElements + BLOCK_SIZE - 1) // BLOCK_SIZE
print("Grid dimension: "+str(gridDim))

# Neutral seeds for the reductions: any real value is <= INT32_MAX and
# >= INT32_MIN, so the first atomicMin/atomicMax always replaces the seed.
INT32_MAX = int(numpy.iinfo(numpy.int32).max)
INT32_MIN = int(numpy.iinfo(numpy.int32).min)

kernel_code = """
// Radians -> degrees. 3.14 is used on purpose so that the GPU follows the
// same formula as the CPU reference loop.
__device__ float toDeg = 180/3.14;

// Per-block reduction in shared memory followed by one global atomic per
// block. The caller must seed *min with INT_MAX and *max with INT_MIN.
__global__ void SharedMem(const float *input, int *min, int *max, const int N)
{
    __shared__ int min_shared;  // block-local running minimum
    __shared__ int max_shared;  // block-local running maximum

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // A single thread seeds the block-local extremes with neutral values.
    if(threadIdx.x == 0){
        min_shared = 2147483647;        // INT_MAX
        max_shared = -2147483647 - 1;   // INT_MIN
    }
    // Barriers are kept outside the bounds check so that every thread of the
    // block reaches them, including the out-of-range threads of the last block.
    __syncthreads();

    if(idx < N){
        int val = acosf(input[idx])*toDeg;

        atomicMin(&min_shared, val);
        atomicMax(&max_shared, val);
    }
    __syncthreads();

    // Commit to global memory: one atomic pair per block. A block without any
    // valid element commits the neutral seeds, which changes nothing.
    if(threadIdx.x == 0){
        atomicMin(min, min_shared);
        atomicMax(max, max_shared);
    }
}

// Every thread updates the global extremes directly. The caller must seed
// *min with INT_MAX and *max with INT_MIN.
__global__ void WithoutSharedMem(const float *input, int *min, int *max, const int N)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < N){
      int val = acosf(input[idx]) * toDeg;
      atomicMin(min, val);
      atomicMax(max, val);
    }
}
"""
# Textual substitution of the block-size token in the kernel source (the
# kernels above do not reference it, so this currently changes nothing).
kernel_code = kernel_code.replace("BLOCK_SIZE", str(BLOCK_SIZE))

# Create data (names chosen so that the builtins input/min/max stay usable)
input_host = numpy.random.random(numElements).astype(numpy.float32)

# CPU version
start = time.time()
cpu_max = -1000
cpu_min = 1000

for i in range(numElements):
    val = int(numpy.arccos(input_host[i])*180/ 3.14)

    if val > cpu_max:
      cpu_max = val
    if val < cpu_min:
      cpu_min = val

stop = time.time()
print("\nCPU Processing time: {0} s".format(round(stop-start, 3)))
print("CPU max value: "+str(cpu_max))
print("CPU min value: "+str(cpu_min))

# GPU versions. The context is created right before it is needed and is
# always popped again, even if compilation or a launch fails.
ctx = dev.make_context()
gpu_buffers = []
try:
    mod = SourceModule(kernel_code)
    # Look both kernels up front so the lookup is not part of any timing.
    SharedMem = mod.get_function("SharedMem")
    WithoutSharedMem = mod.get_function("WithoutSharedMem")

    # Host-side seeds (int32, matching the `int *` kernel parameters) and
    # host buffers that receive the results.
    min_seed = numpy.array([INT32_MAX], dtype=numpy.int32)
    max_seed = numpy.array([INT32_MIN], dtype=numpy.int32)
    min_out = numpy.empty_like(min_seed)
    max_out = numpy.empty_like(max_seed)

    input_gpu = cuda.mem_alloc(input_host.nbytes)
    gpu_buffers.append(input_gpu)
    min_out_gpu = cuda.mem_alloc(min_out.nbytes)
    gpu_buffers.append(min_out_gpu)
    max_out_gpu = cuda.mem_alloc(max_out.nbytes)
    gpu_buffers.append(max_out_gpu)

    cuda.memcpy_htod(input_gpu, input_host)

    # with shared memory
    # mem_alloc returns uninitialised memory, so the accumulators must be
    # seeded before the launch.
    cuda.memcpy_htod(min_out_gpu, min_seed)
    cuda.memcpy_htod(max_out_gpu, max_seed)

    start = time.time()

    # The kernel declares its shared memory statically, so no dynamic
    # shared-memory size is requested here.
    SharedMem(input_gpu, min_out_gpu, max_out_gpu, numpy.int32(numElements), block=(BLOCK_SIZE,1,1), grid=(gridDim,1,1))
    ctx.synchronize()

    cuda.memcpy_dtoh(min_out, min_out_gpu)
    cuda.memcpy_dtoh(max_out, max_out_gpu)

    stop = time.time()
    print("\nGPU Processing time with shared memory: {0} s".format(round(stop-start, 4)))
    print("GPU max value: "+str(max_out[0]))
    print("GPU min value: "+str(min_out[0]))

    # without shared memory
    # Re-seed the accumulators so this run does not inherit the previous result.
    cuda.memcpy_htod(min_out_gpu, min_seed)
    cuda.memcpy_htod(max_out_gpu, max_seed)

    start = time.time()

    WithoutSharedMem(input_gpu, min_out_gpu, max_out_gpu, numpy.int32(numElements), block=(BLOCK_SIZE,1,1), grid=(gridDim,1,1))
    ctx.synchronize()

    cuda.memcpy_dtoh(min_out, min_out_gpu)
    cuda.memcpy_dtoh(max_out, max_out_gpu)

    stop = time.time()
    print("\nGPU Processing time without shared memory: {0} s".format(round(stop-start, 4)))
    print("GPU max value: "+str(max_out[0]))
    print("GPU min value: "+str(min_out[0]))
finally:
    # clean allocated memory (only what was actually allocated), then release
    # the context from this thread's context stack
    for gpu_buffer in gpu_buffers:
        gpu_buffer.free()
    ctx.pop()

Grid dimension: 3907

CPU Processing time: 4.442 s
CPU max value: 90
CPU min value: 0

GPU Processing time with shared memory: 0.0004 s
GPU max value: 90
GPU min value: 0

GPU Processing time without shared memory: 0.0296 s
GPU max value: 90
GPU min value: 0
