# License

    Jupyter notebook for accessing CUDA
    Copyright (C) 2018 Andre.Brodtkorb@ifi.uio.no, changed in October by André Brodtkorb

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
#Lets have matplotlib "inline"
%matplotlib inline

#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
from pycuda.gpuarray import GPUArray
import pycuda.driver as cuda_driver

from matplotlib import pyplot as plt

import IPythonMagic

In [2]:
import pytest
from ipytest import run_pytest, clean_tests

In [3]:
from Timer import Timer
import logging

In [4]:
%setup_logging
%cuda_context_handler contex

Python version 3.6.6 (default, Sep 12 2018, 18:26:19) 
[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]
Registering contex in user workspace
Creating context
PyCUDA version 2018.1.1
CUDA version (9, 1, 0)
Driver version 10000
Using 'Tesla K80' GPU
 => compute capability: (3, 7)
 => memory: 11130 / 11441 MB available
Created context handle <48571552>
Using CUDA cache dir /home/ubuntu/jupyter_notebooks/Borroni_Ale/MilanoGPU2018/notebooks/cuda_cache


In [25]:
cuda_kernel = """
__global__ void matrixVectorKernel(float* c, float* A, float* b, int a_rows, int a_cols) {
    unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
    
    //Out of bounds check
    if (j > a_rows) {
        return;
    }
    
    //Compute inner product of row of A with column of B
    float sum = 0.0f;
    for (int i=0; i<a_cols; ++i) {
        unsigned int k = j*a_cols + i;
        sum += A[k] * b[i];
    }
    
    //Write to global memory
    c[j] = sum;
}
"""
module = cuda_compiler.SourceModule(cuda_kernel)
kernel = module.get_function("matrixVectorKernel");

In [5]:
def gpuMatrixVector(a, b):
    #Upload data to the device
    #NOTE: We need to make sure that a=(a_rows, a_columns)
    # and that b=(a_colmuns, 1) (column vector)
    # and that c=(a_rows, 1)
    with Timer("Data allocation") as t:
        a_g = GPUArray(a.shape, np.float32)
        b_g = GPUArray(b.shape, np.float32)
        #Allocate output data
        c_g = GPUArray(a.shape[0], np.float32)
    with Timer("A upload") as t:
        a_g.set(a)
    with Timer("b upload") as t:
        b_g.set(b)
    
    
    
    #NOTE: We need to change this so that the grid*block is x = 1, y = number of rows in A
    block_size = (1, 1, 1) #These need to be [x, y, z]
    grid_size = (int(np.ceil(a.shape[0] / blocksize[0]), 1, 1)

    print("Block size is " + str(block_size))
    print("Grid size is " + str(grid_size)) 
    
    with Timer("Kernel execution") as t:
        #Execute program on device
        kernel(c_g, a_g, b_g, np.int32(a.shape[0]), np.int32(a.shape[1]), block=block_size, grid=grid_size)

    #Copy data from device to host
    c = np.empty((a.shape[0],1), dtype = np.float32)
    c_g.get(c)

    #Return our computer matrix-vector product
    return c
    

Exception caught: Resetting to CUDA context contex
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2988, in run_cell_async
    code_ast = compiler.ast_parse(cell, filename=cell_name)
  File "/home/ubuntu/.local/lib/python3.6/site-packages/IPython/core/compilerop.py", line 100, in ast_parse
    return compile(source, filename, symbol, self.flags | PyCF_ONLY_AST, 1)
  File "<ipython-input-5-5d413d43dae8>", line 22
    print("Block size is " + str(block_size))
        ^
SyntaxError: invalid syntax
Popping <48571552>
Pushing <48571552>


[0;31m---------------------------------------------------------------------------[0m
[0;31mSyntaxError[0m                               Traceback (most recent call last)
[0;32m~/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py[0m in [0;36mrun_cell_async[0;34m(self, raw_cell, store_history, silent, shell_futures)[0m
[1;32m   2987[0m                     [0;32melse[0m[0;34m:[0m[0;34m[0m[0m
[0;32m-> 2988[0;31m                         [0mcode_ast[0m [0;34m=[0m [0mcompiler[0m[0;34m.[0m[0mast_parse[0m[0;34m([0m[0mcell[0m[0;34m,[0m [0mfilename[0m[0;34m=[0m[0mcell_name[0m[0;34m)[0m[0;34m[0m[0m
[0m[1;32m   2989[0m                 [0;32mexcept[0m [0mself[0m[0;34m.[0m[0mcustom_exceptions[0m [0;32mas[0m [0me[0m[0;34m:[0m[0;34m[0m[0m

[0;32m~/.local/lib/python3.6/site-packages/IPython/core/compilerop.py[0m in [0;36mast_parse[0;34m(self, source, filename, symbol)[0m
[1;32m     99[0m         and are passed to 

Custom TB Handler failed, unregistering


In [6]:
#Size of our test
test_size = (2048, 2048)

#Create test input / output data
a = np.random.random(test_size).astype(np.float32)
b = np.random.random((test_size[1], 1)).astype(np.float32)
c = gpuMatrixVector(a, b)

fig = plt.figure()
plt.subplot(1,3,1)
plt.imshow(a)
plt.subplot(1,3,2)
plt.imshow(b)
plt.subplot(1,3,3)
plt.imshow(c)
fig.show()

NameError: name 'gpuMatrixVector' is not defined

In [28]:
#Compute reference using Numpy
c_ref = np.dot(a, b)

#Sum of absolute differences
sad = np.sum(np.abs(c - c_ref))

#Print result
# print("C   = ", c)
# print("Ref = ", c_ref)
print("Sad = %.30f" % sad)
print("Per element error: " + str(sad / test_size[1]))

Sad = 0.000000059604644775390625000000
Per element error: 1.9868214925130207e-08


In [31]:
clean_tests()

def test_gpuMatrixVector():
    #Let us test a matrix of size 1x1
    a = np.ones((1,1), dtype=np.float32)
    b = 2*np.ones((1,1), dtype=np.float32)
    c = gpuMatrixVector(a, b)
    assert c == pytest.approx(2.0)
    
    #Let us test the inner product works
    a = np.ones((1,2), dtype=np.float32)
    b = 2*np.ones((2,1), dtype=np.float32)
    c = gpuMatrixVector(a, b)
    assert c == pytest.approx(4.0)
    
    
    #Test a general matrix
    test_size = (4, 3)
    a = np.random.random(test_size).astype(np.float32)
    b = np.random.random((test_size[1], 1)).astype(np.float32)
    c = gpuMatrixVector(a, b)
    assert c == pytest.approx(a.dot(b), rel=1e-3)
    
run_pytest(filename='MatrixVectorTesting.ipynb', pytest_options=['-vvv'])    

platform linux -- Python 3.6.6, pytest-3.8.2, py-1.6.0, pluggy-0.7.1 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /home/ubuntu/jupyter_notebooks/Borroni_Ale/MilanoGPU2018/notebooks, inifile:
collecting ... collected 1 item

MatrixVectorTesting.py::test_gpuMatrixVector <- <ipython-input-31-2e0d19e86ccf> PASSED [100%]




0