
__Tutorial for CUDA programming using Python__
==================================================

## __Sparse matrix__

* #### __csr_matrix: Compressed Sparse Row format__
* #### csc_matrix: Compressed Sparse Column format
* #### bsr_matrix: Block Sparse Row format
* #### lil_matrix: List of Lists format
* #### dok_matrix: Dictionary of Keys format
* #### coo_matrix: COOrdinate format (aka IJV, triplet format)
* #### dia_matrix: DIAgonal format
* #### __(c.f.) ell_matrix: ELLPACK format__


In [None]:
# !pip install pycuda  # install PyCUDA

In [None]:
import os
import numpy as np
from time import time

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import cg as sparse_cg

### __load predefined matrix__

In [None]:
# # download matrix
# !wget https://www.dropbox.com/s/7r2v6qq02ru8vpb/rectangle_032_csr_f64.matv?dl=0 -O rectangle_032_csr_f64.matv
# !wget https://www.dropbox.com/s/yyeftikqpye4ksz/rectangle_032_csr_i32.mati?dl=0 -O rectangle_032_csr_i32.mati 
# !wget https://www.dropbox.com/s/iw7fmjnk346jya6/rectangle_032_csr_i32.matp?dl=0 -O rectangle_032_csr_i32.matp
# !wget https://www.dropbox.com/s/vz0br0bxt04yyyt/rectangle_032_f64.sysr?dl=0 -O rectangle_032_f64.sysr    

In [None]:
# !wget https://www.dropbox.com/s/e0vxgccs1kjumc3/rectangle_256_csr_f64.matv?dl=0 -O rectangle_256_csr_f64.matv
# !wget https://www.dropbox.com/s/17x5k3mfx14crye/rectangle_256_csr_i32.mati?dl=0 -O rectangle_256_csr_i32.mati 
# !wget https://www.dropbox.com/s/c7sgo4nm10z8o1v/rectangle_256_csr_i32.matp?dl=0 -O rectangle_256_csr_i32.matp
# !wget https://www.dropbox.com/s/aa8re8jlpky8cym/rectangle_256_f64.sysr?dl=0 -O rectangle_256_f64.sysr

In [None]:
# Resolution tag embedded (zero-padded to three digits) in the data file names.
resolution = 256

# File names of the raw binary dumps for the chosen resolution.
matv_path = 'rectangle_{:03d}_csr_f64.matv'.format(resolution)
indices_path = 'rectangle_{:03d}_csr_i32.mati'.format(resolution)
indptr_path = 'rectangle_{:03d}_csr_i32.matp'.format(resolution)
b_path = 'rectangle_{:03d}_f64.sysr'.format(resolution)

# The three CSR arrays (values, column indices, row pointers) and the vector b.
matv = np.fromfile(matv_path, dtype=np.float64)
indices = np.fromfile(indices_path, dtype=np.int32)
indptr = np.fromfile(indptr_path, dtype=np.int32)
b = np.fromfile(b_path, dtype=np.float64)

In [None]:
# The row-pointer array holds one more entry than the matrix has rows.
num = indptr.shape[0] - 1
print(num)

# Assemble the square sparse matrix from its CSR arrays.
csr_arrays = (matv, indices, indptr)
A = csr_matrix(csr_arrays, shape=(num, num))

In [None]:
# --- PyCUDA initialization
import pycuda
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [None]:
# CUDA C source of the `csr_dot` kernel, kept as a string for compilation.
# The kernel computes y = A*x for a matrix supplied as CSR arrays:
#   matv    - stored values
#   indices - for each stored value, the index into x it is multiplied with
#   indptr  - row offsets; row r covers entries indptr[r] .. indptr[r+1]-1
#   NUM     - number of rows handled
# Each thread starts at row threadIdx.x + blockIdx.x*blockDim.x, accumulates
# that row's products into apVal, writes the sum to y[row], and then advances
# by blockDim.x*gridDim.x until its row index reaches NUM.
src_mvker = """

__global__ void csr_dot(double* matv, int* indices, int* indptr, double *x, double *y, int NUM)
{     
    int k;
    int strIdx;
    int endIdx;
    int tid = threadIdx.x+blockIdx.x*blockDim.x;

    double apVal;
    while(tid<NUM)
    {
        apVal = 0.0;
        strIdx = indptr[tid];
        endIdx = indptr[tid+1];

        for(k=strIdx ; k<endIdx; k++)
            apVal += matv[k]*x[indices[k]];
        
        y[tid] = apVal;
        
         /// passing
        tid += blockDim.x*gridDim.x;
    }
}
"""

In [None]:
def nextpow2(x):
    """Return the smallest power of two that is >= ``x`` for a non-negative int.

    ``nextpow2(0)`` is defined as 1.
    """
    if x == 0:
        return 1
    # The bit length of (x - 1) is the exponent of the power of two at or above x.
    return 1 << (x - 1).bit_length()

def getThreadsAndBlocks(kerId, num, maxBlocks, maxThreads):
    """Choose a launch configuration for processing ``num`` items.

    Args:
        kerId: Kernel variant selector. For values below 3 the grid is sized
            so that ``threads * blocks`` covers ``num``; for other values the
            grid is sized in units of ``2 * threads`` items per block.
            Variant 6 additionally caps the block count at ``maxBlocks``.
        num: Number of items to process (a non-negative integer).
        maxBlocks: Upper bound on the block count; applied only when
            ``kerId == 6``.
        maxThreads: Upper bound on the number of threads per block.

    Returns:
        Tuple ``(threads, blocks)``.
    """
    if kerId < 3:
        threads = nextpow2(num) if num < maxThreads else maxThreads
        # Integer ceiling division: stays exact for arbitrarily large counts,
        # unlike float division followed by int().
        blocks = (num + threads - 1) // threads
    else:
        threads = nextpow2((num + 1) // 2) if num < maxThreads else maxThreads
        # NOTE(review): a plain ceiling of num / (2*threads) would add
        # 2*threads - 1; this adds 2*2*threads - 1 and so rounds up further.
        # Kept as-is to preserve the existing launch sizes -- confirm intent.
        blocks = (num + 2*2*threads - 1) // (2*threads)

    if kerId == 6:
        blocks = min(blocks, maxBlocks)

    return threads, blocks

### __cuda matrix-vector multiplication using pycuda__

In [None]:
# Launch configuration: threads per block, number of blocks, and the size in
# bytes of the dynamic shared memory requested for every launch.
threads, blocks = getThreadsAndBlocks(6, num, 64, 512)
smems = 2*threads*8 if threads <= 32 else threads*8
print(threads, blocks, smems)

print('kernel build')
module = SourceModule(source=src_mvker)

# Copy the CSR arrays and the input vector to the device; y starts at zero.
dev_matv = gpuarray.to_gpu(matv)
dev_indices = gpuarray.to_gpu(indices)
dev_indptr = gpuarray.to_gpu(indptr)
dev_x = gpuarray.to_gpu(b)
dev_y = gpuarray.to_gpu(np.zeros_like(b))

# Handle to the compiled kernel.
dev_csr_dot = module.get_function("csr_dot")

# Block and grid dimensions.
blockDim = (threads, 1, 1)
gridDim = (blocks, 1, 1)

print('pycuda matrix vector multiplication')

# Untimed first launch, completed before the clock starts.
dev_csr_dot(dev_matv, dev_indices, dev_indptr, dev_x, dev_y, np.int32(num), block=blockDim, grid=gridDim, shared=smems)
cuda.Context.synchronize()

# `time` is the function imported via `from time import time`, so it is
# called directly; `time.time()` would raise AttributeError.
nIter = 32
t_start = time()
for _ in range(nIter):
    dev_csr_dot(dev_matv, dev_indices, dev_indptr, dev_x, dev_y, np.int32(num), block=blockDim, grid=gridDim, shared=smems)

# Kernel launches return before the GPU has finished; wait for completion so
# the elapsed time covers the actual execution and not just the launches.
cuda.Context.synchronize()
elapsed = time() - t_start

print("Processing time = {:f}".format(elapsed/nIter))


### __matrix-vector multiplication using scipy (cpu)__

In [None]:
# Build the CSR matrix on the host and take a private copy of the input vector.
A = csr_matrix((matv, indices, indptr), shape=(num, num))
x = np.copy(b)

# Untimed first product.
y = A.dot(b)

# `time` is the function imported via `from time import time`, so it is
# called directly; `time.time()` would raise AttributeError.
nIter = 32
t_start = time()
for _ in range(nIter):
    y = A.dot(x)

elapsed = time() - t_start
print('process time cpu = {:f}'.format(elapsed/nIter))


### __cuda matrix-vector multiplication using cupy__

In [None]:
import cupy as cp
from cupyx.scipy.sparse import csr_matrix as device_csr

In [None]:
# Move the sparse matrix and the input vector to the GPU as CuPy objects.
# This rebinds dev_x / dev_y, which until now referred to PyCUDA arrays.
dev_A = device_csr(A)
dev_x = cp.array(b)

# Untimed first product, completed before the clock starts.
dev_y = dev_A.dot(dev_x)
cp.cuda.Stream.null.synchronize()

nIter = 32
t_start = time()
for _ in range(nIter):
    dev_y = dev_A.dot(dev_x)

# GPU work is queued asynchronously; wait for it to finish so the elapsed
# time covers the computation and not just the time to enqueue it.
cp.cuda.Stream.null.synchronize()
elapsed = time() - t_start
print('process time gpu = {:f}'.format(elapsed/nIter))

### __comparison__

In [None]:
# Copy the device result back to the host and subtract it from the CPU result.
host_dev_y = dev_y.get()
diff = y - host_dev_y
print(diff)

In [None]:
# Size of the CPU/GPU discrepancy: norm of `diff` with NumPy's default settings.
np.linalg.norm(diff)