__Tutorial for CUDA programming using Python__
==================================================

## Saxpy 

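SAXPY stands for “Single-Precision A·X Plus Y”: it computes $y \leftarrow a \cdot x + y$ element-wise, where $a$ is a scalar and $x$, $y$ are single-precision (float32) vectors.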

In [None]:
# !pip install pycuda # install PyCUDA (the Python bindings for CUDA)

In [None]:
# !nvidia-smi

In [None]:
import os
import time
import numpy as np

# --- PyCUDA initialization
import pycuda
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [None]:
def nextpow2(x):
    """Return the smallest power of two that is >= ``x``.

    ``x`` is expected to be a non-negative integer; ``0`` maps to ``1``.
    """
    if x == 0:
        return 1
    # (x - 1).bit_length() is the exponent of the next power of two >= x.
    return 1 << (x - 1).bit_length()

def getThreadsAndBlocks(kerId, num, maxBlocks, maxThreads):
    """Choose a 1-D launch configuration for ``num`` elements.

    Parameters
    ----------
    kerId : int
        Kernel variant selector. Values below 3 size the grid as
        ``ceil(num / threads)``; other values divide the work by
        ``2 * threads``. Variant 6 additionally caps the block count at
        ``maxBlocks``.
    num : int
        Number of elements to process.
    maxBlocks : int
        Upper bound on the block count; only applied when ``kerId == 6``.
    maxThreads : int
        Upper bound on the number of threads per block.

    Returns
    -------
    tuple
        ``(threads, blocks)``.
    """
    fits_in_one_block = num < maxThreads

    if kerId < 3:
        threads = nextpow2(num) if fits_in_one_block else maxThreads
        # Ceiling division done in integer arithmetic: float true division
        # followed by int() stops being exact once the numerator exceeds 2**53.
        blocks = int((num + threads - 1) // threads)
    else:
        threads = nextpow2(int((num + 1) // 2)) if fits_in_one_block else maxThreads
        # NOTE(review): the rounding term is 2*2*threads - 1 rather than
        # 2*threads - 1, so this is one block more than ceil(num / (2*threads)).
        # Kept as-is to preserve existing results -- confirm this is intended.
        blocks = int((num + 2 * 2 * threads - 1) // (2 * threads))

    if kerId == 6:
        blocks = min(blocks, maxBlocks)

    return threads, blocks

In [None]:
# CUDA C source of the SAXPY kernel, kept as a string for runtime compilation.
# Each thread starts at its global index (blockIdx.x*blockDim.x + threadIdx.x)
# and advances by the total number of launched threads (blockDim.x*gridDim.x)
# while the index is below NUM (a grid-stride loop), so any 1-D launch
# configuration covers all NUM elements.
# y is updated in place: y[i] = a * x[i] + y[i]; x is only read.
src_saxpy = """

__global__ void saxpy(float* x, float a, float* y, int NUM)
{
    unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
    while(tid < NUM)
    {
        y[tid] = a * x[tid] + y[tid];
        /// 
        tid += blockDim.x*gridDim.x;
    }    
}
"""

In [None]:
# Optional CUDA events for device-side timing (currently disabled):
# e_start = pycuda.driver.Event()
# e_stop = pycuda.driver.Event()

# Problem size: number of elements in each vector.
N = 1 << 27

# Launch configuration for kernel variant 1, with 64 as both the block-count
# and the threads-per-block limit.
threads, blocks = getThreadsAndBlocks(kerId=1, num=N, maxBlocks=64, maxThreads=64)

# Shared memory size (8 bytes per thread, doubled for blocks of at most 32
# threads). NOTE(review): the kernel launch visible in this notebook does not
# pass this value -- confirm whether it is still needed.
if threads <= 32:
    smems = 2 * threads * 8
else:
    smems = threads * 8

print(threads, blocks, smems)

In [None]:
# Host (CPU) input vectors: N float32 samples drawn uniformly between -1 and 1.
h_x = np.random.uniform(low=-1, high=1, size=N).astype(np.float32)
h_y = np.random.uniform(low=-1, high=1, size=N).astype(np.float32)

# Device (GPU) copies of both vectors.
d_x = gpuarray.to_gpu(h_x)
d_y = gpuarray.to_gpu(h_y)

# Scalar multiplier used by SAXPY.
a = 2.0

In [None]:
# Compile the CUDA source at runtime and look up the kernel entry point.
print('kernel build')
module = SourceModule(src_saxpy)

# Callable handle for the `saxpy` __global__ function.
dev_saxpy = module.get_function("saxpy")

In [None]:
# Block and grid dimensions (1-D launch).
blockDim  = (threads, 1, 1)
gridDim   = (blocks, 1, 1)

print('saxpy : gpu')

# Kernel launches return to the host before the kernel has finished, so the
# host clock may only be stopped after the device has been synchronized.
# Otherwise only the launch overhead is measured.
pycuda.driver.Context.synchronize()  # drain pending device work before timing
t_start = time.time()
dev_saxpy(d_x, np.float32(a), d_y, np.int32(N), block=blockDim, grid=gridDim)
pycuda.driver.Context.synchronize()  # wait for the kernel to complete
elapsed = time.time() - t_start

# Alternative: CUDA events time the work on the device itself.
# e_start.record()
# ... kernel launch ...
# e_stop.record()
# e_stop.synchronize()
# elapsed_ms = e_start.time_till(e_stop)

print("Processing time = {:f}".format(elapsed))


In [None]:
# CPU reference: the same a*x + y update computed with NumPy on the host.
print('saxpy : cpu')

t_start = time.time()
# NOTE: h_y is rebound to the result, so re-running this cell applies the
# update again on top of the previous output.
h_y = np.add(np.multiply(h_x, a), h_y)
elapsed = time.time() - t_start

print("Processing time = {:f}".format(elapsed))