# cupy cuda learning

In [1]:
import numpy as np
import cupy as cp
import cv2
import string
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import  animation

In [2]:
def load_kernel(src,kname):
    _ = cp.zeros((1,)) # 初始化cuda context
    kernel = cp.core.core.compile_with_cache(src).get_function(kname)
    return kernel

In [3]:
source = string.Template("""
extern "C" {
    __global__  void vector_add_int(int * a,int *b,int * c){
        int tid = blockIdx.x;
        c[tid] = a[tid] + b[tid];
    }
}
""").substitute()

In [4]:
vector_add = load_kernel(source,'vector_add_int')

In [5]:
a = cp.arange(101).astype('int32')
b = cp.arange(101).astype('int32')
c = cp.zeros((101,),dtype=cp.int32)

In [6]:
vector_add(grid=(101,1),block=(1,),args = (a,b,c))

In [7]:
print(c.get())

[  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30  32  34
  36  38  40  42  44  46  48  50  52  54  56  58  60  62  64  66  68  70
  72  74  76  78  80  82  84  86  88  90  92  94  96  98 100 102 104 106
 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142
 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178
 180 182 184 186 188 190 192 194 196 198 200]


In [8]:
N = 2 * 4
threadsPerBlock = 256
blockPerGrid = max(32, (N + threadsPerBlock - 1) // threadsPerBlock)

In [9]:
dot_source = string.Template("""
extern "C"{

__global__ void dot_float(float * a,float * b,float *c){

    //对于GPU上启动的每个线程块，CUDA C编译器都将创建该共享变量的一个副本。
    //线程块中的每个线程都共享这块内存
    __shared__ float cache[${threadsPerBlock}];
    
    //总索引
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    
    int cacheIndex = threadIdx.x;
    float temp = 0;
    
    while(tid < ${N}){
        temp += a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
    
    cache[cacheIndex] = temp;
    
    //保证线程块中的线程都执行完__synthreads()之前的语句
    __syncthreads();
    
    int i = blockDim.x / 2;
    while(i != 0){
        if (cacheIndex < i){
            cache[cacheIndex] += cache[cacheIndex + i];
            __syncthreads();
            i /= 2;
        }
    }
    
    if(cacheIndex == 0){
        //将每个block内的线程之和保存到c中
        c[blockIdx.x] = cache[0];
    }
}

}
""").substitute(threadsPerBlock=threadsPerBlock,N=N)

In [10]:
a = cp.arange(N,dtype=cp.float32)
b = cp.arange(N,dtype=cp.float32) * 2
print(blockPerGrid)
c = cp.zeros((N,),dtype=cp.float32)
print(c.sum().get())

32
0.0


In [11]:
dot_kernel = load_kernel(dot_source,'dot_float')

In [12]:
dot_kernel(grid=(N, 1),block=(1,1),args=(a,b,c))

In [13]:
print(c.sum().get())

84.0


In [14]:
print(cp.dot(a,b).get())

280.0
