# CUDA编程基础-共享内存与线程同步

In [1]:
import pycuda.autoinit
import pycuda.driver as cu
import numpy as np
from pycuda.compiler import SourceModule
import string
import cv2
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def load_kernel_from_file(fname, kname):
    """Compile the CUDA source stored in a file and return one kernel from it.

    Args:
        fname: Path of a UTF-8 encoded text file holding the CUDA source.
        kname: Name of the kernel function to look up in the compiled module.

    Returns:
        Whatever ``SourceModule.get_function`` returns for ``kname``.
    """
    with open(fname, encoding='utf-8') as src_file:
        cuda_text = str(src_file.read())
    # The whole text is in memory now, so compilation no longer needs the file.
    return SourceModule(cuda_text).get_function(kname)

def load_kernel_from_string(sources, kname):
    """Compile CUDA source given as text and look up one kernel in it.

    Args:
        sources: The CUDA source; it is passed through ``str()`` before compiling.
        kname: Name of the kernel function to look up in the compiled module.

    Returns:
        A ``(kernel, module)`` tuple: the result of ``get_function(kname)`` and
        the ``SourceModule`` it was obtained from.
    """
    module = SourceModule(str(sources))
    return module.get_function(kname), module

## 1.使用静态共享内存

In [3]:
# CUDA source for `dot_static`: each block writes one partial dot product of
# a and b into c[blockIdx.x], using a fixed-size __shared__ array as scratch.
#
# Constraints visible in the kernel text:
#   * N and threadPerBlock are compiled in, so the host arrays must hold at
#     least N floats and the launch must not use more than threadPerBlock
#     threads per block (the shared array has exactly that many slots).
#   * The halving loop starts at blockDim.x/2, so the block size has to be a
#     power of two for every slot to be folded into cache[0].
#   * NOTE(review): the in-kernel remark about __syncthreads() inside an `if`
#     should not be read as a guarantee; the CUDA programming guide only
#     permits it when the condition is uniform across the whole block.
#
# The imin macro fully parenthesises its arguments and its expansion so it
# stays correct when called with arbitrary expressions.
#
# The text goes through string.Template.substitute() with no mappings, so a
# literal dollar sign in the kernel would have to be written as `$$`.
sources = string.Template(
"""
#define imin(a,b) ((a)<(b)?(a):(b))

const int N = 33 * 1024;
const int threadPerBlock = 256;
const int blockPerGrid = imin( 32, (N+threadPerBlock-1) / threadPerBlock );

    __global__ void dot_static( float *a, float *b, float *c)
    {
    //共享内存, 每个block都有一份拷贝
    __shared__ float cache[threadPerBlock];
    // thread的索引
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    // 共享内存的索引,每个block都有cache, 故只用threadIdx.x即可
    int cacheIdx = threadIdx.x;

    float temp = 0;
    while(tid<N)
    {
        //当前tid的thread负责把tid,和tid间隔threadIdx总量整数倍的向量做乘-加操作.
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    // 完成求和之后,当前thread把和放在对应的cache中
    cache[cacheIdx] = temp;
    // 在当前block内做同步操作, 等所有thread都完成乘-加运算之后才能做reduction.
    __syncthreads();

    //reduction, 向量缩减.
    //缩减后的结果在cache[0]里.
    int i = blockDim.x/2;
    while (i!=0)
    {
        if (cacheIdx<i)
        {
            cache[cacheIdx] += cache[cacheIdx + i];

        }
        //同步, 等所有thread都完成了当次缩减了才能做下一次的缩减.
        //书上说: 同步不能放在if里面, 否则报错.
        //经过试验没有报错, 结果正确.
        __syncthreads();
        i /= 2;
    }
    // 一个block输出一个值,即cache[0]. 所以c的长度和block数量相同.
    // 限制cacheIdx == 0是为了只做一次赋值操作,节省时间.
    if (cacheIdx == 0)
    {
        c[blockIdx.x] = cache[0];
    }
    // 没有做剩下的累加操作是因为在CPU上做小批量的累加更加有效.
}

"""
).substitute()

In [4]:
# Compile the kernel text held in `sources` and fetch its `dot_static` entry
# point; the compiled module is kept in `sm` alongside the kernel.
dot_static, sm = load_kernel_from_string(kname='dot_static', sources=sources)

In [5]:
# Host-side launch configuration. N and threadPerBlock repeat the constants
# compiled into the dot_static kernel text, so they must be kept in step.
N = 33 * 1024
threadPerBlock = 256
# One-dimensional blocks of threadPerBlock threads.
block = (threadPerBlock, 1, 1)
# Enough blocks to cover N elements (ceiling division), capped at 32.
grid = (min(32, (N + threadPerBlock - 1) // threadPerBlock), 1, 1)

In [6]:
# Host buffers: a and b are the two float32 input vectors of length N.
a = np.zeros(N, dtype=np.float32)
b = np.zeros(N, dtype=np.float32)
# c receives the per-block partial sums; it is allocated with a's shape and dtype.
c = np.zeros_like(a)

In [7]:
# Fill the inputs in place: a[i] = i and b[i] = 2 * i for every index.
# Vectorised assignment replaces the per-element Python loop; the values are
# the same because every i and 2 * i below 2**24 is exactly representable
# in float32.
a[:] = np.arange(N)
b[:] = 2 * a

In [8]:
# Launch dot_static: a and b are wrapped as inputs, c as the output buffer
# that is filled from the device once the kernel has run.
dot_static(
    cu.In(a),
    cu.In(b),
    cu.Out(c),
    block=block,
    grid=grid,
)

In [9]:
# Finish the reduction on the host: add up the one partial sum that each
# launched block stored in c (indices 0 .. grid[0] - 1), in index order.
cc = sum(c[k] for k in range(grid[0]))

## 2.使用动态共享内存

In [10]:
# CUDA source for `reduce0`: each block sums blockDim.x consecutive floats of
# g_idata and writes the total to g_odata[blockIdx.x].
#
# Requirements visible in the kernel text:
#   * sdata is `extern __shared__`, so its size is not fixed here; the launch
#     has to provide at least blockDim.x * sizeof(float) bytes of dynamic
#     shared memory.
#   * g_idata is read without a bounds check, so it must hold at least
#     gridDim.x * blockDim.x elements.
#
# The `tid + s < blockDim.x` guard keeps the pairwise step from reading past
# the block's own elements when the block size is not a power of two (e.g.
# with 400 threads, tid = 384 and s = 16 would otherwise read sdata[400]).
#
# The text goes through string.Template.substitute() with no mappings, so a
# literal dollar sign in the kernel would have to be written as `$$`.
sources = string.Template(
"""
__global__ void reduce0(float *g_idata, float *g_odata) {
extern __shared__ float sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2) {
   if (tid % (2*s) == 0 && tid + s < blockDim.x) {
      sdata[tid] += sdata[tid + s];
   }
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
"""
).substitute()

In [11]:
# Compile the kernel text held in `sources` and fetch its `reduce0` entry
# point; `sm` is rebound to the newly compiled module.
reduce0, sm = load_kernel_from_string(kname='reduce0', sources=sources)

In [None]:
# 400 random float32 samples to be summed by a single block of reduce0.
a = np.random.randn(400).astype(np.float32)

# Output buffer with a's shape; with the default single-block grid the kernel
# only stores a result in dest[0].
dest = np.zeros_like(a)
# reduce0 declares `extern __shared__ float sdata[]`, so the shared-memory size
# must be given at launch: one float per thread, i.e. a.nbytes bytes for a
# block of a.size threads. Without `shared=` the array would have zero bytes.
reduce0(cu.In(a), cu.Out(dest), block=(a.size, 1, 1), shared=a.nbytes)