# CUDA编程基础-向量和矩阵加减

In [1]:
import pycuda.autoinit
import pycuda.driver as cu
import numpy as np
from pycuda.compiler import SourceModule
import string

In [2]:
def load_kernel_from_file(fname,kname):
    sources = None
    kernel = None
    with open(fname,encoding='utf-8') as f:
        sources = str(f.read())
        sm = SourceModule(sources)
        kernel = sm.get_function(kname)
    return kernel

def load_kernel_from_string(sources,kname):
    sm = SourceModule(str(sources))
    kernel = sm.get_function(kname)
    return kernel

## 1.向量加减

In [3]:
sources = string.Template("""
    __global__ void vec_add_kernel(float *A,float *B,float *C){
        int i = threadIdx.x;
        C[i] = A[i] + B[i];
    }
    
    __global__ void vec_sub_kernel(float * A,float *B,float *C){
        int i = threadIdx.x;
        C[i] = A[i] - B[i];
    }
    
     __global__ void vec_add_kernel_2(float *A,float *B,float *C,int N){
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < N)
            C[i] = A[i] + B[i];
    }
""").substitute()

In [5]:
vec_add_kernel = load_kernel_from_string(sources,'vec_add_kernel')
vec_sub_kernel = load_kernel_from_string(sources,'vec_sub_kernel')
vec_add_kernel_2 = load_kernel_from_string(sources,'vec_add_kernel_2')

In [6]:
N = 1024
vec_A = np.arange(1,N + 1).astype(np.float32)
vec_B = np.arange(1,N + 1).astype(np.float32)
vec_C = np.zeros_like(vec_A)

In [13]:
block = (1024,1,1)
grid = (int((N + block[0] - 1) / block[0]),int((N + block[1] - 1) / block[1]),1)
print(block,grid)

(1024, 1, 1) (1, 1024, 1)


In [14]:
vec_add_kernel_2(cu.In(vec_A),cu.In(vec_B),cu.Out(vec_C),np.int32(N),grid=grid,block=block)

In [15]:
print((vec_C == (vec_A + vec_B)).all())

True


## 2.矩阵加减

In [22]:
sources = string.Template("""
    __global__ void mat_add_kernel(float A[$N][$N],float B[$N][$N],
                                    float C[$N][$N]){
        int i = threadIdx.x;
        int j = threadIdx.y;
        C[i][j] = A[i][j] + B[i][j];
    }
    
    __global__ void mat_add_kernel_2(float * A,float * B,
                                    float *C,int N){
        
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        int idx = j * N + i;
        if (i < N && j < N){
           C[idx] = A[idx] + B[idx];
        }
        
    }
   
""").substitute(N=512)

In [23]:
mat_add_kernel = load_kernel_from_string(sources,'mat_add_kernel')
mat_add_kernel_2 = load_kernel_from_string(sources,'mat_add_kernel_2')

In [28]:
N = 16
block = (N,N,1)
grid = (1,1,1)

In [29]:
mat_A = np.arange(1,257).reshape((N,N)).astype('float32')
mat_B = np.arange(1,257).reshape((N,N)).astype('float32')
mat_C = np.zeros_like(mat_A)

In [30]:
mat_add_kernel(cu.In(mat_A),cu.In(mat_B),cu.Out(mat_C),grid=grid,block=block)

In [31]:
W = 512
mat_A = np.arange(1,262145).astype(np.float32).reshape((W,W))
mat_B = np.arange(1,262145).astype(np.float32).reshape((W,W))
mat_C = np.zeros_like(vec_A)
print(vec_A.shape)

(512, 512)


In [32]:
block = (32,32,1)
grid = (int (W / block[0]),int(W / block[1]),1)
print(grid)

(16, 16, 1)


In [33]:
mat_add_kernel_2(cu.In(mat_A),cu.In(mat_B),cu.Out(mat_C),np.int32(W),grid=grid,block=block)

In [34]:
print((mat_C == (mat_A + mat_B)).all())

True
