## CUDA Vector addition

In [1]:
 %%file vector_add_cuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 2020
#define MAX_ERR 1e-6

/*
 * Element-wise vector addition: out[i] = a[i] + b[i] for every 0 <= i < n.
 *
 * Launch layout: 1D grid of 1D blocks. Only the .x components of the
 * block/thread indices are used. Each thread starts at its global index and
 * advances by the total number of launched threads, so every element is
 * processed regardless of how many blocks/threads the caller launches.
 * No shared memory is used.
 *
 * out  - destination array, at least n elements
 * a, b - input arrays, at least n elements each (only read)
 * n    - number of elements to process; n <= 0 performs no work
 */
__global__ void vector_add(double *out, const double *a, const double *b, int n) {
    // 64-bit arithmetic so the index and stride computations cannot overflow int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = a[i] + b[i];
    }
}

/* Terminate the program with a diagnostic if a CUDA runtime call failed.
 *
 * status - value returned by the CUDA runtime call
 * what   - short description of the call, used in the error message
 */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two N-element vectors on the GPU and verifies the result on the host.
 *
 * Prints every 101st sum followed by "PASSED" when all elements match within
 * MAX_ERR. Returns EXIT_SUCCESS on success and EXIT_FAILURE if a host
 * allocation fails or verification finds a mismatch; CUDA runtime errors
 * terminate the process via check_cuda().
 */
int main(){
    const size_t bytes = sizeof(double) * N;
    double *a, *b, *out;
    double *d_a = NULL, *d_b = NULL, *d_out = NULL;

    // Allocate host memory
    a   = (double*)malloc(bytes);
    b   = (double*)malloc(bytes);
    out = (double*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(a);    // free(NULL) is a no-op, so partial failures are handled
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = i / 100.0;
        b[i] = (N - i) / 100.0;
    }

    // Allocate device memory
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Transfer data from host to device memory
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Executing kernel: round the block count up so the tail elements are covered
    const int threadsPerBlock = 1024;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_a, d_b, N);
    // A kernel launch returns no status; configuration errors surface here.
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Transfer data back to host memory. The blocking copy also reports any
    // error raised while the kernel was executing.
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_out -> out)");

    // Verification: explicit check so it still runs when NDEBUG disables assert()
    int passed = 1;
    for(int i = 0; i < N; i++){
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "Mismatch at index %d: %f + %f != %f\n", i, a[i], b[i], out[i]);
            passed = 0;
            break;
        }
        if (i % 101 == 0)
          printf("%.2f + %.2f = %.2f\n", a[i], b[i], out[i]);
    }
    if (passed)
        printf("PASSED\n");

    // Deallocate device memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing vector_add_cuda.cu


In [2]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} nvcc -o vector_add_cuda vector_add_cuda.cu && ./vector_add_cuda

0.00 + 20.20 = 20.20
1.01 + 19.19 = 20.20
2.02 + 18.18 = 20.20
3.03 + 17.17 = 20.20
4.04 + 16.16 = 20.20
5.05 + 15.15 = 20.20
6.06 + 14.14 = 20.20
7.07 + 13.13 = 20.20
8.08 + 12.12 = 20.20
9.09 + 11.11 = 20.20
10.10 + 10.10 = 20.20
11.11 + 9.09 = 20.20
12.12 + 8.08 = 20.20
13.13 + 7.07 = 20.20
14.14 + 6.06 = 20.20
15.15 + 5.05 = 20.20
16.16 + 4.04 = 20.20
17.17 + 3.03 = 20.20
18.18 + 2.02 = 20.20
19.19 + 1.01 = 20.20
PASSED
