<a href="https://colab.research.google.com/github/jeonggunlee/CUDATeaching/blob/master/01_cuda_lab/05_vectorAdd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Vector Addition

In [0]:
%%writefile vecAdd.cu

#include <iostream>
#include <cuda.h>
using namespace std;

int *a, *b;  // host data
int *c, *c2;  // results

__global__ void vecAdd(int *A,int *B,int *C,int N)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   C[i] = A[i] + B[i]; 
}

void vecAdd_h(int *A1,int *B1, int *C1, int N)
{
   for(int i=0;i<N;i++)
      C1[i] = A1[i] * B1[i];
}

int main(int argc,char **argv)
{
   printf("Begin \n");
   int n=10000000;
   int nBytes = n*sizeof(int);
   int block_size, block_no; 
    
   a = (int *)malloc(nBytes);
   b = (int *)malloc(nBytes);
   c = (int *)malloc(nBytes);
   c2 = (int *)malloc(nBytes);
    
   int *a_d,*b_d,*c_d;
   block_size=4000;
   block_no = n/block_size;
   dim3 dimBlock(block_size,1,1);
   dim3 dimGrid(block_no,1,1);
    
   for(int i=0;i<n;i++)
      a[i]=i,b[i]=i;
    
   printf("Allocating device memory on host..\n");
   cudaMalloc((void **)&a_d,n*sizeof(int));
   cudaMalloc((void **)&b_d,n*sizeof(int));
   cudaMalloc((void **)&c_d,n*sizeof(int));
    
   printf("Copying to device..\n");
   cudaMemcpy(a_d,a,n*sizeof(int),cudaMemcpyHostToDevice);
   cudaMemcpy(b_d,b,n*sizeof(int),cudaMemcpyHostToDevice);
   clock_t start_d=clock();
    
   printf("Doing GPU Vector add\n");
   vecAdd<<<block_no,block_size>>>(a_d,b_d,c_d,n);
   cudaDeviceSynchronize();
   clock_t end_d = clock();
   clock_t start_h = clock();
    
   printf("Doing CPU Vector add\n");
   vecAdd_h(a,b,c2,n);
   clock_t end_h = clock();
    
   double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
   double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
    
   cudaMemcpy(c,c_d,n*sizeof(int),cudaMemcpyDeviceToHost);
    
   printf("on GPU: %f, on CPU: %f\n",time_d,time_h);
   printf("Speedup: %f \n", time_h/time_d);
   cudaFree(a_d);
   cudaFree(b_d);
   cudaFree(c_d);
   return 0;
}

Overwriting vecAdd.cu


In [0]:
!nvcc -o vecAdd vecAdd.cu
!ls

sample_data  vecAdd  vecAdd.cu


In [0]:
!./vecAdd

Begin 
Allocating device memory on host..
Copying to device..
Doing GPU Vector add
Doing CPU Vector add
on GPU: 0.000010, on CPU: 0.091319
Speedup: 9131.900000 


In [1]:
!nvidia-smi

Mon May 27 14:12:00 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

![Tesla K80](https://images.anandtech.com/doci/8729/TK80.jpg)


---

---

###Note: nvprof cannot be work correctly on Colab.


In [0]:
!nvprof --print-gpu-trace ./vecAdd

Begin 
Allocating device memory on host..
Copying to device..
Doing GPU Vector add
Doing CPU Vector add
on GPU: 0.000008, on CPU: 0.090094
Speedup: 11261.750000 
