<a href="https://colab.research.google.com/github/jeonggunlee/CUDATeaching/blob/master/01_cuda_lab/05_vectorAdd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%%writefile vecAdd.cu

#include <iostream>
#include <cuda.h>
using namespace std;

int *a, *b;  // host data
int *c, *c2;  // results

__global__ void vecAdd(int *A,int *B,int *C,int N)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   C[i] = A[i] + B[i]; 
}

void vecAdd_h(int *A1,int *B1, int *C1, int N)
{
   for(int i=0;i<N;i++)
      C1[i] = A1[i] * B1[i];
}

int main(int argc,char **argv)
{
   printf("Begin \n");
   int n=10000000;
   int nBytes = n*sizeof(int);
   int block_size, block_no; 
    
   a = (int *)malloc(nBytes);
   b = (int *)malloc(nBytes);
   c = (int *)malloc(nBytes);
   c2 = (int *)malloc(nBytes);
    
   int *a_d,*b_d,*c_d;
   block_size=4000;
   block_no = n/block_size;
   dim3 dimBlock(block_size,1,1);
   dim3 dimGrid(block_no,1,1);
    
   for(int i=0;i<n;i++)
      a[i]=i,b[i]=i;
    
   printf("Allocating device memory on host..\n");
   cudaMalloc((void **)&a_d,n*sizeof(int));
   cudaMalloc((void **)&b_d,n*sizeof(int));
   cudaMalloc((void **)&c_d,n*sizeof(int));
    
   printf("Copying to device..\n");
   cudaMemcpy(a_d,a,n*sizeof(int),cudaMemcpyHostToDevice);
   cudaMemcpy(b_d,b,n*sizeof(int),cudaMemcpyHostToDevice);
   clock_t start_d=clock();
    
   printf("Doing GPU Vector add\n");
   vecAdd<<<block_no,block_size>>>(a_d,b_d,c_d,n);
   cudaDeviceSynchronize();
   clock_t end_d = clock();
   clock_t start_h = clock();
    
   printf("Doing CPU Vector add\n");
   vecAdd_h(a,b,c2,n);
   clock_t end_h = clock();
    
   double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
   double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
    
   cudaMemcpy(c,c_d,n*sizeof(int),cudaMemcpyDeviceToHost);
    
   printf("on GPU: %f, on CPU: %f\n",time_d,time_h);
   printf("Speedup: %f \n", time_h/time_d);
   cudaFree(a_d);
   cudaFree(b_d);
   cudaFree(c_d);
   return 0;
}

Overwriting vecAdd.cu


In [15]:
!nvcc -o vecAdd vecAdd.cu
!ls

sample_data  vecAdd  vecAdd.cu


In [16]:
!./vecAdd

Begin 
Allocating device memory on host..
Copying to device..
Doing GPU Vector add
Doing CPU Vector add
on GPU: 0.000010, on CPU: 0.091319
Speedup: 9131.900000 


In [17]:
!nvprof --print-gpu-trace ./vecAdd

Begin 
Allocating device memory on host..
Copying to device..
Doing GPU Vector add
Doing CPU Vector add
on GPU: 0.000008, on CPU: 0.090094
Speedup: 11261.750000 


In [19]:
!which nvprof

/usr/local/cuda/bin/nvprof


In [20]:
%cd /usr/local/cuda

/usr/local/cuda-10.0


In [21]:
!ls

bin	extras	 libnsight  NsightCompute-1.0  nvvm	share	 tools
compat	include  libnvvp    nsightee_plugins   README	src	 version.txt
doc	lib64	 LICENSE    nvml	       samples	targets


In [22]:
%cd samples

/usr/local/cuda-10.0/samples


In [23]:
!ls

0_Simple     2_Graphics  4_Finance	6_Advanced	 common    Makefile
1_Utilities  3_Imaging	 5_Simulations	7_CUDALibraries  EULA.txt


In [33]:
%cd /usr/local/cuda-10.0/samples/0_Simple

/usr/local/cuda-10.0/samples/0_Simple


In [34]:
!ls

asyncAPI	    simpleAssert_nvrtc		  simpleStreams
cdpSimplePrint	    simpleAtomicIntrinsics	  simpleSurfaceWrite
cdpSimpleQuicksort  simpleAtomicIntrinsics_nvrtc  simpleTemplates
clock		    simpleCallback		  simpleTemplates_nvrtc
clock_nvrtc	    simpleCooperativeGroups	  simpleTexture
cppIntegration	    simpleCubemapTexture	  simpleTextureDrv
cppOverload	    simpleCudaGraphs		  simpleVoteIntrinsics
cudaOpenMP	    simpleIPC			  simpleVoteIntrinsics_nvrtc
cudaTensorCoreGemm  simpleLayeredTexture	  simpleZeroCopy
fp16ScalarProduct   simpleMPI			  systemWideAtomics
inlinePTX	    simpleMultiCopy		  template
inlinePTX_nvrtc     simpleMultiGPU		  UnifiedMemoryStreams
matrixMul	    simpleOccupancy		  vectorAdd
matrixMulCUBLAS     simpleP2P			  vectorAddDrv
matrixMulDrv	    simplePitchLinearTexture	  vectorAdd_nvrtc
matrixMul_nvrtc     simplePrintf
simpleAssert	    simpleSeparateCompilation


In [35]:
%cd vectorAdd

/usr/local/cuda-10.0/samples/0_Simple/vectorAdd


In [36]:
!ls

Makefile  NsightEclipse.xml  readme.txt  vectorAdd.cu


In [37]:
!make

/usr/local/cuda-10.0/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd.o -c vectorAdd.cu
/usr/local/cuda-10.0/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd vectorAdd.o 
mkdir -p ../../bin/x86_64/linux/release
cp vectorAdd ../../bin/x86_64/linux/release


In [38]:
!../../bin/x86_64/linux/release/vectorAdd

[Vector addition of 50000 elements]
Failed to allocate device vector A (error code no CUDA-capable device is detected)!


In [44]:
!nvcc -o vectorAdd vectorAdd.cu -l../common/

[01m[KvectorAdd.cu:25:10:[m[K [01;31m[Kfatal error: [m[Khelper_cuda.h: No such file or directory
 #include [01;31m[K<helper_cuda.h>[m[K
          [01;31m[K^~~~~~~~~~~~~~~[m[K
compilation terminated.


MessageError: ignored