<a href="https://colab.research.google.com/github/jeonggunlee/CUDATeaching/blob/master/01_cuda_lab/04_helloCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CUDA Basic Programming

## GPU 코딩의 기본 단계 익히기

- CPU 메모리 설정
- CPU 메모리 데이터 설정
- GPU 메모리 설정 : cudaMalloc(...)
- CPU --> GPU 데이터 전송: cudaMemcpy(to, from, sizeofdata, cudaMemcpyHostToDevice)
- GPU 함수 (Kernel) 수행
- GPU --> CPU 연산 결과 데이터 전송: cudaMemcpy(to, from, sizeofdata, cudaMemcpyDeviceToHost)
- 연산 결과를 CPU에서 사용

*  *  *

## n 값을 변경시켜가며 nvprof를 통해서 수행시간 및 데이터 전달 시간을 살펴보세요.

In [35]:
%%writefile cudabasic.cu

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda.h>
using namespace std;

// Host (CPU) buffers: host_A is the input vector, host_C1 receives the CPU
// reference result, host_C2 receives the GPU result copied back from device_C.
int *host_A, *host_C1, *host_C2;       // host data
// Device (GPU) buffers: device_A holds the input copied from host_A,
// device_C holds the output written by the vecAddOne kernel.
int *device_A, *device_C;   // device input and device result

// GPU kernel: each thread handles at most one element, writing C[i] = A[i] + 1.
//
// A : input vector (read)
// C : output vector (written)
// N : number of elements; threads whose global index is >= N do nothing
//
// The global index is derived from the x-dimension of the launch only.
__global__ void vecAddOne(int *A, int *C, int N)
{
   const int gid = blockIdx.x * blockDim.x + threadIdx.x;

   // Threads past the end of the data have nothing to do.
   if( gid >= N )
      return;

   const int value = A[gid];
   C[gid] = value + 1;
}

// CPU reference implementation: writes C1[i] = A1[i] + 1 for the first N
// elements, in order. Does nothing when N <= 0.
//
// A1 : input vector (read)
// C1 : output vector (written)
// N  : number of elements to process
void vecAddOne_h(int *A1, int *C1, int N)
{
   const int *src = A1;
   int *dst = C1;

   for(int remaining = N; remaining > 0; --remaining)
   {
      *dst = *src + 1;
      ++src;
      ++dst;
   }
}

// Terminates the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
   if (status != cudaSuccess)
   {
      fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
   }
}

// Walks through the basic GPU workflow: allocate and fill host memory,
// allocate device memory, copy the input to the device, run the kernel,
// copy the result back, and compare it with a CPU reference.
//
// Returns EXIT_SUCCESS when the GPU and CPU results match, EXIT_FAILURE when
// they differ or when an allocation or CUDA call fails.
int main(int argc,char **argv)
{
   int n=1024*1024;
   // size_t so the byte count cannot overflow an int for large n.
   size_t nBytes = (size_t)n * sizeof(int);
   int block_size = 32;
   // Round up so the tail is covered when n is not a multiple of block_size;
   // the kernel's `i < N` guard discards the surplus threads.
   int block_no = (n + block_size - 1) / block_size;

   // ===============================================================
   // Allocate CPU memory
   //
   host_A = (int *)malloc(nBytes);
   host_C1 = (int *)malloc(nBytes);
   host_C2 = (int *)malloc(nBytes);
   if (nBytes > 0 && (host_A == NULL || host_C1 == NULL || host_C2 == NULL))
   {
      fprintf(stderr, "Host memory allocation failed\n");
      return EXIT_FAILURE;
   }

   // ===============================================================
   // Fill the CPU input data so both the CPU and GPU paths operate on
   // well-defined values rather than uninitialised memory.
   //
   for(int i=0; i<n; i++)
      host_A[i] = i;

   // ===============================================================
   printf("Allocating device memory on host..\n");
   checkCuda(cudaMalloc((void **)&device_A, nBytes), "cudaMalloc(device_A)");
   checkCuda(cudaMalloc((void **)&device_C, nBytes), "cudaMalloc(device_C)");
   // ===============================================================
   printf("Copying to device..\n");
   checkCuda(cudaMemcpy(device_A, host_A, nBytes, cudaMemcpyHostToDevice),
             "cudaMemcpy host->device");
   // ===============================================================
   printf("Doing GPU Vector + 1 \n");
   if (n > 0)   // a grid of zero blocks is an invalid launch configuration
   {
      vecAddOne<<<block_no,block_size>>>(device_A, device_C, n);
      // Kernel launches return nothing; configuration errors surface here.
      checkCuda(cudaGetLastError(), "vecAddOne launch");
   }
   // Errors raised while the kernel runs surface at the next synchronisation.
   checkCuda(cudaDeviceSynchronize(), "vecAddOne execution");
   // ===============================================================
   printf("Doing a CPU Vector add\n");
   vecAddOne_h(host_A, host_C1, n);

   checkCuda(cudaMemcpy(host_C2, device_C, nBytes, cudaMemcpyDeviceToHost),
             "cudaMemcpy device->host");

   // Compare the results
   printf("결과 비교\n");
   bool matches = true;
   for(int i=0; i<n;i++)
   {
       if(host_C1[i] != host_C2[i])
       {
           printf("Something Wrong ! \n");
           matches = false;
           break;
       }
   }
   cudaFree(device_A);
   cudaFree(device_C);
   free(host_A);
   free(host_C1);
   free(host_C2);
   return matches ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting cudabasic.cu


In [0]:
!nvcc -o cudabasic cudabasic.cu

In [37]:
!./cudabasic

Allocating device memory on host..
Copying to device..
Doing GPU Vector + 1 
Doing a CPU Vector add
결과 비교


*  *  *
*  *  *
## nvprof:


참조:

- https://devblogs.nvidia.com/cuda-pro-tip-nvprof-your-handy-universal-gpu-profiler/

In [38]:
!nvprof ./cudabasic

Allocating device memory on host..
==835== NVPROF is profiling process 835, command: ./cudabasic
Copying to device..
Doing GPU Vector + 1 
Doing a CPU Vector add
결과 비교
==835== Profiling application: ./cudabasic
==835== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   55.53%  1.6226ms         1  1.6226ms  1.6226ms  1.6226ms  [CUDA memcpy DtoH]
                   40.72%  1.1897ms         1  1.1897ms  1.1897ms  1.1897ms  [CUDA memcpy HtoD]
                    3.75%  109.72us         1  109.72us  109.72us  109.72us  vecAddOne(int*, int*, int)
      API calls:   97.66%  228.51ms         2  114.26ms  294.69us  228.22ms  cudaMalloc
                    1.94%  4.5426ms         2  2.2713ms  1.5016ms  3.0410ms  cudaMemcpy
                    0.16%  363.22us         2  181.61us  145.53us  217.69us  cudaFree
                    0.09%  203.17us         1  203.17us  203.17us  203.17us  cuDeviceTotalMem
                    0.07%  15

---
---

## Thread와 Block 이해하기

- blockIdx.x / blockIdx.y
- blockDim.x / blockDim.y
- threadIdx.x / threadIdx.y
- gridDim.x / gridDim.y


In [24]:
%%writefile helloCUDA.cu

#include <stdio.h>

// GPU kernel: every launched thread prints one line containing its thread
// index within its block and the index of the block it belongs to
// (x-dimension only).
__global__ void helloCUDA(void)
{
  const unsigned int blockId = blockIdx.x;
  const unsigned int threadId = threadIdx.x;

  printf("Hello thread %d in block %d\n", threadId, blockId);
}

// Launches the helloCUDA kernel and waits for it to finish so that the
// device-side printf output is flushed before the program exits.
//
// Returns 0 on success, 1 if the launch or the kernel execution reports a
// CUDA error.
int main()
{
  int n = 12;
  int threadsPerBlock = 4;                  // number of threads in each block
  int blocksPerGrid = n / threadsPerBlock;  // number of blocks in the grid

  // The total number of threads created is blocksPerGrid * threadsPerBlock.
  // Integer division rounds down, so this equals n only when n is a
  // multiple of threadsPerBlock.

  helloCUDA<<<blocksPerGrid, threadsPerBlock>>>();

  // A kernel launch returns nothing: configuration errors are reported by
  // cudaGetLastError(), execution errors by the following synchronisation.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    status = cudaDeviceSynchronize();

  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    return 1;
  }
  return 0;
}

Overwriting helloCUDA.cu


In [0]:
!nvcc -o helloCUDA helloCUDA.cu

In [26]:
!./helloCUDA

Hello thread 0 in block 2
Hello thread 1 in block 2
Hello thread 2 in block 2
Hello thread 3 in block 2
Hello thread 0 in block 1
Hello thread 1 in block 1
Hello thread 2 in block 1
Hello thread 3 in block 1
Hello thread 0 in block 0
Hello thread 1 in block 0
Hello thread 2 in block 0
Hello thread 3 in block 0


In [40]:
!nvprof --print-gpu-trace ./helloCUDA

==882== NVPROF is profiling process 882, command: ./helloCUDA
Hello thread 0 in block 2
Hello thread 1 in block 2
Hello thread 2 in block 2
Hello thread 3 in block 2
Hello thread 0 in block 1
Hello thread 1 in block 1
Hello thread 2 in block 1
Hello thread 3 in block 1
Hello thread 0 in block 0
Hello thread 1 in block 0
Hello thread 2 in block 0
Hello thread 3 in block 0
==882== Profiling application: ./helloCUDA
==882== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*           Device   Context    Stream  Name
411.15ms  92.760us              (3 1 1)         (4 1 1)        32        0B        0B     Tesla T4 (0)         1         7  helloCUDA(void) [106]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.