<a href="https://colab.research.google.com/github/jeonggunlee/CUDATeaching/blob/master/01_cuda_lab/07_memoryType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Memory Architecture
### GPU의 메모리 구조를 고려한 최적 Coding

- Local Memory
- Global Memory
- Shared Memory

### 참조

- https://github.com/jeonggunlee/cs344

In [2]:
%%writefile memoryType.cu

// Debug-only guard for CUDA runtime API results; wrap it around any runtime
// API call. When DEBUG or _DEBUG is defined, a failing result is reported on
// stderr and then trips an assert. In every other build the check is compiled
// out and the result is simply handed back to the caller unchanged.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  const bool ok = (result == cudaSuccess);
  if (!ok) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}

// CUDA에서 제공하는 서로 다른 타입의 메모리 공간 활용하기

#include <stdio.h>

/**********************
 * using local memory *
 **********************/

// Kernel (runs on the GPU, launched from the host) that works only with
// thread-private variables. Nothing is written back to memory, so the kernel
// produces no result the host can observe.
__global__ void use_local_memory_GPU(float in)
{
    // "f" and the by-value parameter "in" are private to each thread.
    float f = in;
    // ... real code would presumably do other stuff here ...

    // ADDED: per-thread arithmetic that touches thread-private values only.
    const int tid = threadIdx.x;
    float sum = 0.0f;
    int k = 0;
    while (k < tid) {
        sum += k;
        ++k;
    }
    float average = sum / (tid + 1.0f);

    // Block-wide barrier: every thread of the block arrives here before any
    // of them continues (no shared memory is involved in this kernel).
    __syncthreads();
}

/**********************
 * using global memory *
 **********************/

// Kernel (runs on the GPU, callable from the host) that reads and writes
// global memory directly.
// Thread t = threadIdx.x sums array[0 .. t-1], divides that sum by (t + 1),
// and lowers array[t] to the quotient when array[t] is larger.
// Only threadIdx.x is used for indexing, so "array" must hold at least
// blockDim.x floats and every block of a multi-block launch touches the
// same elements.
__global__ void use_global_memory_GPU(float *array)
{
    const int tid = threadIdx.x;

    // "array" is a pointer into global memory on the device
    // array[threadIdx.x] = 2.0f * (float) threadIdx.x;

    float sum = 0.0f;
    for (int k = 0; k < tid; ++k) {
        sum += array[k];
    }
    const float average = sum / (tid + 1.0f);

    // Barrier: all threads of this block finish the reads above before any
    // thread of the block overwrites an element below.
    __syncthreads();

    if (array[tid] > average) {
        array[tid] = average;
    }
}

/**********************
 * using shared memory *
 **********************/

// Kernel that stages its input in shared memory before averaging.
// Thread t = threadIdx.x copies array[t] into a block-wide shared buffer,
// then sums the staged elements 0 .. t-1, divides by (t + 1), and lowers
// array[t] to that quotient when the staged value is larger.
//
// Launch requirements: 1D blocks; "array" must hold at least blockDim.x
// floats. Only threadIdx.x is used for indexing, so every block of a
// multi-block launch works on the same elements.
// Shared memory: 1024 floats (4 KB) of static shared memory per block.
__global__ void use_shared_memory_GPU(float *array)
{
    // The staging buffer is indexed by threadIdx.x, so it must have one slot
    // per thread of the block. 1024 is the per-block thread limit, which makes
    // the buffer large enough for any valid 1D launch (the previous fixed size
    // of 128 was overrun by launches with more than 128 threads per block).
    const int MAX_THREADS_PER_BLOCK = 1024;

    // local variables, private to each thread
    const int index = threadIdx.x;
    float sum = 0.0f;

    // __shared__ variables are visible to all threads in the thread block
    // and have the same lifetime as the thread block
    __shared__ float sh_arr[MAX_THREADS_PER_BLOCK];

    // copy data from "array" in global memory to sh_arr in shared memory.
    // here, each thread is responsible for copying a single element; the
    // value is also kept in a thread-private variable for the comparison below.
    const float mine = array[index];
    sh_arr[index] = mine;

    __syncthreads();    // ensure all the writes to shared memory have completed

    // now, sh_arr is fully populated for this block. Sum all previous elements
    // from shared memory instead of re-reading global memory.
    for (int i = 0; i < index; i++) { sum += sh_arr[i]; }
    const float average = sum / (index + 1.0f);

    // only the final result goes back to global memory
    if (mine > average) { array[index] = average; }
}

// Host driver: launches the three demo kernels, times each launch with CUDA
// events, and prints the elapsed milliseconds as "Local", "Global" and
// "Shared". Returns 0.
int main(int argc, char **argv)
{
    // Threads per block. 128 is the element count the kernels in this file
    // were written around; a larger block would require the shared-memory
    // kernel's staging array to be at least that large. Declared const so it
    // is a valid (non-VLA) array bound for h_arr below.
    const int blockSize = 128;
    // Number of blocks per launch. The kernels index by threadIdx.x only, so
    // every block works on the same blockSize elements.
    const int nBlock = 1024;
    float ms;
    cudaEvent_t startEvent, stopEvent;

    checkCuda( cudaEventCreate(&startEvent) );
    checkCuda( cudaEventCreate(&stopEvent) );

    /*
     * First, call a kernel that shows using local memory
     */
    checkCuda( cudaEventRecord(startEvent,0) );
    use_local_memory_GPU<<<nBlock, blockSize>>>(2.0f);
    checkCuda( cudaEventRecord(stopEvent,0) );
    checkCuda( cudaGetLastError() );    // a launch reports errors only through this call
    checkCuda( cudaEventSynchronize(stopEvent) );
    checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    printf("Local : %f\n", ms);

    /*
     * Next, call a kernel that shows using global memory
     */
    float h_arr[blockSize];   // convention: h_ variables live on host
    float *d_arr = NULL;      // convention: d_ variables live on device (GPU global mem)

    // give the host array defined contents before it is uploaded
    for (int i = 0; i < blockSize; i++) { h_arr[i] = (float) i; }

    // allocate global memory on the device, place result in "d_arr"
    checkCuda( cudaMalloc((void **) &d_arr, sizeof(float) * blockSize) );
    // now copy data from host memory "h_arr" to device memory "d_arr"
    checkCuda( cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * blockSize, cudaMemcpyHostToDevice) );

    // launch the kernel (nBlock blocks of blockSize threads)
    checkCuda( cudaEventRecord(startEvent,0) );
    use_global_memory_GPU<<<nBlock, blockSize>>>(d_arr);  // modifies the contents of array at d_arr
    checkCuda( cudaEventRecord(stopEvent,0) );
    checkCuda( cudaGetLastError() );
    checkCuda( cudaEventSynchronize(stopEvent) );
    checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    printf("Global: %f\n", ms);

    // copy the modified array back to the host, overwriting contents of h_arr
    checkCuda( cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * blockSize, cudaMemcpyDeviceToHost) );
    // ... do other stuff ...

    /*
     * Next, call a kernel that shows using shared memory
     */

    // as before, pass in a pointer to data in global memory
    checkCuda( cudaEventRecord(startEvent,0) );
    use_shared_memory_GPU<<<nBlock, blockSize>>>(d_arr);
    checkCuda( cudaEventRecord(stopEvent,0) );
    checkCuda( cudaGetLastError() );
    checkCuda( cudaEventSynchronize(stopEvent) );
    checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    printf("Shared: %f\n", ms);

    // copy the modified array back to the host (device -> host; the copy kind
    // must match the direction of the dst/src pointers)
    checkCuda( cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * blockSize, cudaMemcpyDeviceToHost) );
    // ... do other stuff ...

    // release the device buffer and the timing events
    checkCuda( cudaFree(d_arr) );
    checkCuda( cudaEventDestroy(startEvent) );
    checkCuda( cudaEventDestroy(stopEvent) );
    return 0;
}

Overwriting memoryType.cu


In [3]:
!nvcc -o memoryType memoryType.cu





In [8]:
!./memoryType

Local : 0.020672
Global: 0.122144
Shared: 0.122144


In [0]:
!nvprof ./memoryType

==683== NVPROF is profiling process 683, command: ./memoryType
Local : 0.039360
Global: 0.026976
Shared: 0.026976
==683== Profiling application: ./memoryType
==683== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   85.54%  1.28212s         3  427.37ms  9.2140us  1.28210s  cudaEventSynchronize
                   14.41%  215.93ms         2  107.96ms  1.1800us  215.93ms  cudaEventCreate
                    0.02%  330.56us         1  330.56us  330.56us  330.56us  cudaMalloc
                    0.01%  175.42us         1  175.42us  175.42us  175.42us  cuDeviceTotalMem
                    0.01%  153.00us        96  1.5930us     131ns  62.566us  cuDeviceGetAttribute
                    0.00%  60.460us         3  20.153us  11.842us  28.395us  cudaLaunchKernel
                    0.00%  56.360us         3  18.786us  3.2680us  30.320us  cudaMemcpy
                    0.00%  25.164us         1  25.164us