# Introductory Course from NVIDIA

This notebook is built for a GTX 750 Ti (compute capability 5.0). Set the compute capability that matches your own GPU in the first cell below.

In [1]:
# Compute capability of the target GPU, split into major/minor digits
# (5 and 0 for the GTX 750 Ti this notebook was built on). Despite the
# variable names, these are NOT the CUDA toolkit version.
cuda_major_version = "5"
cuda_minor_version = "0"

# nvcc takes the capability without the dot: 5.0 -> compute_50 / sm_50.
cc_ver = f"{cuda_major_version}{cuda_minor_version}"

# Reused by every `! nvcc ... {cuda_args}` cell further down.
cuda_args = f"-gencode arch=compute_{cc_ver},code=sm_{cc_ver}"
print(cuda_args)

-gencode arch=compute_50,code=sm_50


## Basic Hello World using CPU

In [2]:
%%writefile add.cpp

#include <iostream>
#include <math.h>

// Element-wise sum of two arrays: for every i in [0, n), y[i] becomes
// x[i] + y[i]. x is only read; y is updated in place.
void add(int n, float *x, float *y)
{
  int i = 0;
  while (i < n) {
    y[i] += x[i];
    ++i;
  }
}

// Host-only driver: fills two 1M-element arrays with 1.0f and 2.0f, sums
// them with add(), and prints the largest deviation from the expected 3.0f.
int main(void)
{
  const int count = 1<<20; // 2^20 elements (~1M)

  float *x = new float[count];
  float *y = new float[count];

  // Fill both input arrays on the host
  for (int k = 0; k != count; ++k) {
    x[k] = 1.0f;
    y[k] = 2.0f;
  }

  // Sum all elements on the CPU; the result lands in y
  add(count, x, y);

  // Every element of y should now be exactly 3.0f
  float worst = 0.0f;
  for (const float *p = y; p != y + count; ++p)
    worst = fmax(worst, fabs(*p - 3.0f));
  std::cout << "Max error: " << worst << std::endl;

  // Release the host arrays
  delete [] y;
  delete [] x;

  return 0;
}

Overwriting add.cpp


In [3]:
# Build, then run only if the build succeeded (otherwise a stale ./add
# from an earlier run would be executed and hide the compile error).
! g++ add.cpp -o add && ./add

Max error: 0


## Basic Hello World using GPU

In [4]:
%%writefile add.cu

#include <iostream>
#include <math.h>
// Kernel: element-wise y[i] = x[i] + y[i] for every i in [0, n).
// No thread or block index is used, so each thread that executes this
// kernel walks the entire array on its own.
__global__
void add(int n, float *x, float *y)
{
  int i = 0;
  while (i < n) {
    y[i] += x[i];
    ++i;
  }
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error (" << what << "): "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element arrays on the GPU using a single thread and prints the
// largest deviation from the expected 3.0f.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged x"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged y")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU (1 block, 1 thread)
  add<<<1, 1>>>(N, x, y);

  // A launch that never started (for example, a binary built for a different
  // compute capability than the installed GPU) is only reported here.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch");

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernel was running are returned by the synchronize call.
  ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

Overwriting add.cu


In [5]:
# IPython substitutes {cuda_args} with the Python variable from the first cell.
# Run only if the build succeeded, so a stale ./add_cuda is never executed.
! nvcc add.cu -o add_cuda {cuda_args} && ./add_cuda

Max error: 0


## Profile the GPU-based Hello World Program

In [6]:
# Baseline profile: add.cu launches the kernel with a single thread (<<<1, 1>>>).
! nvprof ./add_cuda

==9376== NVPROF is profiling process 9376, command: ./add_cuda
Max error: 0
==9376== Profiling application: ./add_cuda
==9376== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  172.32ms         1  172.32ms  172.32ms  172.32ms  add(int, float*, float*)
      API calls:   74.83%  172.32ms         1  172.32ms  172.32ms  172.32ms  cudaDeviceSynchronize
                   23.92%  55.086ms         2  27.543ms  352.13us  54.734ms  cudaMallocManaged
                    0.74%  1.6996ms         1  1.6996ms  1.6996ms  1.6996ms  cudaLaunchKernel
                    0.45%  1.0386ms         2  519.28us  504.34us  534.22us  cudaFree
                    0.06%  127.78us       114  1.1200us      77ns  52.862us  cuDeviceGetAttribute
                    0.00%  10.833us         1  10.833us  10.833us  10.833us  cuDeviceGetName
                    0.00%  6.7190us         1  6.7190us  6.7190us  6.7190us  cuDeviceGetPCIBusId
        

## Document the GPUs that are in use

In [7]:
# Record the GPU model, driver and CUDA version reported on this machine.
! nvidia-smi

Fri Jul 19 20:07:31 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 750 Ti      Off |   00000000:02:00.0 Off |                  N/A |
| 32%   32C    P0              2W /   38W |       2MiB /   2048MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Run the Hello World using GPU with 4 Threads on 1 Block

In [8]:
%%writefile add_block.cu

#include <iostream>
#include <math.h>
#include <stdio.h>
using namespace std;

// Kernel: element-wise y[i] = x[i] + y[i] for i in [0, n), with the work
// divided among the threads of a block: thread t handles elements
// t, t + blockDim.x, t + 2*blockDim.x, ...
// Only threadIdx.x is used, so every block launched repeats the same work.
__global__
void add(int n, float *x, float *y)
{
  const int first = threadIdx.x;  // this thread's first element
  const int step = blockDim.x;    // number of threads in the block
  int i = first;
  while (i < n) {
    y[i] += x[i];
    i += step;
  }
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error (" << what << "): "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element arrays on the GPU using one block of 4 threads and
// prints the largest deviation from the expected 3.0f.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged x"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged y")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU, using 1 block of 4 threads
  add<<<1, 4>>>(N, x, y);

  // A launch that never started (for example, a binary built for a different
  // compute capability than the installed GPU) is only reported here.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch");

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernel was running are returned by the synchronize call.
  ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

Overwriting add_block.cu


In [9]:
# Profile only if the build succeeded, so a stale ./add_block is never measured.
! nvcc add_block.cu -o add_block {cuda_args} && nvprof ./add_block

==9423== NVPROF is profiling process 9423, command: ./add_block
Max error: 0
==9423== Profiling application: ./add_block
==9423== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  59.519ms         1  59.519ms  59.519ms  59.519ms  add(int, float*, float*)
      API calls:   51.19%  59.522ms         1  59.522ms  59.522ms  59.522ms  cudaDeviceSynchronize
                   46.36%  53.898ms         2  26.949ms  359.68us  53.539ms  cudaMallocManaged
                    1.47%  1.7056ms         1  1.7056ms  1.7056ms  1.7056ms  cudaLaunchKernel
                    0.85%  994.07us         2  497.04us  488.25us  505.82us  cudaFree
                    0.11%  129.97us       114  1.1400us      82ns  53.039us  cuDeviceGetAttribute
                    0.01%  9.6370us         1  9.6370us  9.6370us  9.6370us  cuDeviceGetName
                    0.01%  7.6550us         1  7.6550us  7.6550us  7.6550us  cuDeviceGetPCIBusId
      

## Run the Hello World using GPU with 256 Threads on multiple Blocks

In [10]:
%%writefile add_grid.cu

#include <iostream>
#include <math.h>

using namespace std;

// Kernel: element-wise y[i] = x[i] + y[i] for i in [0, n) using a
// grid-stride loop: each thread starts at its global index and advances by
// the total number of threads in the grid, so any grid size covers all n.
__global__
void add(int n, float *x, float *y)
{
  const int step = gridDim.x * blockDim.x;  // total threads in the grid
  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += step)
    y[i] += x[i];
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error (" << what << "): "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element arrays on the GPU using 256 blocks of 256 threads and
// prints the largest deviation from the expected 3.0f.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged x"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged y")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  int blockSize = 256;
  // The kernel's grid-stride loop is correct for any grid size, so a fixed
  // 256 blocks is used here; (N + blockSize - 1) / blockSize would instead
  // launch one thread per element.
  int numBlocks = 256;

  cout << "N is: " << N << endl << "numBlocks is: " << numBlocks << endl;

  add<<<numBlocks, blockSize>>>(N, x, y);

  // A launch that never started (for example, a binary built for a different
  // compute capability than the installed GPU) is only reported here.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch");

  // Wait for GPU to finish before accessing on host; errors raised while the
  // kernel was running are returned by the synchronize call.
  ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

Overwriting add_grid.cu


In [11]:
# Build, run once, then profile; each step runs only if the previous one
# succeeded, so a stale ./add_grid is never executed or measured.
! nvcc add_grid.cu -o add_grid {cuda_args} && ./add_grid && nvprof ./add_grid

N is: 1048576
numBlocks is: 256
Max error: 0
==9474== NVPROF is profiling process 9474, command: ./add_grid
N is: 1048576
numBlocks is: 256
Max error: 0
==9474== Profiling application: ./add_grid
==9474== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  167.91us         1  167.91us  167.91us  167.91us  add(int, float*, float*)
      API calls:   92.04%  54.803ms         2  27.402ms  528.19us  54.275ms  cudaMallocManaged
                    4.97%  2.9589ms         1  2.9589ms  2.9589ms  2.9589ms  cudaLaunchKernel
                    2.43%  1.4483ms         2  724.16us  687.21us  761.11us  cudaFree
                    0.29%  171.38us         1  171.38us  171.38us  171.38us  cudaDeviceSynchronize
                    0.24%  143.07us       114  1.2550us      85ns  62.578us  cuDeviceGetAttribute
                    0.02%  10.671us         1  10.671us  10.671us  10.671us  cuDeviceGetName
                    0.01%  7

## Remove files that are no longer needed

In [12]:
# Delete only the sources and binaries this notebook created. An `add*` glob
# would also remove unrelated files whose names start with "add"; -f keeps
# the cell from failing when it is re-run after the files are already gone.
! rm -f add add.cpp add.cu add_cuda add_block add_block.cu add_grid add_grid.cu