# Introduction to CUDA programming


### Check the GPU and the CUDA compiler version

In [None]:
# Query the NVIDIA driver for the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Print the version of the nvcc compiler available in this environment.
!nvcc --version

## nvcc for Jupyter notebook

In [None]:
!pip install nvcc4jupyter

In [None]:
# Load the nvcc4jupyter extension; the cells below rely on its %%cuda cell magic.
%load_ext nvcc4jupyter

In [None]:
%%cuda
#include <iostream>
// Host-only program: writes a greeting to standard output and exits with
// status 0. No GPU code is involved.
int main()
{
    const char *greeting = "Hello World";

    std::cout << greeting;  // no trailing newline is written
    return 0;
}

In [None]:
%%cuda
/* Hello World cuda program*/
#include <iostream>
#include <cuda.h>
#include <stdio.h>

using namespace std;

/*
 * Kernel: each launched thread prints one greeting line.
 * printf is used here because cout does not work in device code.
 */
__global__ void myKernel()
{
    printf("GPU: Hello World.\n");
}

// Host entry point: prints a CPU greeting, launches myKernel with a single
// thread, and returns right away.
// NOTE(review): nothing waits for the GPU after the launch, so the kernel's
// printf output may not be shown. This looks intentional (it demonstrates why
// a synchronization call is needed after a launch) -- confirm before changing.
int main()
{
    const int numBlocks = 1;
    const int threadsPerBlock = 1;

    std::cout << "CPU : Hello World";
    myKernel<<<numBlocks, threadsPerBlock>>>();

    return 0;
}

In [None]:
%%cuda
#include <iostream>
#include <cuda.h>
#include <stdio.h>

using namespace std;

/*
 * Kernel: each launched thread prints one greeting line.
 * Print statements inside a kernel are best avoided; this one is only here
 * for demonstration.
 */
__global__ void myKernel()
{
    printf("GPU: Hello World.\n");
}

int main(){
    std::cout << "CPU : Hello World";
    myKernel<<<1, 1>>>();
    cudaThreadSynchronize();
    return 0;
}

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
// Kernel: every thread of the launch prints the same greeting line, so the
// number of lines printed equals the number of threads launched.
__global__ void myKernel()
{
    printf("Hello World.\n");
}
int main() {
    myKernel<<<1, 32>>>();
    cudaThreadSynchronize();
    return 0;
  }

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
#define N 10
// Kernel: each thread prints its own index within the block (threadIdx.x),
// one value per line.
__global__ void myKernel()
{
    const unsigned int tid = threadIdx.x;
    printf("%d\n", tid);
}
int main() {
    myKernel<<<1, N>>>();
    cudaThreadSynchronize();
    return 0; }

# Typical CUDA program
- Load data to CPU memory
- Transfer data to GPU memory
- Launch kernels to act on the data
- Transfer back data to CPU memory
- Proceed with CPU execution

In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>
#define N 10
/*
 * Kernel: thread t stores t * 10 into a[t], where t is threadIdx.x.
 * The previous contents of `a` are not read.
 *
 * Only threadIdx.x is used as the index, so every block of a launch would
 * write the same elements; `a` must hold at least as many ints as there are
 * threads per block.
 */
__global__ void scaleArr(int *a)
{
    const unsigned int tid = threadIdx.x;
    a[tid] = tid * 10;
}

int main() {
    int a_hs[N], *a_dev;
    int i;
    cudaMalloc(&a_dev, N * sizeof(int));

    scaleArr<<<1, N>>>(a_dev);
    cudaMemcpy(a_hs, a_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (i = 0; i < N; ++i)
      printf("%d\n", a_hs[i]);
    return 0;
  }

## How to find the unique ID of each thread

https://github.com/tpn/pdfs/blob/master/CUDA%20Thread-Indexing%20Cheatsheet.pdf