<a href="https://colab.research.google.com/github/jezgillen/practicing-CUDA/blob/master/CUDA-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instructions
Go to **Runtime**->**Change runtime type** and set hardware accelerator to GPU. Then run the following cells.

In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-muj13m7e
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-muj13m7e
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4305 sha256=aa3002ea0bc02051c1da8e3e76285f0b0816d9c00ae9d185444d5eb81e657c1b
  Stored in directory: /tmp/pip-ephem-wheel-cache-nynnx_my/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [3]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
 
// Print device properties
void printDevProp(cudaDeviceProp devProp)
{
    printf("Major revision number:         %d\n",  devProp.major);
    printf("Minor revision number:         %d\n",  devProp.minor);
    printf("Name:                          %s\n",  devProp.name);
    printf("Total global memory:           %u\n",  devProp.totalGlobalMem);
    printf("Total shared memory per block: %u\n",  devProp.sharedMemPerBlock);
    printf("Total registers per block:     %d\n",  devProp.regsPerBlock);
    printf("Warp size:                     %d\n",  devProp.warpSize);
    printf("Maximum memory pitch:          %u\n",  devProp.memPitch);
    printf("Maximum threads per block:     %d\n",  devProp.maxThreadsPerBlock);
    for (int i = 0; i < 3; ++i)
    printf("Maximum dimension %d of block:  %d\n", i, devProp.maxThreadsDim[i]);
    for (int i = 0; i < 3; ++i)
    printf("Maximum dimension %d of grid:   %d\n", i, devProp.maxGridSize[i]);
    printf("Clock rate:                    %d\n",  devProp.clockRate);
    printf("Total constant memory:         %u\n",  devProp.totalConstMem);
    printf("Texture alignment:             %u\n",  devProp.textureAlignment);
    printf("Concurrent copy and execution: %s\n",  (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors:     %d\n",  devProp.multiProcessorCount);
    printf("Kernel execution timeout:      %s\n",  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
    return;
}
 
int main()
{
    // Number of CUDA devices
    int devCount;
    cudaGetDeviceCount(&devCount);
    printf("CUDA Device Query...\n");
    printf("There are %d CUDA devices.\n", devCount);
 
    // Iterate through devices
    for (int i = 0; i < devCount; ++i)
    {
        // Get device properties
        printf("\nCUDA Device #%d\n", i);
        cudaDeviceProp devProp;
        cudaGetDeviceProperties(&devProp, i);
        printDevProp(devProp);
    }
 
    printf("\nPress any key to exit...");
    char c;
    scanf("%c", &c);
 
    return 0;
}

CUDA Device Query...
There are 1 CUDA devices.

CUDA Device #0
Major revision number:         7
Minor revision number:         0
Name:                          Tesla V100-SXM2-16GB
Total global memory:           4060610560
Total shared memory per block: 49152
Total registers per block:     65536
Warp size:                     32
Maximum memory pitch:          2147483647
Maximum threads per block:     1024
Maximum dimension 0 of block:  1024
Maximum dimension 1 of block:  1024
Maximum dimension 2 of block:  64
Maximum dimension 0 of grid:   2147483647
Maximum dimension 1 of grid:   65535
Maximum dimension 2 of grid:   65535
Clock rate:                    1530000
Total constant memory:         65536
Texture alignment:             512
Concurrent copy and execution: Yes
Number of multiprocessors:     80
Kernel execution timeout:      No

Press any key to exit...


# Vector Addition

In [4]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
__global__ void vecAddKernel(float* A, float* B, float* C, int  n){
    int i = threadIdx.x+blockDim.x*blockIdx.x;
    if(i <n) {
        C[i] = A[i] + B[i];
    }
}
void vecAdd(float *h_A, float *h_B, float *h_C, int n){
    int size = n * sizeof(float); 
    float *d_A, *  d_B, *  d_C;
    cudaMalloc((void **) &d_A, size);    
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_B, size);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_C, size);
    
    // Run ceil(n/256.0) blocks of 256   threads each
    vecAddKernel<<<ceil(n/256.0),256>>>(d_A , d_B, d_C , n);
    
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A);  
    cudaFree(d_B);  
    cudaFree(d_C);
}

int main(){
    int n = 10;
    float* h_A = (float*)malloc(n*sizeof(float));
    float* h_B = (float*)malloc(n*sizeof(float));
    float* h_C = (float*)malloc(n*sizeof(float));
    printf("Vector 1:\n");
    for(int i=0;i<n;i++){
        h_A[i] = (rand()%1000)/1000.;
        printf("%.5f ",h_A[i]);
    }
    printf("\n");
    printf("Vector 2:\n");
    for(int i=0;i<n;i++){
        h_B[i] = (rand()%1000)/1000.;
        printf("%.5f ",h_B[i]);
    }
    printf("\n");
    printf("Sequential Results:\n");
    for(int i=0;i<n;i++){
        printf("%.5f ",h_A[i]+h_B[i]);
    }
    printf("\n");
    printf("Parallel Results:\n");
    vecAdd(h_A, h_B, h_C, n);
    for(int i=0;i<n;i++){
        printf("%.5f ",h_C[i]);
    }
    printf("\n");

}


Vector 1:
0.38300 0.88600 0.77700 0.91500 0.79300 0.33500 0.38600 0.49200 0.64900 0.42100 
Vector 2:
0.36200 0.02700 0.69000 0.05900 0.76300 0.92600 0.54000 0.42600 0.17200 0.73600 
Sequential Results:
0.74500 0.91300 1.46700 0.97400 1.55600 1.26100 0.92600 0.91800 0.82100 1.15700 
Parallel Results:
0.74500 0.91300 1.46700 0.97400 1.55600 1.26100 0.92600 0.91800 0.82100 1.15700 



# Matrix Multiplication

In [19]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_WIDTH 16


__global__ void matMulKernel(float *A_gpu, float *B_gpu, float *C_gpu, int width){
    __shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
    __shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

    // # Calculate my row and col
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    float tmp = 0;
    // # Start loop 
    for(int phase=0; phase<width/TILE_WIDTH; phase++){
        // # Work together with block to move a section of A and B into shared memory
        // # select rows of blocks with corner [blockIdx.y*blockDim.y, blockDim.x*phase]
        A_tile[threadIdx.y][threadIdx.x] = A_gpu[(row)*width + (blockDim.x*phase+threadIdx.x)];
        B_tile[threadIdx.y][threadIdx.x] = B_gpu[(blockDim.y*phase+threadIdx.y)*width + (col)];
        // # sync
        __syncthreads();
        // # sum over vectors and add to accumulator
        for(int i=0;i<TILE_WIDTH;i++){
            tmp += A_tile[threadIdx.y][i]*B_tile[i][threadIdx.x];
        }
        __syncthreads();
    }
    C_gpu[row*width + col] = tmp;
}

void matMulWrapper(float *A, float *B, float *C, int width){
    int memSize = width*width * sizeof(float); 
    float *d_A, *  d_B, *  d_C;
    cudaMalloc((void **) &d_A, memSize);    
    cudaMemcpy(d_A, A, memSize, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_B, memSize);
    cudaMemcpy(d_B, B, memSize, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_C, memSize);
    
    dim3 gridDim, blockDim;
    gridDim.x = width/TILE_WIDTH; gridDim.y = width/TILE_WIDTH;
    blockDim.x = TILE_WIDTH; blockDim.y = TILE_WIDTH;
    matMulKernel<<<gridDim, blockDim>>>(d_A , d_B, d_C , width);
    
    cudaMemcpy(C, d_C, memSize, cudaMemcpyDeviceToHost);
    cudaFree(d_A);  
    cudaFree(d_B);  
    cudaFree(d_C);

}

void matMulSeq(float* A, float* B, float* C, int size){
    for(int i=0;i<size;i++){
        for(int j=0;j<size;j++){
            float tmp = 0;
            for(int k=0;k<size;k++){
                tmp += A[i*size + k]*B[k*size + j];
            }
            C[i*size + j] = tmp;
        }
    }
}

void printMat(float* A, int width){
    for(int i=0;i<width;i++){
        for(int j=0;j<width;j++){
            printf("%.3f ",A[i*width + j]);
        }
        printf("\n");
    }
    printf("\n");
}
void initMat(float* A, int width){
    for(int i=0;i<width;i++){
        for(int j=0;j<width;j++){
            A[i*width + j] = (rand()%1000)/5000.;
        }
    }
}

int main(){
    int width = 32; // must be multiple of TILE_WIDTH at the moment
    float* A = (float*)calloc(width*width,sizeof(float));
    float* B = (float*)calloc(width*width,sizeof(float));
    float* C1 = (float*)calloc(width*width,sizeof(float));
    float* C2 = (float*)calloc(width*width,sizeof(float));

    printf("Matrix 1:\n");
    initMat(A, width);
    printMat(A, width);
    printf("Matrix 2:\n");
    initMat(B, width);
    printMat(B, width);
    printf("Parallel Results:\n");
    matMulWrapper(A, B, C1, width);
    printMat(C1, width);
    printf("Sequential Results:\n");
    matMulSeq(A,B,C2,width);
    printMat(C2, width);

    free(A);free(B);free(C1);free(C2);
}


Matrix 1:
0.077 0.177 0.155 0.183 0.159 0.067 0.077 0.098 0.130 0.084 0.072 0.005 0.138 0.012 0.153 0.185 0.108 0.085 0.034 0.147 0.042 0.074 0.113 0.086 0.156 0.106 0.172 0.025 0.013 0.027 0.186 0.160 
0.004 0.012 0.014 0.033 0.079 0.091 0.002 0.008 0.046 0.075 0.084 0.184 0.157 0.107 0.040 0.065 0.063 0.074 0.083 0.105 0.018 0.196 0.191 0.175 0.172 0.034 0.199 0.056 0.061 0.185 0.017 0.065 
0.067 0.101 0.169 0.146 0.063 0.171 0.025 0.179 0.116 0.109 0.163 0.073 0.087 0.073 0.009 0.150 0.017 0.162 0.055 0.036 0.158 0.117 0.081 0.130 0.151 0.080 0.186 0.012 0.135 0.074 0.148 0.002 
0.045 0.117 0.019 0.108 0.159 0.114 0.087 0.076 0.093 0.120 0.019 0.180 0.063 0.098 0.130 0.151 0.060 0.056 0.057 0.088 0.173 0.138 0.089 0.124 0.088 0.146 0.006 0.023 0.019 0.154 0.096 0.135 
0.142 0.185 0.113 0.171 0.099 0.071 0.117 0.193 0.061 0.137 0.044 0.125 0.106 0.174 0.146 0.166 0.101 0.004 0.054 0.074 0.142 0.143 0.068 0.030 0.159 0.145 0.124 0.049 0.169 0.090 0.184 0.111 
0.076 0.098 0.153 0.046 0