In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


## Install the nvcc plugin for Python notebooks

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-z92012nw
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-z92012nw
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=70080c73d0676f6530fb3fc56908c34e627a63057d8e8ed1a3dba8e2c0a0fcee
  Stored in directory: /tmp/pip-ephem-wheel-cache-qq6lml76/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


## Load the plugin extension

In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


# Parallel Matrix multiplication version 1

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

#define MATRIX_SIZE 8192
#define CPU_MATRIX_SIZE 1024


/*
 * Naive matrix product c = a * b for square n x n matrices stored row-major
 * in flat int arrays.
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension selects the output
 * column and the y dimension selects the output row. Each in-range thread
 * computes exactly one element of c; threads that fall outside the n x n
 * output do nothing. No shared memory is used.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int n)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of the partial blocks on the grid border have nothing to compute.
    if (row >= n || col >= n)
    {
        return;
    }

    const int row_base = row * n;
    int acc = 0;
    for (int k = 0; k < n; ++k)
    {
        // Dot product of row `row` of a with column `col` of b.
        acc += a[row_base + k] * b[k * n + col];
    }
    c[row_base + col] = acc;
}

/*
 * Sequential reference implementation of c = a * b for square n x n
 * matrices stored row-major in flat int arrays.
 *
 * a, b : input matrices (n * n elements each, only read)
 * c    : output matrix (n * n elements, every element is overwritten)
 * n    : matrix dimension
 */
void cpu_matrix_mult (int *a, int *b, int *c, int n)
{
    for (int row = 0; row < n; ++row)
    {
        int *a_row = a + row * n;
        int *c_row = c + row * n;
        for (int col = 0; col < n; ++col)
        {
            // Dot product of the current row of a with column `col` of b.
            int acc = 0;
            for (int k = 0; k < n; ++k)
            {
                acc += a_row[k] * b[k * n + col];
            }
            c_row[col] = acc;
        }
    }
}

/* Terminates the program with a diagnostic if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a summary of the properties of every CUDA device the runtime reports. */
static void print_device_info(void)
{
    int nDevices = 0;
    check_cuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        check_cuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // The shared-memory sizes are size_t fields, so they need %zu rather than %d.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
               prop.memoryBusWidth);
        // 2 transfers per clock * clock (kHz) * bus width in bytes, scaled to GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

/* Times cpu_matrix_mult on a CPU_MATRIX_SIZE x CPU_MATRIX_SIZE problem (A = 2, B = 3). */
static void run_cpu_baseline(void)
{
    const size_t count = (size_t)CPU_MATRIX_SIZE * CPU_MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = (int*)malloc(bytes);
    int *b = (int*)malloc(bytes);
    int *c = (int*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(c);
        exit(EXIT_FAILURE);
    }

    // Every element of A is 2 and every element of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // clock() measures processor time consumed by this process.
    clock_t begin = clock();
    cpu_matrix_mult(a, b, c, CPU_MATRIX_SIZE);
    clock_t end = clock();
    double time_spent = ((double)((end - begin)) * 1000) / CLOCKS_PER_SEC;
    printf("Time elapsed on naive CPU sequential matrix multiplication of %dx%d . %dx%d: %f ms\n\n", CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, time_spent);

    free(a);
    free(b);
    free(c);
}

/*
 * Times one launch of gpu_matrix_mult on a MATRIX_SIZE x MATRIX_SIZE problem
 * using block_size x block_size thread blocks, and prints the elapsed time.
 */
static void run_gpu_benchmark(int block_size)
{
    const size_t count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = NULL, *b = NULL, *c = NULL;
    check_cuda(cudaMallocManaged((void **) &a, bytes), "cudaMallocManaged(a)");
    check_cuda(cudaMallocManaged((void **) &b, bytes), "cudaMallocManaged(b)");
    check_cuda(cudaMallocManaged((void **) &c, bytes), "cudaMallocManaged(c)");

    // Fill the operands from the host: every element of A is 2, of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // Events used to measure the elapsed time on the device.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Ceil-divide so the grid still covers the matrix when block_size does not divide it.
    unsigned int grid_dim = (MATRIX_SIZE + block_size - 1) / block_size;
    dim3 dimGrid(grid_dim, grid_dim);
    dim3 dimBlock(block_size, block_size);

    // NOTE(review): a and b are managed allocations last written by the host,
    // so the measured interval may include moving their pages to the device in
    // addition to the kernel itself -- confirm whether that is intended.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
    // A launch does not return a status; launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "gpu_matrix_mult launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, elapsed_ms);

    // Release the events and the managed buffers.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(a), "cudaFree(a)");
    check_cuda(cudaFree(b), "cudaFree(b)");
    check_cuda(cudaFree(c), "cudaFree(c)");
}

/*
 * Prints the device properties, times the sequential CPU baseline, then
 * times the GPU kernel with square thread blocks of side 4, 8, 16 and 32.
 * Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    print_device_info();
    run_cpu_baseline();

    for (int block_size = 4; block_size <= 32; block_size *= 2)
    {
        run_gpu_benchmark(block_size);
    }

    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Time elapsed on naive CPU sequential matrix multiplication of 1024x1024 . 1024x1024: 7929.036000 ms

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (4): 7408.363281 ms.

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (8): 6688.598145 ms.

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (16): 2410.977295 ms.

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (32): 1838.651367 ms.




# Parallel Matrix multiplication version 2 (TILE_WIDTH 32)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#define MATRIX_SIZE 8192
#define TILE_WIDTH 32

/*
 * Tiled matrix product c = a * b for square n x n matrices stored row-major
 * in flat int arrays.
 *
 * Launch layout: 2D grid of 2D blocks covering the n x n output; x selects
 * the column and y selects the row. The tile indexing below assumes
 * blockDim.x == blockDim.y == TILE_WIDTH.
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles per block.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int n)
{
    __shared__ int ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ int ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int Row = blockIdx.y * blockDim.y + ty;
    const int Col = blockIdx.x * blockDim.x + tx;
    const int numTiles = (n - 1) / TILE_WIDTH + 1;
    int Pvalue = 0;

    // Loop over the tiles of a and b required to compute this element of c.
    for (int p = 0; p < numTiles; ++p) {
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;

        // Each thread loads one element of each tile; positions outside the
        // matrices are padded with integer zero (the tiles hold ints).
        ds_M[ty][tx] = (Row < n && aCol < n) ? a[Row * n + aCol] : 0;
        ds_N[ty][tx] = (bRow < n && Col < n) ? b[bRow * n + Col] : 0;
        __syncthreads();  // both tiles must be complete before anyone reads them

        // No bounds check is needed here: for an out-of-range Row the whole
        // ds_M row is zero and for an out-of-range Col the whole ds_N column
        // is zero, and such threads never store their result anyway.
        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_M[ty][i] * ds_N[i][tx];
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }

    if (Row < n && Col < n)
        c[Row * n + Col] = Pvalue;
}


/* Terminates the program with a diagnostic if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a summary of the properties of every CUDA device the runtime reports. */
static void print_device_info(void)
{
    int nDevices = 0;
    check_cuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        check_cuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // The shared-memory sizes are size_t fields, so they need %zu rather than %d.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
               prop.memoryBusWidth);
        // 2 transfers per clock * clock (kHz) * bus width in bytes, scaled to GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

/*
 * Times one launch of gpu_matrix_mult on a MATRIX_SIZE x MATRIX_SIZE problem
 * using block_size x block_size thread blocks, and prints the elapsed time.
 */
static void run_gpu_benchmark(int block_size)
{
    const size_t count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = NULL, *b = NULL, *c = NULL;
    check_cuda(cudaMallocManaged((void **) &a, bytes), "cudaMallocManaged(a)");
    check_cuda(cudaMallocManaged((void **) &b, bytes), "cudaMallocManaged(b)");
    check_cuda(cudaMallocManaged((void **) &c, bytes), "cudaMallocManaged(c)");

    // Fill the operands from the host: every element of A is 2, of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // Events used to measure the elapsed time on the device.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Ceil-divide so the grid still covers the matrix when block_size does not divide it.
    unsigned int grid_dim = (MATRIX_SIZE + block_size - 1) / block_size;
    dim3 dimGrid(grid_dim, grid_dim);
    dim3 dimBlock(block_size, block_size);

    // NOTE(review): a and b are managed allocations last written by the host,
    // so the measured interval may include moving their pages to the device in
    // addition to the kernel itself -- confirm whether that is intended.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
    // A launch does not return a status; launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "gpu_matrix_mult launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, elapsed_ms);

    // Release the events and the managed buffers.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(a), "cudaFree(a)");
    check_cuda(cudaFree(b), "cudaFree(b)");
    check_cuda(cudaFree(c), "cudaFree(c)");
}

/*
 * Prints the device properties, then times the tiled kernel once.
 * Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    print_device_info();

    // The kernel's shared tiles are TILE_WIDTH wide, so the block side is taken
    // from TILE_WIDTH instead of repeating the number as a separate literal.
    run_gpu_benchmark(TILE_WIDTH);

    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (32): 1412.809082 ms.




# Parallel Matrix multiplication version 2 (TILE_WIDTH 16)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#define MATRIX_SIZE 8192
#define TILE_WIDTH 16

/*
 * Tiled matrix product c = a * b for square n x n matrices stored row-major
 * in flat int arrays.
 *
 * Launch layout: 2D grid of 2D blocks covering the n x n output; x selects
 * the column and y selects the row. The tile indexing below assumes
 * blockDim.x == blockDim.y == TILE_WIDTH.
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles per block.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int n)
{
    __shared__ int ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ int ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int Row = blockIdx.y * blockDim.y + ty;
    const int Col = blockIdx.x * blockDim.x + tx;
    const int numTiles = (n - 1) / TILE_WIDTH + 1;
    int Pvalue = 0;

    // Loop over the tiles of a and b required to compute this element of c.
    for (int p = 0; p < numTiles; ++p) {
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;

        // Each thread loads one element of each tile; positions outside the
        // matrices are padded with integer zero (the tiles hold ints).
        ds_M[ty][tx] = (Row < n && aCol < n) ? a[Row * n + aCol] : 0;
        ds_N[ty][tx] = (bRow < n && Col < n) ? b[bRow * n + Col] : 0;
        __syncthreads();  // both tiles must be complete before anyone reads them

        // No bounds check is needed here: for an out-of-range Row the whole
        // ds_M row is zero and for an out-of-range Col the whole ds_N column
        // is zero, and such threads never store their result anyway.
        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_M[ty][i] * ds_N[i][tx];
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }

    if (Row < n && Col < n)
        c[Row * n + Col] = Pvalue;
}


/* Terminates the program with a diagnostic if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a summary of the properties of every CUDA device the runtime reports. */
static void print_device_info(void)
{
    int nDevices = 0;
    check_cuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        check_cuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // The shared-memory sizes are size_t fields, so they need %zu rather than %d.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
               prop.memoryBusWidth);
        // 2 transfers per clock * clock (kHz) * bus width in bytes, scaled to GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

/*
 * Times one launch of gpu_matrix_mult on a MATRIX_SIZE x MATRIX_SIZE problem
 * using block_size x block_size thread blocks, and prints the elapsed time.
 */
static void run_gpu_benchmark(int block_size)
{
    const size_t count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = NULL, *b = NULL, *c = NULL;
    check_cuda(cudaMallocManaged((void **) &a, bytes), "cudaMallocManaged(a)");
    check_cuda(cudaMallocManaged((void **) &b, bytes), "cudaMallocManaged(b)");
    check_cuda(cudaMallocManaged((void **) &c, bytes), "cudaMallocManaged(c)");

    // Fill the operands from the host: every element of A is 2, of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // Events used to measure the elapsed time on the device.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Ceil-divide so the grid still covers the matrix when block_size does not divide it.
    unsigned int grid_dim = (MATRIX_SIZE + block_size - 1) / block_size;
    dim3 dimGrid(grid_dim, grid_dim);
    dim3 dimBlock(block_size, block_size);

    // NOTE(review): a and b are managed allocations last written by the host,
    // so the measured interval may include moving their pages to the device in
    // addition to the kernel itself -- confirm whether that is intended.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
    // A launch does not return a status; launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "gpu_matrix_mult launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, elapsed_ms);

    // Release the events and the managed buffers.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(a), "cudaFree(a)");
    check_cuda(cudaFree(b), "cudaFree(b)");
    check_cuda(cudaFree(c), "cudaFree(c)");
}

/*
 * Prints the device properties, then times the tiled kernel once.
 * Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    print_device_info();

    // The kernel's shared tiles are TILE_WIDTH wide, so the block side is taken
    // from TILE_WIDTH instead of repeating the number as a separate literal.
    run_gpu_benchmark(TILE_WIDTH);

    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (16): 1712.394165 ms.




# Parallel Matrix multiplication version 2 (TILE_WIDTH 8)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#define MATRIX_SIZE 8192
#define TILE_WIDTH 8

/*
 * Tiled matrix product c = a * b for square n x n matrices stored row-major
 * in flat int arrays.
 *
 * Launch layout: 2D grid of 2D blocks covering the n x n output; x selects
 * the column and y selects the row. The tile indexing below assumes
 * blockDim.x == blockDim.y == TILE_WIDTH.
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles per block.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int n)
{
    __shared__ int ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ int ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int Row = blockIdx.y * blockDim.y + ty;
    const int Col = blockIdx.x * blockDim.x + tx;
    const int numTiles = (n - 1) / TILE_WIDTH + 1;
    int Pvalue = 0;

    // Loop over the tiles of a and b required to compute this element of c.
    for (int p = 0; p < numTiles; ++p) {
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;

        // Each thread loads one element of each tile; positions outside the
        // matrices are padded with integer zero (the tiles hold ints).
        ds_M[ty][tx] = (Row < n && aCol < n) ? a[Row * n + aCol] : 0;
        ds_N[ty][tx] = (bRow < n && Col < n) ? b[bRow * n + Col] : 0;
        __syncthreads();  // both tiles must be complete before anyone reads them

        // No bounds check is needed here: for an out-of-range Row the whole
        // ds_M row is zero and for an out-of-range Col the whole ds_N column
        // is zero, and such threads never store their result anyway.
        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_M[ty][i] * ds_N[i][tx];
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }

    if (Row < n && Col < n)
        c[Row * n + Col] = Pvalue;
}


/* Terminates the program with a diagnostic if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a summary of the properties of every CUDA device the runtime reports. */
static void print_device_info(void)
{
    int nDevices = 0;
    check_cuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        check_cuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // The shared-memory sizes are size_t fields, so they need %zu rather than %d.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
               prop.memoryBusWidth);
        // 2 transfers per clock * clock (kHz) * bus width in bytes, scaled to GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

/*
 * Times one launch of gpu_matrix_mult on a MATRIX_SIZE x MATRIX_SIZE problem
 * using block_size x block_size thread blocks, and prints the elapsed time.
 */
static void run_gpu_benchmark(int block_size)
{
    const size_t count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = NULL, *b = NULL, *c = NULL;
    check_cuda(cudaMallocManaged((void **) &a, bytes), "cudaMallocManaged(a)");
    check_cuda(cudaMallocManaged((void **) &b, bytes), "cudaMallocManaged(b)");
    check_cuda(cudaMallocManaged((void **) &c, bytes), "cudaMallocManaged(c)");

    // Fill the operands from the host: every element of A is 2, of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // Events used to measure the elapsed time on the device.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Ceil-divide so the grid still covers the matrix when block_size does not divide it.
    unsigned int grid_dim = (MATRIX_SIZE + block_size - 1) / block_size;
    dim3 dimGrid(grid_dim, grid_dim);
    dim3 dimBlock(block_size, block_size);

    // NOTE(review): a and b are managed allocations last written by the host,
    // so the measured interval may include moving their pages to the device in
    // addition to the kernel itself -- confirm whether that is intended.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
    // A launch does not return a status; launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "gpu_matrix_mult launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, elapsed_ms);

    // Release the events and the managed buffers.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(a), "cudaFree(a)");
    check_cuda(cudaFree(b), "cudaFree(b)");
    check_cuda(cudaFree(c), "cudaFree(c)");
}

/*
 * Prints the device properties, then times the tiled kernel once.
 * Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    print_device_info();

    // The kernel's shared tiles are TILE_WIDTH wide, so the block side is taken
    // from TILE_WIDTH instead of repeating the number as a separate literal.
    run_gpu_benchmark(TILE_WIDTH);

    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (8): 2694.651123 ms.




# Parallel Matrix multiplication version 2 (TILE_WIDTH 4)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#define MATRIX_SIZE 8192
#define TILE_WIDTH 4

/*
 * Tiled matrix product c = a * b for square n x n matrices stored row-major
 * in flat int arrays.
 *
 * Launch layout: 2D grid of 2D blocks covering the n x n output; x selects
 * the column and y selects the row. The tile indexing below assumes
 * blockDim.x == blockDim.y == TILE_WIDTH.
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles per block.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int n)
{
    __shared__ int ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ int ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int Row = blockIdx.y * blockDim.y + ty;
    const int Col = blockIdx.x * blockDim.x + tx;
    const int numTiles = (n - 1) / TILE_WIDTH + 1;
    int Pvalue = 0;

    // Loop over the tiles of a and b required to compute this element of c.
    for (int p = 0; p < numTiles; ++p) {
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;

        // Each thread loads one element of each tile; positions outside the
        // matrices are padded with integer zero (the tiles hold ints).
        ds_M[ty][tx] = (Row < n && aCol < n) ? a[Row * n + aCol] : 0;
        ds_N[ty][tx] = (bRow < n && Col < n) ? b[bRow * n + Col] : 0;
        __syncthreads();  // both tiles must be complete before anyone reads them

        // No bounds check is needed here: for an out-of-range Row the whole
        // ds_M row is zero and for an out-of-range Col the whole ds_N column
        // is zero, and such threads never store their result anyway.
        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_M[ty][i] * ds_N[i][tx];
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }

    if (Row < n && Col < n)
        c[Row * n + Col] = Pvalue;
}


/* Terminates the program with a diagnostic if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a summary of the properties of every CUDA device the runtime reports. */
static void print_device_info(void)
{
    int nDevices = 0;
    check_cuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        check_cuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // The shared-memory sizes are size_t fields, so they need %zu rather than %d.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
               prop.memoryBusWidth);
        // 2 transfers per clock * clock (kHz) * bus width in bytes, scaled to GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

/*
 * Times one launch of gpu_matrix_mult on a MATRIX_SIZE x MATRIX_SIZE problem
 * using block_size x block_size thread blocks, and prints the elapsed time.
 */
static void run_gpu_benchmark(int block_size)
{
    const size_t count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t bytes = sizeof(int) * count;
    int *a = NULL, *b = NULL, *c = NULL;
    check_cuda(cudaMallocManaged((void **) &a, bytes), "cudaMallocManaged(a)");
    check_cuda(cudaMallocManaged((void **) &b, bytes), "cudaMallocManaged(b)");
    check_cuda(cudaMallocManaged((void **) &c, bytes), "cudaMallocManaged(c)");

    // Fill the operands from the host: every element of A is 2, of B is 3.
    for (size_t idx = 0; idx < count; ++idx) {
        a[idx] = 2;
        b[idx] = 3;
    }

    // Events used to measure the elapsed time on the device.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Ceil-divide so the grid still covers the matrix when block_size does not divide it.
    unsigned int grid_dim = (MATRIX_SIZE + block_size - 1) / block_size;
    dim3 dimGrid(grid_dim, grid_dim);
    dim3 dimBlock(block_size, block_size);

    // NOTE(review): a and b are managed allocations last written by the host,
    // so the measured interval may include moving their pages to the device in
    // addition to the kernel itself -- confirm whether that is intended.
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
    // A launch does not return a status; launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "gpu_matrix_mult launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, elapsed_ms);

    // Release the events and the managed buffers.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(a), "cudaFree(a)");
    check_cuda(cudaFree(b), "cudaFree(b)");
    check_cuda(cudaFree(c), "cudaFree(c)");
}

/*
 * Prints the device properties, then times the tiled kernel once.
 * Command-line arguments are not used.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    print_device_info();

    // The kernel's shared tiles are TILE_WIDTH wide, so the block side is taken
    // from TILE_WIDTH instead of repeating the number as a separate literal.
    run_gpu_benchmark(TILE_WIDTH);

    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Time elapsed on naive GPU matrix multiplication of 8192x8192 . 8192x8192 (4): 8358.279297 ms.


