In [None]:
!nvidia-smi

Thu Aug 11 02:05:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-g0wj1dg1
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-g0wj1dg1


In [None]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
%%cu
#include <iostream>

// Smoke test for the %%cu cell magic: a host-only program that prints a
// greeting, confirming that nvcc can compile and run a cell.
int main()
{
    const char *greeting = "Welcome To GeeksforGeeks\n";
    std::cout << greeting;
    return 0;
}

Welcome To GeeksforGeeks



In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define BLOCK_SIZE 16

/*************************************************
Function name: gpu_matrix_mult

Parameters:
            &a GPU device pointer to a m X n matrix (A)
            &b GPU device pointer to a n X k matrix (B)
            &c GPU device output pointer to a m X k matrix (C)

Note:
      grid and block should be configured as:
            dim3 dimGrid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, (m + BLOCK_SIZE - 1) / BLOCK_SIZE);
            dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
*************************************************/

__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k)
{
    /*
    Part 1. GPU kernel for executing matrix multiplication.
    Each thread produces one element of C (m x k) as the dot product of one
    row of A (m x n) with one column of B (n x k); all matrices are row-major.
    */

    // Position of the output element owned by this thread
    const int out_row = blockIdx.y * blockDim.y + threadIdx.y;
    const int out_col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C (edge blocks) have nothing to compute
    if (out_row >= m || out_col >= k)
    {
        return;
    }

    // Walk along the row of A and down the column of B
    const int *a_row = a + out_row * n;
    int acc = 0;
    for (int step = 0; step < n; ++step)
    {
        acc += a_row[step] * b[step * k + out_col];
    }

    c[out_row * k + out_col] = acc;
}

/*************************************************
Function name: gpu_tiled_matrix_mult

Parameters:
            &a GPU device pointer to a n X n matrix (A)
            &b GPU device pointer to a n X n matrix (B)
            &c GPU device output pointer to a n X n matrix (C)

Note:
      grid and block should be configured as:
            dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
            dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
*************************************************/

__global__ void gpu_tiled_matrix_mult(int *d_a, int *d_b, int *d_result, int n)
{
    // Part 2. GPU kernel for executing tiled matrix multiplication.
    //
    // Computes d_result = d_a * d_b for row-major n x n matrices. Each block
    // produces one square tile of the result; the tile edge equals the block
    // edge. The block walks along the shared dimension one tile at a time,
    // staging a tile of A and a tile of B in shared memory so that every
    // global element is read once per block instead of once per thread.
    //
    // Launch requirements:
    //   - 2D square thread block (blockDim.x == blockDim.y), edge <= 32
    //   - grid of ceil(n / blockDim.x) blocks in both x and y
    // n does not have to be a multiple of the block edge.

    // Shared tiles are sized for the largest square block (32 x 32 = 1024
    // threads); smaller blocks use only the top-left tile x tile corner.
    const int MAX_TILE = 32;
    __shared__ int aTile[MAX_TILE][MAX_TILE];
    __shared__ int bTile[MAX_TILE][MAX_TILE];

    const int tile = blockDim.x;
    assert(blockDim.x == blockDim.y && tile <= MAX_TILE);

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * tile + ty;
    const int col = blockIdx.x * tile + tx;

    int tile_element_sum = 0;
    const int num_tiles = (n + tile - 1) / tile;

    for (int t = 0; t < num_tiles; ++t) {
        // Each thread stages one element of A and one element of B.
        // Positions outside the matrix are zero-filled so that they add
        // nothing to the dot product.
        const int a_col = t * tile + tx;
        const int b_row = t * tile + ty;
        aTile[ty][tx] = (row < n && a_col < n) ? d_a[row * n + a_col] : 0;
        bTile[ty][tx] = (b_row < n && col < n) ? d_b[b_row * n + col] : 0;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        for (int i = 0; i < tile; ++i) {
            tile_element_sum += aTile[ty][i] * bTile[i][tx];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    // The guard sits after the loop so that every thread of the block reaches
    // each __syncthreads() above, including threads outside the matrix.
    if (row < n && col < n) {
        d_result[row * n + col] = tile_element_sum;
    }

    //Part 2 ends here
}

/*************************************************
Function name: cpu_matrix_mult

Parameters:
            &a CPU host pointer to a m X n matrix (A)
            &b CPU host pointer to a n X k matrix (B)
            &c CPU host output pointer to a m X k matrix (C)
*************************************************/

__host__ void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    // Reference implementation: h_result (m x k) = h_a (m x n) * h_b (n x k),
    // all row-major. Output elements are produced in row-major order, so a
    // single running pointer is enough to store them.
    int *out = h_result;
    for (int r = 0; r < m; ++r)
    {
        const int *lhs_row = h_a + r * n;
        for (int q = 0; q < k; ++q)
        {
            // Dot product of row r of A with column q of B
            int dot = 0;
            for (int step = 0; step < n; ++step)
            {
                dot += lhs_row[step] * h_b[step * k + q];
            }
            *out++ = dot;
        }
    }
}

/*************************************************
Function name: main

Test and Compare
*************************************************/

// Evaluates a CUDA runtime call; on failure prints the error string together
// with the source location and terminates the program.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Benchmark driver: multiplies two random integer matrices on the GPU and on
// the CPU, reports both timings, and checks that the results agree.
// Returns 0 on completion; exits with EXIT_FAILURE if any CUDA call fails.
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    srand(time(0));

    //Set the size of matrices: A is m x n, B is n x k, C is m x k
    const int m = 256;
    const int n = 256;
    const int k = 256;

    const size_t bytes_a = sizeof(int) * m * n;
    const size_t bytes_b = sizeof(int) * n * k;
    const size_t bytes_c = sizeof(int) * m * k;

    //Part 3-1. Allocate memory in host DRAM, h_cc is used to store the CPU result

    int *h_a, *h_b, *h_c, *h_cc;
    CUDA_CHECK(cudaMallocHost((void **) &h_a, bytes_a));
    CUDA_CHECK(cudaMallocHost((void **) &h_b, bytes_b));
    CUDA_CHECK(cudaMallocHost((void **) &h_c, bytes_c));
    CUDA_CHECK(cudaMallocHost((void **) &h_cc, bytes_c));

    //Part 3-1 ends here

    // Random initialize matrix A
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }

    // Random initialize matrix B
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    float gpu_elapsed_time_ms, cpu_elapsed_time_ms;

    // Events to measure the execution time
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Start to measure the execution time of GPU version.
    // Note: the timed region covers device allocation, both host-to-device
    // copies, the kernel itself and the copy of the result back to the host.
    CUDA_CHECK(cudaEventRecord(start, 0));

    //Part 3-2. Allocate memory space on the device (GPU) & Copy matrix A and B from host to device memory

    int *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void **) &d_a, bytes_a));
    CUDA_CHECK(cudaMalloc((void **) &d_b, bytes_b));
    CUDA_CHECK(cudaMalloc((void **) &d_c, bytes_c));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice));

    //Part 3-2 ends here

    // One thread per element of C, rounded up to whole blocks
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    //Part 3-3. Launch GPU kernel & Transfer results from the device to host
    //Note:   For the tiled multiplication with square matrices (i.e., m = n = k), launch gpu_tiled_matrix_mult. Otherwise, launch regular matrix multiplication kernel

    //Launch the normal matrix multplication kernel
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // A kernel launch returns no status; a bad launch configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    //Launch the square matrix multiplication kernel
    //gpu_tiled_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m);

    // Transfer the results from device to host
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost));

    //Part 3-3 ends here

    CUDA_CHECK(cudaDeviceSynchronize());

    // Time counting terminate
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // GPU computing time elapse
    CUDA_CHECK(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop));
    printf("\n\nGPU execution time on matrix multiplication of %dx%d . %dx%d: %f ms.\n\n", m, n, n, k, gpu_elapsed_time_ms);

    // CPU version, timed with the same pair of events
    CUDA_CHECK(cudaEventRecord(start, 0));

    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop));
    printf("CPU execution time on matrix multiplication of %dx%d . %dx%d: %f ms.\n\n", m, n, n, k, cpu_elapsed_time_ms);

    // Validate the results computed by GPU
    int all_ok = 1;
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            // Uncomment below to see the actual results on both CPU and GPU
            // printf("CPU result [%d][%d]:%d == GPU result [%d][%d]:%d, ", i, j, h_cc[i * k + j], i, j, h_c[i * k + j]);
            if (h_cc[i * k + j] != h_c[i * k + j]) {
                all_ok = 0;
            }
        }
        // printf("\n");
    }

    // Compute the speedup
    if (all_ok) {
        printf("All results are correct !!!, speedup = %f\n", cpu_elapsed_time_ms / gpu_elapsed_time_ms);
    }
    else {
        printf("Incorrect results\n");
    }

    //Part 3-4. Free the device and host memory
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    CUDA_CHECK(cudaFreeHost(h_c));
    CUDA_CHECK(cudaFreeHost(h_cc));

    //Part 3-4 ends here

    return 0;
}



GPU execution time on matrix multiplication of 256x256 . 256x256: 0.545952 ms.

CPU execution time on matrix multiplication of 256x256 . 256x256: 88.256035 ms.

All results are correct !!!, speedup = 161.655289

