<a href="https://colab.research.google.com/github/jackiemacguire/learning-archive/blob/main/Matrix_Multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Step 1: Check that a GPU is attached and that the CUDA compiler (nvcc) is installed
!nvidia-smi
!nvcc --version

# Step 2: The next cell uses %%writefile to save the CUDA C++ source as matrixMul.cu


Sat Oct 11 20:01:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [28]:
%%writefile matrixMul.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cstdlib>
#include <ctime>

// Step 3: Define GPU Kernel
/*
 * Computes c = a * b for row-major matrices stored as flat 1D arrays.
 *
 *   a : m x p input matrix  (read-only)
 *   b : p x n input matrix  (read-only)
 *   c : m x n output matrix (must not overlap a or b)
 *
 * Each thread produces one element of c. The kernel expects a 2D launch in
 * which the x dimension covers the n columns of c and the y dimension covers
 * its m rows; threads that fall outside the matrix do nothing.
 */
__global__ void matrixMultiplicationKernel(const float * __restrict__ a,
                                           const float * __restrict__ b,
                                           float * __restrict__ c,
                                           int m, int n, int p){

  /* to compute a unique (row, column) for every thread across the whole grid
     you multiply the block index by the size of the block and add the thread offset */

  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int column = blockIdx.x * blockDim.x + threadIdx.x;

  // bounds guard: the grid is rounded up to whole blocks, so some threads
  // land outside the m x n output and must not touch memory
  if(row < m && column < n){
    // flat offsets are computed in size_t so that row * p and i * n cannot
    // overflow a 32-bit int when the matrices are large
    const size_t aRowStart = static_cast<size_t>(row) * p;
    const size_t bStride = static_cast<size_t>(n);

    float sum_of_c = 0.0f;
    for(int i = 0; i < p; i++){
      // a(row, i) * b(i, column), both indexed in row-major order
      sum_of_c += a[aRowStart + i] * b[i * bStride + column];
    }
    c[static_cast<size_t>(row) * n + column] = sum_of_c;
  }
}


// Step 4: cpu code
// Reports a failed CUDA runtime call on stderr and terminates the program.
// 'what' names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char *what){
  if(status != cudaSuccess){
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

/*
 * Host driver: fills two small matrices with random digits, multiplies them
 * on the GPU with matrixMultiplicationKernel, and prints A, B and C = A * B.
 * Returns 0 on success; exits with a failure status if a host allocation or
 * any CUDA call fails.
 */
int main(){

  // defining matrix dimensions (predetermining sizes)
  int m = 4; // row 'a' and 'c' size
  int p = 3; // column size of 'a' and row size of 'b'
  int n = 4; // column 'b' and 'c' size

  // byte sizes are computed once and reused for every allocation and copy
  const size_t bytesA = static_cast<size_t>(m) * p * sizeof(float);
  const size_t bytesB = static_cast<size_t>(p) * n * sizeof(float);
  const size_t bytesC = static_cast<size_t>(m) * n * sizeof(float);

  // allocating cpu memory using malloc()
  float *h_a = (float*)malloc(bytesA);
  float *h_b = (float*)malloc(bytesB);
  float *h_c = (float*)malloc(bytesC);
  if(h_a == nullptr || h_b == nullptr || h_c == nullptr){
    std::cerr << "Host memory allocation failed" << std::endl;
    free(h_a);  // free(nullptr) is a no-op, so this is safe for any subset
    free(h_b);
    free(h_c);
    return EXIT_FAILURE;
  }

  // initialize matrices with data that is randomized
  srand(static_cast<unsigned int>(time(0)));
  for(int j=0; j < m * p; j++)
    h_a[j]= rand() % 10; // 0-9
  for(int j=0; j < p * n; j++)
    h_b[j]=rand() % 10;


  // Print A =
  std::cout << "Matrix A (" << m << "x" << p << "):" << std::endl;
  for (int i = 0; i < m; i++) {
      for (int j = 0; j < p; j++) {
          std::cout << h_a[i * p + j] << "\t";
      }
      std::cout << std::endl;
  }


  // Print B =
  std::cout << "\nMatrix B (" << p << "x" << n << "):" << std::endl;
  for (int i = 0; i < p; i++) {
      for (int j = 0; j < n; j++) {
          std::cout << h_b[i * n + j] << "\t";
      }
      std::cout << std::endl;
  }


  // allocating gpu memory
  float *d_a, *d_b, *d_c;
  checkCuda(cudaMalloc(&d_a, bytesA), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc(&d_b, bytesB), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc(&d_c, bytesC), "cudaMalloc(d_c)");

  // copy data to gpu
  checkCuda(cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyHostToDevice), "copy of A to device");
  checkCuda(cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyHostToDevice), "copy of B to device");

  // threads and block
  // dim3 = data type in cuda (x,y,z). Each block is tile x tile threads
  // (16 x 16 = 256). The grid size is a ceiling division so that partial
  // tiles at the right/bottom edges are still covered; x spans the n columns
  // and y spans the m rows.
  const int tile = 16;
  dim3 threadsPerBlock(tile, tile);
  dim3 blocksPerGrid((n + tile - 1) / tile, (m + tile - 1) / tile);

  // launch kernel
  matrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, m,n,p);
  // a launch does not return an error code: a bad configuration shows up in
  // cudaGetLastError(), a fault during execution at the next synchronization
  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "kernel execution");

  // copy back to cpu
  checkCuda(cudaMemcpy(h_c, d_c, bytesC, cudaMemcpyDeviceToHost), "copy of C to host");

  // print c (the header reports the real dimensions instead of a fixed "4x4")
  std::cout << "\nC = A * B (" << m << "x" << n << "): " << std::endl;
  for (int i=0; i<m; i++){
    for(int j=0; j<n; j++){
      std::cout<<h_c[i*n+j]<<"\t";
    }
    std::cout<<std::endl;
  }

  // free cpu and gpu memory
  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_c), "cudaFree(d_c)");

  free(h_a);
  free(h_b);
  free(h_c);

  return 0;

}


Overwriting matrixMul.cu


In [29]:
# Step 5: compile the CUDA source with nvcc, targeting compute capability 7.5 (sm_75, the Tesla T4 reported by nvidia-smi)
!nvcc -gencode=arch=compute_75,code=sm_75 matrixMul.cu -o matrixMul

# Step 6: run the compiled program (prints A, B and the product C)
!./matrixMul


Matrix A (4x3):
4	4	7	
5	8	7	
4	9	9	
5	4	7	

Matrix B (3x4):
3	3	8	1	
1	5	2	4	
1	5	0	5	

C = A * B (4x4): 
23	67	40	55	
30	90	56	72	
30	102	50	85	
26	70	48	56	
