In [51]:
#!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
!pip install nvcc4jupyter
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [52]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [56]:
%%cuda

#include <cstdlib>
#include <iostream>
#include <vector>
using namespace std;

const int N = 4; // Size of matrices

// Matrix-multiplication kernel: each thread produces at most one element of
// C = A * B, where A, B and C are N x N int matrices held in flat arrays and
// indexed as [row * N + col] (N is the file-level constant).
//
// The (row, col) a thread handles comes from its 2D block/thread indices
// (y -> row, x -> col). Threads that land outside the N x N range exit
// without touching memory, so the launch grid may over-cover the matrix.
__global__ void matrixMul(int *d_a, int *d_b, int *d_c) {
  const int c = threadIdx.x + blockDim.x * blockIdx.x;
  const int r = threadIdx.y + blockDim.y * blockIdx.y;

  // Guard clause: nothing to do for threads past the matrix edge.
  if (r >= N || c >= N) {
    return;
  }

  // Start of row r in A; the dot product walks this row and column c of B.
  const int *aRow = d_a + r * N;

  int acc = 0;
  for (int k = 0; k != N; ++k) {
    acc += aRow[k] * d_b[k * N + c];
  }

  d_c[r * N + c] = acc;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status)
         << endl;
    std::exit(EXIT_FAILURE);
  }
}

// Prints `title` followed by an N x N matrix stored as a flat array indexed
// [row * N + col], one row per output line.
static void printMatrix(const char *title, const int *m) {
  cout << title;
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      cout << m[i * N + j] << " ";
    }
    cout << endl;
  }
}

// Fills two N x N matrices with pseudo-random values in [2, 11], multiplies
// them on the GPU with matrixMul, reports the kernel time measured with CUDA
// events, and prints A, B and the product C.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main() {
  const size_t bytes = N * N * sizeof(int);

  // Host matrices; std::vector releases the memory automatically.
  std::vector<int> h_a(N * N), h_b(N * N), h_c(N * N);

  // Initialize host arrays (replace with your initialization logic)
  for (int i = 0; i < N * N; i++) {
    h_a[i] = (rand() % 10) + 2;
    h_b[i] = (rand() % 10) + 2;
  }

  // Device matrices
  int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
  checkCuda(cudaMalloc((void **)&d_a, bytes), "cudaMalloc d_a");
  checkCuda(cudaMalloc((void **)&d_b, bytes), "cudaMalloc d_b");
  checkCuda(cudaMalloc((void **)&d_c, bytes), "cudaMalloc d_c");

  // Copy inputs from host to device
  checkCuda(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy h_a -> d_a");
  checkCuda(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy h_b -> d_b");

  // Threads per block dimension, and enough blocks per grid dimension to
  // cover N (ceil-division); the kernel bounds-checks the overhang.
  const int THREADS = 16;
  const int BLOCKS = (N + THREADS - 1) / THREADS;

  dim3 threadsPerBlock(THREADS, THREADS);
  dim3 numBlocks(BLOCKS, BLOCKS);

  // Create CUDA events for timing
  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

  checkCuda(cudaEventRecord(start, 0), "cudaEventRecord start");

  matrixMul<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c);
  // A kernel launch returns nothing; launch-configuration errors are only
  // visible through cudaGetLastError().
  checkCuda(cudaGetLastError(), "matrixMul launch");

  checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord stop");

  // Blocks until `stop` has been reached, i.e. the kernel has finished, so no
  // additional cudaDeviceSynchronize() is needed before reading the result.
  checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");

  float elapsedTime = 0.0f;
  checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop),
            "cudaEventElapsedTime");
  cout << "Elapsed Time: " << elapsedTime << " ms\n";

  // Events are runtime resources and must be released explicitly.
  checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

  // Copy the product from device to host
  checkCuda(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy d_c -> h_c");

  printMatrix("Matrix A:\n", h_a.data());
  printMatrix("Matrix B:\n", h_b.data());
  printMatrix("Matrix C (Result of A * B):\n", h_c.data());

  // Free device memory
  checkCuda(cudaFree(d_a), "cudaFree d_a");
  checkCuda(cudaFree(d_b), "cudaFree d_b");
  checkCuda(cudaFree(d_c), "cudaFree d_c");

  return 0;
}


Elapsed Time: 0.584544 ms
Matrix A:
5 9 5 8 11 4 2 5 2 4 3 9 4 4 9 11 4 11 5 3 11 3 6 10 7 5 3 8 4 8 7 6 8 8 5 6 4 6 6 5 9 8 10 5 6 4 8 11 8 6 7 6 9 9 9 4 3 8 7 6 2 3 9 3 11 9 9 8 8 11 10 4 5 2 10 2 8 10 8 3 11 6 3 5 6 6 9 5 9 11 4 9 7 6 10 11 7 10 5 10 8 5 5 8 6 10 11 9 6 2 2 4 6 7 6 11 4 9 7 10 4 11 8 2 3 7 3 10 2 6 4 10 4 6 4 2 4 11 10 5 3 5 2 11 11 11 5 2 8 6 2 8 8 7 11 9 10 11 8 4 8 5 3 11 3 11 2 7 9 6 2 4 8 2 4 4 7 4 2 10 10 6 11 11 4 6 11 5 2 2 11 5 3 6 3 8 6 4 6 4 10 4 10 8 5 5 5 2 9 10 2 10 11 5 5 5 8 4 7 9 8 6 2 10 2 8 6 11 11 10 2 9 11 7 11 7 6 11 7 5 9 10 11 9 4 5 11 4 3 8 3 2 5 3 2 8 9 2 6 6 7 4 2 8 8 10 8 9 3 3 9 4 6 4 4 2 11 7 2 9 10 2 8 8 11 7 9 7 5 5 11 9 9 3 2 10 7 6 9 5 2 9 11 4 5 3 4 4 9 3 6 9 3 9 6 10 3 8 3 8 10 10 2 4 9 8 8 9 9 11 9 8 10 5 6 7 3 7 11 5 7 4 9 5 8 8 5 6 11 4 10 2 6 8 9 5 5 7 2 9 5 2 2 3 5 11 6 7 10 7 7 11 9 5 8 7 8 2 3 4 11 2 4 6 5 10 5 2 5 11 9 4 4 6 10 2 11 4 3 5 
4 6 3 7 3 11 3 5 9 10 9 6 6 3 10 4 11 8 8 11 2 11 3 10 8 9 9 4 3 2 2 2 5 6 3 2 4 9 8