In [4]:
%%writefile matrix_mul.cu
#include <iostream>
#include <cstdlib>
using namespace std;

// CUDA kernel for matrix multiplication
__global__ void multiply(int* A, int* B, int* C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int sum = 0;
        for (int i = 0; i < N; i++) {
            sum += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = sum;
    }
}

// Initialize matrix with random values
void initialize(int* mat, int N) {
    for (int i = 0; i < N * N; i++) {
        mat[i] = rand() % 10;
    }
}

// Print matrix
void print(int* mat, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            cout << mat[row * N + col] << " ";
        }
        cout << '\n';
    }
    cout << '\n';
}

int main() {
    int N = 2; // Change size as needed
    int SIZE = N * N;
    size_t BYTES = SIZE * sizeof(int);

    // Host matrices
    int* A = new int[SIZE];
    int* B = new int[SIZE];
    int* C = new int[SIZE];

    initialize(A, N);
    initialize(B, N);

    cout << "Matrix A:\n";
    print(A, N);
    cout << "Matrix B:\n";
    print(B, N);

    // Device matrices
    int *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, BYTES);
    cudaMalloc(&d_B, BYTES);
    cudaMalloc(&d_C, BYTES);

    // Copy to device
    cudaMemcpy(d_A, A, BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, BYTES, cudaMemcpyHostToDevice);

    // Setup kernel dimensions
    dim3 threads(2, 2);
    dim3 blocks(N / threads.x, N / threads.y);

    // Launch kernel
    multiply<<<blocks, threads>>>(d_A, d_B, d_C, N);

    // Wait for GPU to finish
    cudaDeviceSynchronize();

    // Copy result back to host
    cudaMemcpy(C, d_C, BYTES, cudaMemcpyDeviceToHost);

    cout << "Result of Matrix Multiplication:\n";
    print(C, N);

    // Cleanup
    delete[] A;
    delete[] B;
    delete[] C;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

Writing matrix_mul.cu


In [5]:
!nvcc matrix_mul.cu -o matrix_mul

In [6]:
!./matrix_mul

Matrix A:
3 6 
7 5 

Matrix B:
3 5 
6 2 

Result of Matrix Multiplication:
0 0 
0 0 

