In [1]:
# Environment setup: refresh the apt package index, install the `mpich` and
# `build-essential` packages, then print the CUDA compiler (nvcc) version.
!sudo apt update
!sudo apt install -y mpich
!sudo apt install -y build-essential
!nvcc --version


Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.6 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,196 kB]
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,508 kB]
Hit:13 https://ppa.launchpadcontent.net/graphics-drivers/ppa

In [2]:
%%writefile serial.c
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 800   // Use large N for better comparison

/* Current wall-clock time in seconds (microsecond resolution), read via gettimeofday(). */
double wall_time() {
    struct timeval now;
    gettimeofday(&now, NULL);

    /* Whole seconds plus the sub-second part converted from microseconds. */
    double whole_seconds = now.tv_sec;
    double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}

/* Release a matrix stored as an array of row pointers. Accepts NULL and
 * partially allocated matrices (unallocated rows must be NULL). */
static void free_matrix(double **m, int n) {
    if (!m) return;
    for (int i = 0; i < n; i++)
        free(m[i]);
    free(m);
}

/*
 * Serial baseline: multiplies two N x N matrices of random single-digit values
 * with the classic i-j-k triple loop and prints the elapsed wall-clock time of
 * the multiplication only (allocation and initialisation are not timed).
 *
 * Returns 0 on success, 1 if any allocation fails.
 */
int main() {
    double start, end;

    /* calloc zero-fills the row-pointer tables, so free_matrix() is safe even
     * when a later row allocation fails. */
    double **A = (double**) calloc(N, sizeof(double*));
    double **B = (double**) calloc(N, sizeof(double*));
    double **C = (double**) calloc(N, sizeof(double*));
    int ok = (A && B && C);

    /* Rows are allocated interleaved (A, B, C per index), as before, so the
     * heap layout being benchmarked is unchanged. */
    for (int i = 0; ok && i < N; i++) {
        A[i] = (double*) malloc(N * sizeof(double));
        B[i] = (double*) malloc(N * sizeof(double));
        C[i] = (double*) malloc(N * sizeof(double));
        ok = (A[i] && B[i] && C[i]);
    }

    if (!ok) {
        fprintf(stderr, "ERROR: matrix allocation failed\n");
        free_matrix(A, N);
        free_matrix(B, N);
        free_matrix(C, N);
        return 1;
    }

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            A[i][j] = rand() % 10;
            B[i][j] = rand() % 10;
            C[i][j] = 0;
        }

    start = wall_time();

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++)
                C[i][j] += A[i][k] * B[k][j];

    end = wall_time();

    printf("Serial time: %.6f seconds\n", end - start);

    free_matrix(A, N);
    free_matrix(B, N);
    free_matrix(C, N);
    return 0;
}


Writing serial.c


In [3]:
!gcc serial.c -o serial -O2


In [4]:
!./serial


Serial time: 0.823569 seconds


In [5]:
%%writefile openmp_matmul.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 800

/* Release a matrix stored as an array of row pointers. Accepts NULL and
 * partially allocated matrices (unallocated rows must be NULL). */
static void free_matrix(double **m, int n) {
    if (!m) return;
    for (int i = 0; i < n; i++)
        free(m[i]);
    free(m);
}

/*
 * OpenMP version: multiplies two N x N matrices of random single-digit values.
 * The outer (row) loop is split across threads; each thread writes a disjoint
 * set of rows of C, so no synchronisation is needed inside the loop nest.
 * Prints omp_get_max_threads() and the elapsed time of the multiplication only.
 *
 * Returns 0 on success, 1 if any allocation fails.
 */
int main() {
    /* calloc zero-fills the row-pointer tables, so free_matrix() is safe even
     * when a later row allocation fails. */
    double **A = (double**) calloc(N, sizeof(double*));
    double **B = (double**) calloc(N, sizeof(double*));
    double **C = (double**) calloc(N, sizeof(double*));
    int ok = (A && B && C);

    for (int i = 0; ok && i < N; i++) {
        A[i] = (double*) malloc(N * sizeof(double));
        B[i] = (double*) malloc(N * sizeof(double));
        C[i] = (double*) malloc(N * sizeof(double));
        ok = (A[i] && B[i] && C[i]);
    }

    if (!ok) {
        fprintf(stderr, "ERROR: matrix allocation failed\n");
        free_matrix(A, N);
        free_matrix(B, N);
        free_matrix(C, N);
        return 1;
    }

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            A[i][j] = rand() % 10;
            B[i][j] = rand() % 10;
            C[i][j] = 0;
        }

    double start = omp_get_wtime();

    /* Loop indices are declared inside the loops, which makes them private to
     * each thread without an explicit private() clause. */
    #pragma omp parallel for shared(A, B, C)
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++)
                C[i][j] += A[i][k] * B[k][j];

    double end = omp_get_wtime();
    printf("Threads: %d | Time: %.6f seconds\n", omp_get_max_threads(), end - start);

    free_matrix(A, N);
    free_matrix(B, N);
    free_matrix(C, N);
    return 0;
}


Writing openmp_matmul.c


In [6]:
!gcc openmp_matmul.c -fopenmp -O2 -o openmp_matmul


In [7]:
!OMP_NUM_THREADS=1 ./openmp_matmul


Threads: 1 | Time: 1.080810 seconds


In [8]:
# Thread-scaling run: execute the OpenMP binary with OMP_NUM_THREADS set to
# 1, 2, 4 and 8; the program prints the thread count and the multiply time.
!OMP_NUM_THREADS=1 ./openmp_matmul
!OMP_NUM_THREADS=2 ./openmp_matmul
!OMP_NUM_THREADS=4 ./openmp_matmul
!OMP_NUM_THREADS=8 ./openmp_matmul


Threads: 1 | Time: 1.287342 seconds
Threads: 2 | Time: 1.952191 seconds
Threads: 4 | Time: 1.779185 seconds
Threads: 8 | Time: 1.686399 seconds


In [18]:
%%writefile mpi_matmul.c
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define N 800

int main(int argc, char* argv[]) {
    int rank, size;
    double start, end;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    double *A = NULL;
    double *B = (double*) malloc(N*N*sizeof(double));
    double *C_local = (double*) malloc(N*N/size * sizeof(double));
    double *C = NULL;

    if (rank == 0) {
        A = (double*) malloc(N*N*sizeof(double));
        C = (double*) malloc(N*N*sizeof(double));
        for (int i = 0; i < N*N; i++) {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }
    }

    MPI_Bcast(B, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    double* A_local = (double*) malloc(N*N/size * sizeof(double));
    MPI_Scatter(A, N*N/size, MPI_DOUBLE, A_local, N*N/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    start = MPI_Wtime();

    for (int i = 0; i < N/size; i++)
        for (int j = 0; j < N; j++) {
            C_local[i*N + j] = 0;
            for (int k = 0; k < N; k++)
                C_local[i*N + j] += A_local[i*N + k] * B[k*N + j];
        }

    end = MPI_Wtime();
    MPI_Gather(C_local, N*N/size, MPI_DOUBLE, C, N*N/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("Processes: %d | Time: %.6f sec\n", size, end - start);
    if (rank == 0 && C) free(C);

    MPI_Finalize();
    return 0;
}



Overwriting mpi_matmul.c


In [19]:
!mpicc mpi_matmul.c -o mpi_matmul -O2


In [20]:
# Process-scaling run: launch the MPI binary with 1, 2, 4 and 8 processes.
# NOTE(review): OMPI_ALLOW_RUN_AS_ROOT* and --oversubscribe look like Open MPI
# options, while the setup cell installs mpich -- confirm which mpirun is on PATH.
!OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun --oversubscribe -np 1 ./mpi_matmul
!OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun --oversubscribe -np 2 ./mpi_matmul
!OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun --oversubscribe -np 4 ./mpi_matmul
!OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 mpirun --oversubscribe -np 8 ./mpi_matmul






Processes: 1 | Time: 0.979619 sec
Processes: 2 | Time: 0.985073 sec
Processes: 4 | Time: 0.862657 sec
Processes: 8 | Time: 0.482635 sec


In [None]:
# Show the GPU / driver status, refresh the apt package index, and install the
# `cuda` apt package.
# NOTE(review): a later cell shows nvcc 12.5 already present under /usr/local --
# confirm this install step is actually needed.
!nvidia-smi
!apt-get update
!apt-get install -y cuda


Fri Dec  5 13:17:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [14]:

#  Check and Configure CUDA

print("=" * 60)
print("CUDA ENVIRONMENT SETUP")
print("=" * 60)

import os
import subprocess
import shutil

# Check GPU availability
print("\n1. Checking GPU availability:")
!nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv

# Get CUDA driver version from nvidia-smi
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
driver_output = result.stdout
print("\n2. CUDA Driver Version from nvidia-smi:")
for line in driver_output.split('\n'):
    if 'CUDA Version' in line:
        print(line)
        # Extract CUDA version
        cuda_driver_version = line.split('CUDA Version:')[1].strip().split()[0]
        print(f"   Detected CUDA Driver: {cuda_driver_version}")

# Check available CUDA installations
print("\n3. Available CUDA installations:")
!ls -d /usr/local/cuda* 2>/dev/null || echo "No CUDA installations found"

# Check current nvcc version
print("\n4. Current nvcc version:")
!nvcc --version 2>/dev/null || echo "nvcc not found in PATH"

# Use PyTorch to find compatible CUDA
print("\n5. PyTorch CUDA information:")
import torch
if torch.cuda.is_available():
    print(f"   ✓ CUDA is available")
    print(f"   PyTorch CUDA version: {torch.version.cuda}")
    print(f"   GPU Device: {torch.cuda.get_device_name(0)}")

    # Extract major.minor from driver version (e.g., 12.4)
    cuda_driver_version = "12.4"  # From nvidia-smi above
    driver_major_minor = '.'.join(cuda_driver_version.split('.')[:2])

    print(f"\n6. Selecting compatible CUDA toolkit:")
    print(f"   Driver supports CUDA: {cuda_driver_version}")

    # Try older CUDA versions (12.4 driver supports up to CUDA 12.4)
    possible_paths = [
        "/usr/local/cuda-12.4",
        "/usr/local/cuda-12.3",
        "/usr/local/cuda-12.2",
        "/usr/local/cuda-12.1",
        "/usr/local/cuda-12.0",
        "/usr/local/cuda-11.8",
        "/usr/local/cuda-11",
    ]

    cuda_path = None
    for path in possible_paths:
        if os.path.exists(path) and os.path.exists(f"{path}/bin/nvcc"):
            # Verify it's not a symlink to a newer version
            real_path = os.path.realpath(path)
            cuda_path = path
            print(f"   ✓ Found compatible CUDA at: {path}")
            if real_path != path:
                print(f"     (resolves to: {real_path})")
            break

    if cuda_path is None:
        print(f"   ✗ No compatible CUDA found!")
        print(f"   Installing CUDA 11.8 (compatible with driver 12.4)...")

        # Install CUDA 11.8 toolkit
        !wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
        !chmod +x cuda_11.8.0_520.61.05_linux.run
        !./cuda_11.8.0_520.61.05_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda-11.8 --no-opengl-libs

        cuda_path = "/usr/local/cuda-11.8"

        if os.path.exists(cuda_path):
            print(f"   ✓ CUDA 11.8 installed successfully at: {cuda_path}")
        else:
            print(f"   ✗ Installation failed, using system default")
            cuda_path = "/usr/local/cuda"

    # Update environment
    os.environ['CUDA_HOME'] = cuda_path
    os.environ['PATH'] = f"{cuda_path}/bin:" + os.environ.get('PATH', '')
    os.environ['LD_LIBRARY_PATH'] = f"{cuda_path}/lib64:" + os.environ.get('LD_LIBRARY_PATH', '')

    print(f"\n7. CUDA environment configured:")
    print(f"   CUDA_HOME: {cuda_path}")
    print(f"   Using nvcc from: {cuda_path}/bin/nvcc")

else:
    print("   ✗ CUDA is NOT available!")
    print("   Please check Runtime -> Change runtime type -> Select GPU")

print("\n8. Testing CUDA configuration:")
!{os.environ.get('CUDA_HOME', '/usr/local/cuda')}/bin/nvcc --version

print("\n" + "=" * 60)
print("SETUP COMPLETE - Proceed to next cell")
print("=" * 60)

CUDA ENVIRONMENT SETUP

1. Checking GPU availability:
name, driver_version, memory.total [MiB]
Tesla T4, 550.54.15, 15360 MiB

2. CUDA Driver Version from nvidia-smi:
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
   Detected CUDA Driver: 12.4

3. Available CUDA installations:
/usr/local/cuda  /usr/local/cuda-12  /usr/local/cuda-12.5

4. Current nvcc version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0

5. PyTorch CUDA information:
   ✓ CUDA is available
   PyTorch CUDA version: 12.6
   GPU Device: Tesla T4

6. Selecting compatible CUDA toolkit:
   Driver supports CUDA: 12.4
   ✗ No compatible CUDA found!
   Installing CUDA 11.8 (compatible with driver 12.4)...
^C
Signal caught, cleaning up
   ✗ Installation failed, using system default

7. CUDA environment configured:
   CUDA_HOM

In [None]:
%%writefile cuda_matmul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Naive dense matrix multiply, C = A * B, for row-major N x N double matrices.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension indexes columns and the
 * y dimension indexes rows; each thread computes one element of C. Threads
 * that fall outside the matrix (partial blocks at the edges) do nothing, so
 * the grid only has to cover N x N. No shared memory is used.
 *
 * A and B are only read and C is only written; the three buffers must not
 * overlap (they are declared __restrict__).
 */
__global__ void matmul_kernel(const double* __restrict__ A,
                              const double* __restrict__ B,
                              double* __restrict__ C,
                              int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        // Index in size_t: row * N + k exceeds the int range once N > 46340.
        const size_t n = (size_t)N;
        const size_t r = (size_t)row;
        const size_t c = (size_t)col;

        double sum = 0;
        for (size_t k = 0; k < n; k++)
            sum += A[r * n + k] * B[k * n + c];
        C[r * n + c] = sum;
    }
}

// Evaluate a CUDA runtime call inside main(); on failure print the error with
// the given description and return 1. Cleanup is handled by BenchResources.
#define CUDA_TRY(call, what)                                                  \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            printf("ERROR: %s: %s\n", (what), cudaGetErrorString(err_));      \
            return 1;                                                         \
        }                                                                     \
    } while (0)

namespace {

// Owns every host buffer, device buffer and event used by the benchmark so
// that any early return from main() releases them.
struct BenchResources {
    double *A = nullptr, *B = nullptr, *C = nullptr;     // host matrices
    double *dA = nullptr, *dB = nullptr, *dC = nullptr;  // device matrices
    cudaEvent_t start = nullptr, stop = nullptr;         // timing events

    ~BenchResources() {
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
        cudaFree(dA); cudaFree(dB); cudaFree(dC);  // cudaFree(nullptr) is a no-op
        free(A); free(B); free(C);
    }
};

}  // namespace

/*
 * GPU matrix-multiply benchmark.
 *
 * Usage: cuda_matmul [N] [block_x] [block_y]
 *   N        matrix dimension (default 1024)
 *   block_x  threads per block in x (default 16)
 *   block_y  threads per block in y (default 16)
 *
 * Fills two N x N matrices with random single-digit values, runs
 * matmul_kernel several times, reports total / average time and GFLOPS,
 * copies the result back and spot-checks it against the CPU.
 *
 * Returns 0 on success, 1 on invalid arguments, any CUDA error, allocation
 * failure, or a failed spot check.
 */
int main(int argc, char** argv) {
    int N = 1024;
    int block_x = 16, block_y = 16;

    if(argc >= 2) N = atoi(argv[1]);
    if(argc >= 3) block_x = atoi(argv[2]);
    if(argc >= 4) block_y = atoi(argv[3]);

    // atoi() yields 0 for non-numeric input; a zero block dimension would
    // divide by zero in the grid computation below.
    if (N <= 0 || block_x <= 0 || block_y <= 0) {
        printf("ERROR: N and block dimensions must be positive integers (got N=%d, block=%dx%d)\n",
               N, block_x, block_y);
        return 1;
    }

    BenchResources res;

    // Initialize CUDA context first
    CUDA_TRY(cudaFree(0), "CUDA initialization failed");

    printf("Matrix size: %d x %d\n", N, N);
    printf("CUDA Block size: %dx%d\n", block_x, block_y);

    // Multiply in size_t: N * N overflows int once N > 46340.
    const size_t elems = (size_t)N * (size_t)N;
    const size_t size = elems * sizeof(double);
    const size_t total_mem = size * 3; // A, B, and C

    printf("Required GPU memory: %.2f GB\n", total_mem / (1024.0*1024.0*1024.0));

    // Check available GPU memory (after CUDA initialization). This is
    // informational only, so a failure here does not stop the run.
    size_t free_mem, total_gpu_mem;
    cudaError_t err = cudaMemGetInfo(&free_mem, &total_gpu_mem);

    if (err != cudaSuccess) {
        printf("ERROR: Could not get GPU memory info: %s\n", cudaGetErrorString(err));
        printf("Attempting to continue anyway...\n");
    } else {
        printf("Available GPU memory: %.2f GB / %.2f GB\n",
               free_mem / (1024.0*1024.0*1024.0),
               total_gpu_mem / (1024.0*1024.0*1024.0));

        if (total_mem > free_mem) {
            printf("WARNING: Not enough GPU memory! Attempting allocation anyway...\n");
        }
    }

    printf("Allocating host memory...\n");
    res.A = (double*) malloc(size);
    res.B = (double*) malloc(size);
    res.C = (double*) malloc(size);

    if (!res.A || !res.B || !res.C) {
        printf("ERROR: Host memory allocation failed!\n");
        return 1;
    }

    printf("Initializing matrices...\n");
    for (size_t i = 0; i < elems; i++) {
        res.A[i] = rand() % 10;
        res.B[i] = rand() % 10;
    }

    printf("Allocating GPU memory...\n");
    CUDA_TRY(cudaMalloc((void**)&res.dA, size), "cudaMalloc failed for dA");
    CUDA_TRY(cudaMalloc((void**)&res.dB, size), "cudaMalloc failed for dB");
    CUDA_TRY(cudaMalloc((void**)&res.dC, size), "cudaMalloc failed for dC");

    printf("Copying data to GPU...\n");
    CUDA_TRY(cudaMemcpy(res.dA, res.A, size, cudaMemcpyHostToDevice), "Copy of A to GPU failed");
    CUDA_TRY(cudaMemcpy(res.dB, res.B, size, cudaMemcpyHostToDevice), "Copy of B to GPU failed");

    dim3 block(block_x, block_y);
    dim3 grid((N + block.x - 1)/block.x, (N + block.y - 1)/block.y);

    printf("Grid size: %ux%u, Total blocks: %u\n", grid.x, grid.y, grid.x * grid.y);
    printf("Threads per block: %d\n", block_x * block_y);

    CUDA_TRY(cudaEventCreate(&res.start), "cudaEventCreate failed");
    CUDA_TRY(cudaEventCreate(&res.stop), "cudaEventCreate failed");

    // Adjust repetitions based on matrix size
    int repeat = (N <= 2048) ? 100 : (N <= 8192) ? 10 : 1;
    printf("Running %d iterations...\n", repeat);

    CUDA_TRY(cudaEventRecord(res.start), "cudaEventRecord failed");
    for(int i = 0; i < repeat; i++) {
        matmul_kernel<<<grid, block>>>(res.dA, res.dB, res.dC, N);
    }
    // Launch-configuration errors (e.g. more threads per block than the device
    // allows) are reported here, not by the launch itself.
    CUDA_TRY(cudaGetLastError(), "Kernel launch failed");
    // Record stop directly behind the kernels so the measured interval holds
    // GPU work only; waiting on the event also surfaces execution errors.
    CUDA_TRY(cudaEventRecord(res.stop), "cudaEventRecord failed");
    CUDA_TRY(cudaEventSynchronize(res.stop), "Kernel execution failed");

    float ms_f = 0.0f;
    CUDA_TRY(cudaEventElapsedTime(&ms_f, res.start, res.stop), "cudaEventElapsedTime failed");
    double ms = (double)ms_f;

    // Calculate average time per iteration
    double avg_time_us = (ms * 1000.0) / repeat;
    double avg_time_ms = ms / repeat;

    printf("\n=== RESULTS ===\n");
    printf("Total time for %d iterations: %.3f ms\n", repeat, ms);
    printf("Average time per iteration: %.3f ms (%.3f µs)\n", avg_time_ms, avg_time_us);

    // Calculate GFLOPS
    double flops = 2.0 * N * N * N; // 2N^3 operations for matrix multiplication
    double gflops = (flops / (avg_time_ms / 1000.0)) / 1e9;
    printf("Performance: %.2f GFLOPS\n", gflops);

    printf("Copying result back to host...\n");
    CUDA_TRY(cudaMemcpy(res.C, res.dC, size, cudaMemcpyDeviceToHost), "Copy of C to host failed");

    // Spot-check three entries against a CPU dot product. The inputs are small
    // integers, so every partial sum is exactly representable in a double and
    // an exact comparison is valid.
    const int probe_rows[3] = {0, N / 2, N - 1};
    const int probe_cols[3] = {0, N / 3, N - 1};
    bool verified = true;
    for (int p = 0; p < 3; p++) {
        const size_t r = (size_t)probe_rows[p];
        const size_t c = (size_t)probe_cols[p];
        double expected = 0;
        for (size_t k = 0; k < (size_t)N; k++)
            expected += res.A[r * N + k] * res.B[k * N + c];
        if (res.C[r * N + c] != expected)
            verified = false;
    }
    printf("Verification (spot check): %s\n", verified ? "PASSED" : "FAILED");
    if (!verified)
        return 1;

    printf("Done!\n");
    return 0;
}

In [16]:
# Build native code for the Tesla T4 (compute capability 7.5). Without -arch the
# binary has no machine code for this GPU and relies on the driver JIT-compiling
# the embedded PTX, which a CUDA 12.4 driver cannot do for PTX emitted by the
# 12.5 toolkit. Targeting sm_75 removes the need to downgrade the toolkit.
!nvcc -arch=sm_75 -O2 cuda_matmul.cu -o cuda_matmul


In [None]:
# Arguments are: N block_x block_y. Compare 8x8, 16x16 and 32x32 thread blocks
# at N=800 (the size used by the CPU versions), then run N=2048 with 16x16 blocks.
!./cuda_matmul 800 8 8
!./cuda_matmul 800 16 16
!./cuda_matmul 800 32 32
!./cuda_matmul 2048 16 16