In [97]:
!ls /usr/local

bin    cuda	cuda-12.2  games	       include	lib64	   man	 share
colab  cuda-12	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [98]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [99]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [100]:
!nvidia-smi

Sun Apr 14 16:39:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [101]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-z3ehs0os
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-z3ehs0os
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [102]:
%%writefile cuda.cu

#include <stdio.h>
#include <iostream>
#include <ctime>
#include <cmath>
#include <cuda_runtime.h>
#include <iomanip> // for std::setw

// Writes the n x n identity into columns [n, 2n) of the first n rows of `mat`,
// i.e. mat[r][n + c] becomes 1.0 when c == r and 0.0 otherwise. Columns [0, n)
// are left untouched.
//
// mat : row-major buffer whose rows are 2 * n doubles wide.
// n   : number of rows processed (and width of each half of a row).
//
// One row per thread. Only the x launch dimension is used to form the row
// index, and threads whose index is >= n do nothing.
__global__ void makeRightHandSideIdentity(double *mat, int n) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n) {
        return;
    }

    // Start of this row's right half.
    double *rightHalf = mat + row * (2 * n) + n;
    for (int col = 0; col < n; ++col) {
        rightHalf[col] = (col == row) ? 1.0 : 0.0;
    }
}

// Performs one top-to-bottom pass over adjacent row pairs of the n x 2n
// row-major matrix `mat`: for row = 0 .. n-2, rows `row` and `row + 1` are
// swapped when mat[row][1] > mat[row + 1][1].
//
// mat : row-major buffer whose rows are 2 * n doubles wide.
// n   : number of rows.
//
// Each comparison depends on the outcome of the previous swap, so the pairs
// are processed strictly in order. Letting every thread swap its own pair
// concurrently would have neighbouring threads read and write the same row at
// the same time. Ordering is enforced with __syncthreads(), which only spans
// one block, so all work is done by block (0,0,0) and any other block exits
// immediately. Within that block the 2 * n columns of a swap are split across
// the threads. Any block shape is accepted.
//
// NOTE(review): the comparison key is column 1 (not column 0) and the row with
// the larger key is moved downward. Both are kept as they were - confirm this
// is the intended pivoting rule.
__global__ void partialPivoting(double *mat, int n) {
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) {
        return;  // whole block leaves together, so no barrier is skipped
    }

    // Linear thread id / thread count inside the block, valid for 1D-3D blocks.
    const int lane =
        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const int laneCount = blockDim.x * blockDim.y * blockDim.z;
    const int width = 2 * n;

    for (int row = 0; row < n - 1; ++row) {
        double *upper = mat + static_cast<size_t>(row) * width;
        double *lower = upper + width;

        // Every thread reads the same two keys, so the decision is uniform.
        const bool swapRows = upper[1] > lower[1];

        // All threads must have sampled the keys before the thread that owns
        // column 1 overwrites them.
        __syncthreads();

        if (swapRows) {
            for (int col = lane; col < width; col += laneCount) {
                const double temp = upper[col];
                upper[col] = lower[col];
                lower[col] = temp;
            }
        }

        // The swap must be complete and visible before the next pair is compared.
        __syncthreads();
    }
}

// Forward elimination on the n x 2n row-major matrix `mat`: for each pivot
// p = 0 .. n-1 whose diagonal entry mat[p][p] is non-zero, every row i > p has
// d * (row p) subtracted from it over columns [p, 2n), where
// d = mat[i][p] / mat[p][p]. Pivots whose diagonal entry is zero are skipped.
//
// mat : row-major buffer whose rows are 2 * n doubles wide.
// n   : number of rows.
//
// Pivot step p needs row p as left by steps 0 .. p-1, so the steps run in
// order with a __syncthreads() between them. Running one pivot per thread
// concurrently would read pivot rows that other threads are still updating.
// A block barrier only spans one block, so block (0,0,0) does all the work and
// any other block exits immediately. Within a step each thread owns whole
// target rows, and the pivot row is only read. Any block shape is accepted.
//
// NOTE(review): only rows below the pivot are eliminated, so entries above
// the diagonal are left in place. If a fully diagonal left half is expected,
// rows above the pivot need the same treatment - confirm the intent.
__global__ void reduceToDiagonal(double *mat, int n) {
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) {
        return;  // whole block leaves together, so no barrier is skipped
    }

    // Linear thread id / thread count inside the block, valid for 1D-3D blocks.
    const int lane =
        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const int laneCount = blockDim.x * blockDim.y * blockDim.z;
    const int width = 2 * n;

    for (int pivot = 0; pivot < n; ++pivot) {
        const double *pivotRow = mat + static_cast<size_t>(pivot) * width;
        const double pivotValue = pivotRow[pivot];

        if (pivotValue != 0) {
            for (int row = pivot + 1 + lane; row < n; row += laneCount) {
                double *target = mat + static_cast<size_t>(row) * width;
                // Taken before target[pivot] is overwritten by the loop below.
                const double d = target[pivot] / pivotValue;
                for (int k = pivot; k < width; ++k) {
                    target[k] -= pivotRow[k] * d;
                }
            }
        }

        // Row pivot + 1 must be final before it is used as the next pivot row.
        __syncthreads();
    }
}

// Divides each of the first n rows of `mat` by its own diagonal entry, over
// columns [row, 2n), so that mat[row][row] becomes 1. Rows whose diagonal
// entry is zero are left unchanged; columns left of the diagonal are never
// touched.
//
// mat : row-major buffer whose rows are 2 * n doubles wide.
// n   : number of rows processed.
//
// One row per thread. Only the x launch dimension is used to form the row
// index, and threads whose index is >= n do nothing.
__global__ void reduceToUnitMatrix(double *mat, int n) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n) {
        return;
    }

    double *rowStart = mat + row * (2 * n);
    // Captured up front: the first loop iteration overwrites the diagonal.
    const double d = rowStart[row];
    if (d == 0) {
        return;
    }

    const int width = 2 * n;
    for (int col = row; col < width; ++col) {
        rowStart[col] /= d;
    }
}

// Prints the first n rows of a row-major matrix with 2 * n columns to
// std::cout. Every value is right-aligned in an 8-character field and followed
// by one space; each row ends with std::endl.
//
// mat : buffer read on the host, at least n * 2 * n doubles long.
// n   : number of rows to print.
void printMatrix(double *mat, int n) {
    const int width = 2 * n;
    const double *cell = mat;  // walks the buffer in storage order
    for (int row = 0; row < n; ++row) {
        for (const double *rowEnd = cell + width; cell != rowEnd; ++cell) {
            std::cout << std::setw(8) << *cell << " ";
        }
        std::cout << std::endl;
    }
}

// Copies the first n rows (2 * n columns each) of `d_mat` into a temporary
// host buffer with a device-to-host cudaMemcpy and prints them via
// printMatrix.
//
// d_mat : source pointer of the device-to-host copy; must cover at least
//         n * 2 * n doubles.
// n     : number of rows to print; nothing is done when n <= 0.
//
// If the copy fails, the CUDA error string is written to std::cerr and nothing
// is printed to std::cout, so uninitialised host memory is never shown.
void printDeviceMatrix(double *d_mat, int n) {
    if (n <= 0) {
        return;
    }

    // Only the n x 2n region that printMatrix reads is staged. The count is
    // formed in size_t so it cannot overflow int for large n.
    const size_t count = static_cast<size_t>(n) * 2 * static_cast<size_t>(n);
    double *hostCopy = new double[count];

    const cudaError_t status =
        cudaMemcpy(hostCopy, d_mat, count * sizeof(double), cudaMemcpyDeviceToHost);
    if (status == cudaSuccess) {
        printMatrix(hostCopy, n);
    } else {
        std::cerr << "printDeviceMatrix: cudaMemcpy failed: "
                  << cudaGetErrorString(status) << std::endl;
    }

    delete[] hostCopy;
}

// Returns true when `status` is cudaSuccess; otherwise reports `what` together
// with the CUDA error string on std::cerr and returns false.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Checks the kernel launch that was just issued (launch-configuration errors
// via cudaGetLastError) and then waits for it to finish (execution errors via
// cudaDeviceSynchronize).
static bool kernelSucceeded(const char *kernelName) {
    return cudaSucceeded(cudaGetLastError(), kernelName) &&
           cudaSucceeded(cudaDeviceSynchronize(), kernelName);
}

// Runs the four kernels on `d_mat` in order, printing the matrix after the
// first two stages. Returns false as soon as any launch or kernel fails.
static bool runKernels(double *d_mat, int n) {
    // The kernels derive their row index from the x dimension only, so the
    // launch is one-dimensional. A 2D block would make several threads process
    // the same row.
    const int threadsPerBlock = 256;
    const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    makeRightHandSideIdentity<<<numBlocks, threadsPerBlock>>>(d_mat, n);
    if (!kernelSucceeded("makeRightHandSideIdentity")) {
        return false;
    }
    std::cout << "Content of d_mat after making right hand side identity:" << std::endl;
    printDeviceMatrix(d_mat, n);

    partialPivoting<<<numBlocks, threadsPerBlock>>>(d_mat, n);
    if (!kernelSucceeded("partialPivoting")) {
        return false;
    }
    std::cout << "Content of d_mat after partial pivoting:" << std::endl;
    printDeviceMatrix(d_mat, n);

    reduceToDiagonal<<<numBlocks, threadsPerBlock>>>(d_mat, n);
    if (!kernelSucceeded("reduceToDiagonal")) {
        return false;
    }

    reduceToUnitMatrix<<<numBlocks, threadsPerBlock>>>(d_mat, n);
    return kernelSucceeded("reduceToUnitMatrix");
}

// Reads the matrix order n followed by n * n coefficients (row by row) from
// stdin, stores them in the left half of a row-major buffer with 2 * n columns
// per row, and runs the kernels on a device copy of that buffer. Returns 0 on
// success and 1 on invalid input or any CUDA failure.
int main() {
    int n = 0;
    if (!(std::cin >> n) || n <= 0) {
        std::cerr << "Expected a positive matrix order on stdin" << std::endl;
        return 1;
    }

    // The buffer holds (2n) x (2n) doubles; the kernels only use the first n
    // rows. Sizes are formed in size_t so they cannot overflow int.
    const size_t elementCount = (2 * static_cast<size_t>(n)) * (2 * static_cast<size_t>(n));
    const size_t byteCount = elementCount * sizeof(double);

    // Value-initialised so that no indeterminate values are copied to the device.
    double *mat = new double[elementCount]();

    // Inputs the coefficients of the matrix
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            if (!(std::cin >> mat[static_cast<size_t>(i) * (2 * n) + j])) {
                std::cerr << "Failed to read matrix coefficient at row " << i
                          << ", column " << j << std::endl;
                delete[] mat;
                return 1;
            }
        }
    }

    // Device allocation, upload, kernels, download. The chain stops at the
    // first failing step.
    double *d_mat = nullptr;
    const bool ok =
        cudaSucceeded(cudaMalloc((void **)&d_mat, byteCount), "cudaMalloc") &&
        cudaSucceeded(cudaMemcpy(d_mat, mat, byteCount, cudaMemcpyHostToDevice),
                      "host-to-device copy") &&
        runKernels(d_mat, n) &&
        cudaSucceeded(cudaMemcpy(mat, d_mat, byteCount, cudaMemcpyDeviceToHost),
                      "device-to-host copy");

    // Print the output matrix
    // std::cout << "Output matrix:" << std::endl;
    // printMatrix(mat, n);

    // Free memory (cudaFree on a null pointer performs no operation)
    delete[] mat;
    cudaFree(d_mat);

    return ok ? 0 : 1;
}

Overwriting cuda.cu


In [103]:
!nvcc cuda.cu -o cuda

In [104]:
!./cuda < 8.txt

Content of d_mat after making right hand side identity:
       6        2        8        1        6        4        8        8        1        0        0        0        0        0        0        0 
       5        4        8        6        6        4        4        9        0        1        0        0        0        0        0        0 
       1        1        2        7        5        5        4        8        0        0        1        0        0        0        0        0 
       8        0        9        3        1        1        0        1        0        0        0        1        0        0        0        0 
       5        5        6        9        6        4        7        1        0        0        0        0        1        0        0        0 
       0        1        3        5        9        7        7        9        0        0        0        0        0        1        0        0 
       2        8        7        3        4        4        7        2   