<a href="https://colab.research.google.com/github/jaidalmotra/Cuda/blob/main/Cuda_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to use CUDA in Colab


In [2]:
!pip install cuda-python

Collecting cuda-python
  Downloading cuda_python-12.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cuda-python
Successfully installed cuda-python-12.3.0


In [3]:
# Report whether PyTorch can use a CUDA device in this runtime.
# The bare last expression makes the notebook display True or False.
import torch
torch.cuda.is_available()


True

In [4]:
# Print the driver version, CUDA version, and the attached GPU(s) with their
# current memory and utilisation, as reported by the NVIDIA driver tool.
!nvidia-smi


Fri Feb  9 06:27:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0              26W /  70W |    121MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import torch

# Choose the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Cuda available")
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Switching to CPU.")

# Two 5x5 standard-normal tensors, sampled on the host and then moved to `device`.
x = torch.randn(5, 5).to(device)
y = torch.randn(5, 5).to(device)

# Element-wise sum, computed on whichever device holds the operands.
z = torch.add(x, y)

# Host-side copy of the result, for code that needs a CPU tensor.
z_cpu = z.cpu()

# Display the device-resident result as the cell output.
z


Cuda available
Tesla T4


tensor([[-1.2461,  0.0655, -0.5245,  0.3524,  2.4836],
        [ 1.9315, -1.1436, -0.2533, -1.4938,  1.5631],
        [ 1.9780, -0.7118, -5.0194, -0.1446, -1.3006],
        [ 0.5053, -1.0809, -1.1687, -0.6461,  1.7631],
        [-2.8332,  1.0968, -1.4914,  0.7832,  1.7377]], device='cuda:0')

In [9]:
# Check if GPU is available
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))

# Check CUDA version
!nvcc --version

Found GPU at: /device:GPU:0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
import tensorflow as tf

# Pick the compute device from what TensorFlow can actually see, so that the
# CPU fallback announced below is the device the example really runs on
# (previously the code always requested '/GPU:0' regardless of the check).
if tf.config.list_physical_devices('GPU'):
    compute_device = '/GPU:0'
    print("GPU is available.")
else:
    compute_device = '/CPU:0'
    print("GPU is not available. Switching to CPU.")

# Example usage
with tf.device(compute_device):
    x = tf.random.normal((5, 5))
    y = tf.random.normal((5, 5))

    # Element-wise sum, executed on the selected device
    z = x + y

# 'z' was computed on the GPU when one was found, otherwise on the CPU


GPU is available.


In [5]:
!pip install numba




Multiplication of two matrices

In [12]:
%%writefile prog.cu
#include <stdio.h>

#define N 3  // Size of the matrices (N x N)


// Kernel: each thread computes one element of c = a * b, where a, b and c are
// N x N integer matrices stored row-major in flat arrays.
//   a, b : input matrices (read only by this kernel)
//   c    : output matrix, element (r, q) written by the thread that owns it
__global__ void matrixMultiplication(int *a, int *b, int *c) {
    // Map this thread's 2D position in the grid to an output coordinate
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int q = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the matrix have nothing to compute
    if (r >= N || q >= N) {
        return;
    }

    // Dot product of row r of a with column q of b
    int acc = 0;
    for (int i = 0; i < N; ++i) {
        acc += a[r * N + i] * b[i * N + q];
    }
    c[r * N + q] = acc;
}

// Print a diagnostic to stderr when a CUDA runtime call did not return
// cudaSuccess. Returns true on success so calls can be chained with &&.
static bool cudaCallOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds two N x N integer matrices on the host, multiplies them on the GPU
// with matrixMultiplication, and prints the product to stdout.
// Returns 0 on success, or 1 if any CUDA runtime call or the kernel launch
// fails (in which case nothing is printed to stdout).
int main() {
    int a[N][N], b[N][N], c[N][N];  // Input and output matrices

    // Fill the inputs deterministically: a[i][j] = i + j, b[i][j] = i - j
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }
    }

    // Device pointers start as NULL so the cudaFree calls at the end are
    // safe on every path (cudaFree on a null pointer performs no operation)
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * N * sizeof(int);

    // Allocate GPU memory and copy the inputs over; && stops at the first failure
    bool ok = cudaCallOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
           && cudaCallOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
           && cudaCallOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
           && cudaCallOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")
           && cudaCallOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    if (ok) {
        // Cover the matrix with TILE x TILE blocks, rounding the grid up.
        // A single N x N block would exceed the 1024-threads-per-block limit
        // once N > 32; the kernel's bounds check discards the surplus threads.
        const int TILE = 16;
        dim3 threadsPerBlock(TILE, TILE);
        dim3 numBlocks((N + TILE - 1) / TILE, (N + TILE - 1) / TILE);

        // Launch CUDA kernel to perform matrix multiplication
        matrixMultiplication<<<numBlocks, threadsPerBlock>>>(dev_a, dev_b, dev_c);

        // A kernel launch returns no status, so ask the runtime for launch
        // errors explicitly; the blocking copy back to the host then reports
        // any error raised while the kernel was running.
        ok = cudaCallOk(cudaGetLastError(), "matrixMultiplication launch")
          && cudaCallOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    // Print the result matrix only when it is known to be valid
    if (ok) {
        printf("Result Matrix:\n");
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                printf("%d\t", c[i][j]);
            }
            printf("\n");
        }
    }

    // Free GPU memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}


Writing prog.cu


In [19]:
# Compile the CUDA source file prog.cu with nvcc into an executable named prog.
!nvcc prog.cu -o prog

In [21]:
# Run the compiled binary; it prints the product matrix, one row per line.
!./prog

Result Matrix:
5	2	-1	
8	2	-4	
11	2	-7	
