In [None]:
!nvcc --version


In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-4qq6kgv2
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-4qq6kgv2
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10741 sha256=1c43b610d84440f376c57bb0b3d20f87e2433fdb6f06eb98d5fd7c81c461a971
  Stored in directory: /tmp/pip-ephem-wheel-cache-l9ggpvb8/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmphmq9juku".


In [3]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1000000 // Size of vectors

// CUDA kernel for vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    // Host vectors and initialization
    int *h_a = (int *)malloc(N * sizeof(int)); // Allocate memory for host vector a
    int *h_b = (int *)malloc(N * sizeof(int)); // Allocate memory for host vector b
    int *h_c = (int *)malloc(N * sizeof(int)); // Allocate memory for host vector c
    // Initialize host vectors a and b
    for (int i = 0; i < N; ++i) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Device vectors
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, N * sizeof(int)); // Allocate memory for device vector a
    cudaMalloc((void **)&d_b, N * sizeof(int)); // Allocate memory for device vector b
    cudaMalloc((void **)&d_c, N * sizeof(int)); // Allocate memory for device vector c

    // Copy host vectors to device
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice); // Copy host vector a to device
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice); // Copy host vector b to device

    // Launch vectorAdd kernel on GPU
    vectorAdd<<<ceil(N / 256.0), 256>>>(d_a, d_b, d_c, N); // Run kernel with appropriate grid and block dimensions

    // Copy result from device to host
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost); // Copy device vector c to host

    // Verify the result
    for (int i = 0; i < 10; ++i) { // Print the first 10 elements of vectors a, b, and c
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Free memory
    free(h_a); // Free memory allocated for host vector a
    free(h_b); // Free memory allocated for host vector b
    free(h_c); // Free memory allocated for host vector c
    cudaFree(d_a); // Free memory allocated for device vector a
    cudaFree(d_b); // Free memory allocated for device vector b
    cudaFree(d_c); // Free memory allocated for device vector c

    return 0;
}



0 + 0 = 0
1 + 2 = 0
2 + 4 = 0
3 + 6 = 0
4 + 8 = 0
5 + 10 = 0
6 + 12 = 0
7 + 14 = 0
8 + 16 = 0
9 + 18 = 0



In [4]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 32 // Matrix size

// CUDA kernel for matrix multiplication
__global__ void matrixMul(int *a, int *b, int *c, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < n && col < n) {
        int sum = 0;
        // Compute dot product of row of matrix A and column of matrix B
        for (int i = 0; i < n; ++i) {
            sum += a[row * n + i] * b[i * n + col];
        }
        // Store result in the corresponding cell of matrix C
        c[row * n + col] = sum;
    }
}

int main() {
    // Matrix dimensions
    int size = N * N * sizeof(int);

    // Host matrices
    int *h_a, *h_b, *h_c;

    // Device matrices
    int *d_a, *d_b, *d_c;

    // Allocate memory for host matrices
    h_a = (int *)malloc(size);
    h_b = (int *)malloc(size);
    h_c = (int *)malloc(size);

    // Initialize host matrices
    for (int i = 0; i < N * N; ++i) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Allocate memory for device matrices
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy host matrices to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Define grid and block size
    dim3 dimGrid(ceil(N / 16.0), ceil(N / 16.0), 1);
    dim3 dimBlock(16, 16, 1);

    // Print input matrices
    printf("Matrix A (Input):\n");
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            printf("%d ", h_a[i * N + j]);
        }
        printf("\n");
    }

    printf("\nMatrix B (Input):\n");
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            printf("%d ", h_b[i * N + j]);
        }
        printf("\n");
    }

    // Launch matrixMul kernel on GPU
    matrixMul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);

    // Copy result from device to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Print result matrix
    printf("\nResult Matrix (Output):\n");
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            printf("%d ", h_c[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}


Matrix A (Input):
0 1 2 3 
32 33 34 35 
64 65 66 67 
96 97 98 99 

Matrix B (Input):
0 2 4 6 
64 66 68 70 
128 130 132 134 
192 194 196 198 

Result Matrix (Output):
0 0 0 0 
0 0 0 0 
0 0 0 0 
0 0 0 0 

