<a href="https://colab.research.google.com/github/harshitkgupta/HighPerformaceComputing/blob/main/CudaHelloWorld.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup Cuda in Colab Notebook

1. Connect to "T4 GPU" runtime

In [6]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.12.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


2. Write First Cuda Kernel

In [11]:
%%cuda
#include <stdio.h>
#include <stdlib.h>

__global__ void hello(int* output){
    // Calculate a unique index for each thread
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Write a unique value to indicate execution from this block/thread
    output[idx] = blockIdx.x * 100 + threadIdx.x;
}

int main(){
    printf("Hello from host.\n");

    int num_blocks = 2;
    int threads_per_block = 2;
    int total_elements = num_blocks * threads_per_block;

    // Host array to store results
    int* h_output = (int*) malloc(total_elements * sizeof(int));
    if (h_output == NULL) {
        printf("Failed to allocate host memory!\n");
        return 1;
    }

    // Device array
    int* d_output;
    cudaError_t err = cudaMalloc((void**)&d_output, total_elements * sizeof(int));
    if (err != cudaSuccess) {
        printf("Failed to allocate device memory: %s\n", cudaGetErrorString(err));
        free(h_output);
        return 1;
    }

    // Initialize device memory to a known value (-1) to verify kernel writes
    err = cudaMemset(d_output, -1, total_elements * sizeof(int));
    if (err != cudaSuccess) {
        printf("Failed to memset device memory: %s\n", cudaGetErrorString(err));
        cudaFree(d_output);
        free(h_output);
        return 1;
    }

    // Launch kernel
    hello<<<num_blocks, threads_per_block>>>(d_output);

    // Synchronize to ensure kernel completes
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Failed to synchronize device: %s\n", cudaGetErrorString(err));
        cudaFree(d_output);
        free(h_output);
        return 1;
    }

    // Copy results from device to host
    err = cudaMemcpy(h_output, d_output, total_elements * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("Failed to copy device to host: %s\n", cudaGetErrorString(err));
        cudaFree(d_output);
        free(h_output);
        return 1;
    }

    // Print results from host
    printf("\nKernel output (values from block * 100 + thread):\n");
    for(int i = 0; i < total_elements; ++i) {
        printf("Output[%d]: %d\n", i, h_output[i]);
    }

    // Free device memory
    cudaFree(d_output);
    // Free host memory
    free(h_output);

    return 0;
}

Hello from host.

Kernel output (values from block * 100 + thread):
Output[0]: -1
Output[1]: -1
Output[2]: -1
Output[3]: -1

