In [1]:
!nvidia-smi

Wed Oct 29 14:22:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!apt-get update -y
!apt-get install -y cuda-toolkit-12-4     # we install this toolkit, so that nvcc version is same as the CUDA driver version (seen in nvidia-smi as CUDA Version: 12.4)

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,086 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubun

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [15]:
%%writefile vectorAdd.cu
// VECTOR ADDITION

#include <stdio.h>

#define cudaCheckError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void vectorAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

int main() {
    int N = 1 << 20;
    size_t size = N * sizeof(float);

    float *A, *B, *C;
    cudaCheckError(cudaMallocManaged(&A, size));
    cudaCheckError(cudaMallocManaged(&B, size));
    cudaCheckError(cudaMallocManaged(&C, size));

    for (int i = 0; i < N; i++) { A[i] = i; B[i] = 2*i; }

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    vectorAdd<<<blocks, threads>>>(A, B, C, N);
    cudaCheckError(cudaGetLastError());
    cudaCheckError(cudaDeviceSynchronize());

    printf("A[100]=%f, B[100]=%f, C[100]=%f\n", A[100], B[100], C[100]);

    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}


Overwriting vectorAdd.cu


In [18]:
!nvcc -arch=sm_75  vectorAdd.cu -o vector_add # Add -arch=sm_75 for the T4 GPU
!./vector_add

A[100]=100.000000, B[100]=200.000000, C[100]=300.000000


In [19]:
%%writefile matMul.cu
// MATRIX MULTIPLICATION

#include <stdio.h>
#define N 16

__global__ void matMul(float *A, float *B, float *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0;
        for (int k = 0; k < n; k++)
            sum += A[row*n + k] * B[k*n + col];
        C[row*n + col] = sum;
    }
}

int main() {
    int size = N * N * sizeof(float);
    float *A, *B, *C;

    // cudaMallocManaged allocates the memory on both device and host
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);

    for (int i = 0; i < N*N; i++) { A[i] = 1.0f; B[i] = 2.0f; }

    // So, each block has threads that are composed of 16 threads along x-axis and 16 threads along y-axis
    // This is small enough to fit in shared memory (all threads in block share the 'shared memory')
    dim3 threads(16, 16);

    // number of blocks = (N+15)/16 X (N+15)/16
    dim3 blocks((N + 15)/16, (N + 15)/16);

    // Each thread computes one output element of matrix C; C and N are arguments
    matMul<<<blocks, threads>>>(A, B, C, N);
    cudaDeviceSynchronize();

    printf("C[0] = %f\n", C[0]);
    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}


Writing matMul.cu


In [20]:
!nvcc -arch=sm_75  matMul.cu -o matMul # Add -arch=sm_75 for the T4 GPU
!./matMul

C[0] = 32.000000


In [None]:
%%writefile reduceSum.cu
// VECTOR SUM
#include <stdio.h>

__global__ void reduceSum(float *input, float *output, int N) {
   // This is way to initialize the dynamic shared memory array, size of which has been declared during kernel launch
    extern __shared__ float sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // such an i<N check will almost always be required, as our total number of threads (blocks*threads) will cross N slightly most
    // of the times as we did blocks = (N+threads-1)/threads
    sdata[tid] = (i < N) ? input[i] : 0.0f;
    // all threads within block stopped till everyone comes (barrier synchronization)
    __syncthreads();

    // So sdata[] is from sdata[0] to sdata[255]
    // Note that halving threads each time and adding like this only works if number of threads in block is a power of 2
    // Even for last block when i>=N is possible, sdata[] is still of size 255 and for i>=N, sdata[tid]=0.0f
    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];
        __syncthreads();
    }

    if (tid == 0)
        output[blockIdx.x] = sdata[0];
}

int main() {
    int N = 1024;
    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    float *input, *partial, *result;
    cudaMallocManaged(&input, N * sizeof(float));
    cudaMallocManaged(&partial, blocks * sizeof(float));
    cudaMallocManaged(&result, sizeof(float));

    for (int i = 0; i < N; i++) input[i] = 1.0f;

    // kernelName<<<gridDim, blockDim, sharedMemSize, stream>>>(args...);
    // sharedMemSize = size of shared memory in bytes; shared memory is dynamic memory created within block
    // Each Streaming Multiprocessor (SM) has a small pool of on-chip shared memory (like an L1 scratchpad) that is divided among concurrently running blocks.
    // T4 GPU has 48 KB shared memory per block
    reduceSum<<<blocks, threads, threads*sizeof(float)>>>(input, partial, N);
    cudaDeviceSynchronize();

    *result = 0;
    for (int i = 0; i < blocks; i++) *result += partial[i];

    printf("Sum = %f\n", *result);
    cudaFree(input); cudaFree(partial); cudaFree(result);
    return 0;
}


In [None]:
!nvcc -arch=sm_75  reduceSum.cu -o reduceSum # Add -arch=sm_75 for the T4 GPU
!./reduceSum

In [39]:
%%writefile simple_pointwise_mult.cu
// POINTWISE MODULAR MULTIPLICATION

#include <stdint.h>
#include <stdio.h>

__global__ void pointwise_mul_u128(const uint64_t* A,
                                   const uint64_t* B,
                                   uint64_t* C,
                                   uint64_t q,
                                   size_t N) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    unsigned __int128 prod = (unsigned __int128)A[i] * (unsigned __int128)B[i];
    /*if(i==100)
    {
      printf("i: %lu\n",i);
      printf("a: %lu, b: %lu\n", A[i],B[i]);
    }
    */
    uint64_t r = (uint64_t)(prod % q);
    C[i] = r;
}

int main() {
    int N = 1 << 20;
    size_t size = N * sizeof(uint64_t);

    uint64_t *A, *B, *C;
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);

    for (int i = 0; i < N; i++) { A[i] = i; B[i] = 2*i; }

    A[100]=1ULL<<50-1;
    B[100]=1ULL<<59+3;

    printf("A[100] = %llu\n", (unsigned long long)A[100]);
    printf("B[100] = %llu\n", (unsigned long long)B[100]);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    uint64_t q = (1ULL << 61) - 1;
    pointwise_mul_u128<<<blocks, threads>>>(A, B, C, q, N);
    cudaDeviceSynchronize();

    printf("A[100]=%lu, B[100]=%lu, C[100]=%lu\n", A[100], B[100], C[100]);

    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}

Overwriting simple_pointwise_mult.cu


In [40]:
!nvcc -arch=sm_75  simple_pointwise_mult.cu -o simple_pointwise_mult # Add -arch=sm_75 for the T4 GPU
!time ./simple_pointwise_mult

A[100] = 562949953421312
B[100] = 4611686018427387904
A[100]=562949953421312, B[100]=4611686018427387904, C[100]=1125899906842624

real	0m0.135s
user	0m0.027s
sys	0m0.102s


In [41]:
%%writefile simple_pointwise_mult_cpu.cpp
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    int N = 1 << 20;  // number of coefficients
    uint64_t q = (1ULL << 61) - 1;  // 2^61 - 1 (Mersenne prime)

    uint64_t *A = (uint64_t*)malloc(N * sizeof(uint64_t));
    uint64_t *B = (uint64_t*)malloc(N * sizeof(uint64_t));
    uint64_t *C = (uint64_t*)malloc(N * sizeof(uint64_t));

    for (int i = 0; i < N; i++) {
        A[i] = i;
        B[i] = 2 * i;
    }

    // Use large test values at index 100
    A[100]=1ULL<<50-1;
    B[100]=1ULL<<59+3;

    // printf("A[100] = %lu\n", (unsigned long)A[100]);
            // printf("B[100] = %llu\n", (unsigned long long)B[100]);

    for (int i = 0; i < N; i++) {
        __uint128_t prod = (__uint128_t)A[i] * (__uint128_t)B[i];
        C[i] = (uint64_t)(prod % q);

        // if (i == 100) {
        //     printf("i = %d\n", i);
        //     printf("A[100] = %lu\n", (unsigned long)A[i]);
        //     printf("B[100] = %llu\n", (unsigned long long)B[i]);
        //     printf("prod = %llu (low 64 bits)\n", (unsigned long long)prod);
        //     printf("C[100] = %llu\n", (unsigned long long)C[i]);
        // }
    }

    printf("\nHost summary:\nA[100]=%llu, B[100]=%llu, C[100]=%llu\n",
           (unsigned long long)A[100],
           (unsigned long long)B[100],
           (unsigned long long)C[100]);

    free(A);
    free(B);
    free(C);
    return 0;
}


Overwriting simple_pointwise_mult_cpu.cpp


In [42]:
# Pointwise Modular multiplication
!g++ -std=c++17 simple_pointwise_mult_cpu.cpp -o simple_pointwise_mult_cpu
!time ./simple_pointwise_mult_cpu


Host summary:
A[100]=562949953421312, B[100]=4611686018427387904, C[100]=1125899906842624

real	0m0.035s
user	0m0.022s
sys	0m0.013s
