In [1]:
!nvidia-smi

Sat Nov  1 13:34:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!apt-get update -y
!apt-get install -y cuda-toolkit-12-4     # we install this toolkit, so that nvcc version is same as the CUDA driver version (seen in nvidia-smi as CUDA Version: 12.4)

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile vectorAdd.cu
// VECTOR ADDITION

#include <stdio.h>

#define cudaCheckError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void vectorAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // Get index accessed by thread using threadId and blockId
    if (i < N)
        C[i] = A[i] + B[i];
}

int main() {
    int N = 1 << 20;
    size_t size = N * sizeof(float);

    float *A, *B, *C;
    cudaCheckError(cudaMallocManaged(&A, size));    // cudaMallocManaged creates common memory,accessible from CPU and GPU both
    cudaCheckError(cudaMallocManaged(&B, size));
    cudaCheckError(cudaMallocManaged(&C, size));

    for (int i = 0; i < N; i++) { A[i] = i; B[i] = 2*i; }

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    vectorAdd<<<blocks, threads>>>(A, B, C, N);
    cudaCheckError(cudaGetLastError());
    cudaCheckError(cudaDeviceSynchronize());

    printf("A[100]=%f, B[100]=%f, C[100]=%f\n", A[100], B[100], C[100]);

    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}


Writing vectorAdd.cu


In [None]:
!nvcc -arch=sm_75  vectorAdd.cu -o vector_add # Add -arch=sm_75 for the T4 GPU
!./vector_add

A[100]=100.000000, B[100]=200.000000, C[100]=300.000000


In [None]:
%%writefile matMul.cu
// MATRIX MULTIPLICATION

#include <stdio.h>
#define N 16

__global__ void matMul(float *A, float *B, float *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0;
        for (int k = 0; k < n; k++)
            sum += A[row*n + k] * B[k*n + col];
        C[row*n + col] = sum;
    }
}

int main() {
    int size = N * N * sizeof(float);
    float *A, *B, *C;

    // cudaMallocManaged allocates the memory on both device and host
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);

    for (int i = 0; i < N*N; i++) { A[i] = 1.0f; B[i] = 2.0f; }

    // So, each block has threads that are composed of 16 threads along x-axis and 16 threads along y-axis
    // This is small enough to fit in shared memory (all threads in block share the 'shared memory')
    dim3 threads(16, 16);

    // number of blocks = (N+15)/16 X (N+15)/16
    dim3 blocks((N + 15)/16, (N + 15)/16);

    // Each thread computes one output element of matrix C; C and N are arguments
    matMul<<<blocks, threads>>>(A, B, C, N);
    cudaDeviceSynchronize();

    printf("C[0] = %f\n", C[0]);
    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}


Writing matMul.cu


In [None]:
!nvcc -arch=sm_75  matMul.cu -o matMul # Add -arch=sm_75 for the T4 GPU
!./matMul

C[0] = 32.000000


In [None]:
%%writefile reduceSum.cu
// VECTOR SUM
#include <stdio.h>

__global__ void reduceSum(float *input, float *output, int N) {
   // This is way to initialize the dynamic shared memory array, size of which has been declared during kernel launch
    extern __shared__ float sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // such an i<N check will almost always be required, as our total number of threads (blocks*threads) will cross N slightly most
    // of the times as we did blocks = (N+threads-1)/threads
    sdata[tid] = (i < N) ? input[i] : 0.0f;
    // all threads within block stopped till everyone comes (barrier synchronization)
    __syncthreads();

    // So sdata[] is from sdata[0] to sdata[255]
    // Note that halving threads each time and adding like this only works if number of threads in block is a power of 2
    // Even for last block when i>=N is possible, sdata[] is still of size 255 and for i>=N, sdata[tid]=0.0f
    for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];
        __syncthreads();
    }

    if (tid == 0)
        output[blockIdx.x] = sdata[0];
}

int main() {
    int N = 1024;
    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    float *input, *partial, *result;
    cudaMallocManaged(&input, N * sizeof(float));
    cudaMallocManaged(&partial, blocks * sizeof(float));
    cudaMallocManaged(&result, sizeof(float));

    for (int i = 0; i < N; i++) input[i] = 1.0f;

    // kernelName<<<gridDim, blockDim, sharedMemSize, stream>>>(args...);
    // sharedMemSize = size of shared memory in bytes; shared memory is dynamic memory created within block
    // Each Streaming Multiprocessor (SM) has a small pool of on-chip shared memory (like an L1 scratchpad) that is divided among concurrently running blocks.
    // T4 GPU has 48 KB shared memory per block
    reduceSum<<<blocks, threads, threads*sizeof(float)>>>(input, partial, N);
    cudaDeviceSynchronize();

    *result = 0;
    for (int i = 0; i < blocks; i++) *result += partial[i];

    printf("Sum = %f\n", *result);
    cudaFree(input); cudaFree(partial); cudaFree(result);
    return 0;
}


Writing reduceSum.cu


In [None]:
!nvcc -arch=sm_75  reduceSum.cu -o reduceSum # Add -arch=sm_75 for the T4 GPU
!./reduceSum

Sum = 1024.000000


In [34]:
%%writefile simple_pointwise_mult.cu
// POINTWISE MODULAR MULTIPLICATION - 100X faster than sequential multiplication of coefficients

#include <stdint.h>
#include <stdio.h>
#include <chrono>

__global__ void pointwise_mul_u128(const uint64_t* A,
                                   const uint64_t* B,
                                   uint64_t* C,
                                   uint64_t q,
                                   size_t N) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    // product of 64-bit integers will give 128-bit integer
    unsigned __int128 prod = (unsigned __int128)A[i] * (unsigned __int128)B[i];
    /*if(i==100)
    {
      printf("i: %lu\n",i);
      printf("a: %lu, b: %lu\n", A[i],B[i]);
    }
    */

    // This modulus is in fact expensive, due to which Montgomery multiplication is used which just uses bit shifts, etc.
    uint64_t r = (uint64_t)(prod % q);
    C[i] = r;
}

int main() {
    int N = 1 << 20;
    size_t size = N * sizeof(uint64_t);

    uint64_t *A, *B, *C;
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);

    for (int i = 0; i < N; i++) { A[i] = i; B[i] = 2*i; }

    A[100]=(1ULL<<50)-1;
    B[100]=(1ULL<<59)+3;

    printf("A[100] = %llu\n", (unsigned long long)A[100]);
    printf("B[100] = %llu\n", (unsigned long long)B[100]);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    // large prime
    uint64_t q = (1ULL << 61) - 1;

    auto start = std::chrono::high_resolution_clock::now();
    pointwise_mul_u128<<<blocks, threads>>>(A, B, C, q, N);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;

    cudaDeviceSynchronize();

    printf("A[100]=%lu, B[100]=%lu, C[100]=%lu\n", A[100], B[100], C[100]);
    printf("CPU execution time: %.3f ms\n", elapsed.count());

    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}

Writing simple_pointwise_mult.cu


In [40]:
!nvcc -arch=sm_75  simple_pointwise_mult.cu -o simple_pointwise_mult # Add -arch=sm_75 for the T4 GPU
!time ./simple_pointwise_mult

A[100] = 1125899906842623
B[100] = 576460752303423491
A[100]=1125899906842623, B[100]=576460752303423491, C[100]=1733041431607508988
CPU execution time: 0.166 ms

real	0m0.146s
user	0m0.024s
sys	0m0.118s


In [37]:
%%writefile simple_pointwise_mult_cpu.cpp
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

int main() {
    int N = 1 << 20;  // number of coefficients
    uint64_t q = (1ULL << 61) - 1;  // 2^61 - 1 (Mersenne prime)

    uint64_t *A = (uint64_t*)malloc(N * sizeof(uint64_t));
    uint64_t *B = (uint64_t*)malloc(N * sizeof(uint64_t));
    uint64_t *C = (uint64_t*)malloc(N * sizeof(uint64_t));

    for (int i = 0; i < N; i++) {
        A[i] = i;
        B[i] = 2 * i;
    }

    // Use large test values at index 100
    A[100] = (1ULL << 50) - 1;
    B[100] = (1ULL << 59) + 3;


    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; i++) {
        __uint128_t prod = (__uint128_t)A[i] * (__uint128_t)B[i];
        C[i] = (uint64_t)(prod % q);

        // if (i == 100) {
        //     printf("i = %d\n", i);
        //     printf("A[100] = %llu\n", (unsigned long long)A[i]);
        //     printf("B[100] = %llu\n", (unsigned long long)B[i]);
        //     printf("prod = %llu (low 64 bits)\n", (unsigned long long)prod);
        //     printf("C[100] = %llu\n", (unsigned long long)C[i]);
        // }
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;
    printf("CPU execution time: %.3f ms\n", elapsed.count());

    printf("\nHost summary:\nA[100]=%llu, B[100]=%llu, C[100]=%llu\n",
           (unsigned long long)A[100],
           (unsigned long long)B[100],
           (unsigned long long)C[100]);

    free(A);
    free(B);
    free(C);
    return 0;
}


Writing simple_pointwise_mult_cpu.cpp


In [None]:
# Pointwise Modular multiplication
!g++ -std=c++17 simple_pointwise_mult_cpu.cpp -o simple_pointwise_mult_cpu
!./simple_pointwise_mult_cpu

CPU execution time: 17.595 ms

Host summary:
A[100]=1125899906842623, B[100]=576460752303423491, C[100]=1733041431607508988


In [41]:
%%writefile montmogery_pointwise_mult.cu

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <chrono>

__global__ void montgomery_mul_u128(const uint64_t* A,
                                   const uint64_t* B,
                                   uint64_t* C,
                                   uint64_t q,
                                   size_t N, uint64_t qinv_mod_R, int k)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i>=N)
      return;

    uint64_t a=A[i]; uint64_t b=B[i];

    __uint128_t t = (__uint128_t)a * (__uint128_t)b;
    // uint64_t Rmask = 0xFFFFFFFFFFFFFFFFULL;
    // '& Rmask' ensures that we take only the bottom 64 bits
    // uint64_t m = (uint64_t)((__uint128_t)(t * qinv_mod_R) & Rmask);  // (t * qinv) mod R
    uint64_t m = (uint64_t)((__uint128_t)t * qinv_mod_R);   // automatically truncates to lower 64 bits; no need to do '& Rmask' explicitly
    __uint128_t temp = (__uint128_t)(t + (__uint128_t)m * (__uint128_t)q);
    uint64_t u = (uint64_t)(temp >> k); // divide by R (shift);
    if (u >= q) u -= q;

    /*
    a'<q, b'<q
    q<R
    so, a'*b' = t <qR
    m<R, so m*q < R*q
    So, (t+m*q) < 2*q*R
    So, (t+m*q)/R < 2*q
    */
    C[i]=u;

}



int main() {

    int N = 1 << 20;
    size_t size = N * sizeof(uint64_t);

    uint64_t *A, *B, *C, *C1;
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);
    cudaMallocManaged(&C1, size);

    uint64_t *A1, *B1;
    A1 = (uint64_t*)malloc(N * sizeof(uint64_t));
    B1 = (uint64_t*)malloc(N * sizeof(uint64_t));

    for (int i = 0; i < N; i++) { A1[i] = i; B1[i] = 2*i; }

    A1[100]=(1ULL<<50)-1;
    B1[100]=(1ULL<<59)+3;

    printf("A1[3] = %llu\n", (unsigned long long)A1[3]);
    printf("B1[3] = %llu\n", (unsigned long long)B1[3]);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    // large prime
    uint64_t q = (1ULL << 61) - 1;
    uint64_t qinv_mod_R = 2305843009213693953;

    int k=64;
    __uint128_t R=((__uint128_t)1<<k);

    for(int i=0;i<N;i++)
    {
      __uint128_t x = __uint128_t(A1[i])*R;
      A[i]=(uint64_t)(x%q);
      if(i==3)
        printf("A[%d]=%llu\n",i,(unsigned long long)A[i]);
    }

    for(int i=0;i<N;i++)
    {
      __uint128_t x = __uint128_t(B1[i])*R;
      B[i]=(uint64_t)(x%q);
    }


    auto start = std::chrono::high_resolution_clock::now();
    montgomery_mul_u128<<<blocks, threads>>>(A, B, C, q, N, qinv_mod_R, k);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;

    cudaDeviceSynchronize();

    for(int i=0;i<N;i++)
    {
      B[i]=1;
    }

    montgomery_mul_u128<<<blocks,threads>>>(C,B,C1,q,N,qinv_mod_R,k);
    cudaDeviceSynchronize();

    printf("A1[3]=%lu, B1[3]=%lu, C[3]: %lu, B[3]: %lu, C1[3]=%lu\n", A1[3], B1[3], C[3], B[3], C1[3]);
    printf("A1[100]=%lu, B1[100]=%lu, C[100]: %lu, B[100]: %lu, C1[100]=%lu\n", A1[100], B1[100], C[100], B[100], C1[100]);
    printf("Execution time: %.3f ms\n", elapsed.count());

    cudaFree(A); cudaFree(B); cudaFree(C); cudaFree(C1);
    free(A1); free(B1);

    return 0;
}


Overwriting montmogery_pointwise_mult.cu


In [43]:
!nvcc -arch=sm_75  montmogery_pointwise_mult.cu -o montmogery_pointwise_mult # Add -arch=sm_75 for the T4 GPU
!time ./montmogery_pointwise_mult

A1[3] = 3
B1[3] = 6
A[3]=24
A1[3]=3, B1[3]=6, C[3]: 144, B[3]: 1, C1[3]=18
A1[100]=1125899906842623, B1[100]=576460752303423491, C[100]: 29273397577908198, B[100]: 1, C1[100]=1733041431607508988
Execution time: 0.172 ms

real	0m0.243s
user	0m0.115s
sys	0m0.124s
