**PS 1**

Execute the following program and check the properties of your GPU.

In [5]:
%%writefile deviceQuery.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
        return 0;
    }
    int dev;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
        {
            if (deviceProp.major < 1)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
                deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
                deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}


Writing deviceQuery.cu


In [13]:
!nvidia-smi


Mon Nov  3 06:10:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
!nvcc deviceQuery.cu -o deviceQuery


In [7]:
!./deviceQuery


There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15828320256 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


**PS 2**

Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with one block and multiple threads.

In [2]:
!pip install cupy-cuda12x




In [19]:
import cupy as cp

# Allocate space for messages (one per thread)
num_threads = 8
messages = cp.empty((num_threads,), dtype=cp.int32)

# Define a CUDA kernel that stores thread IDs
kernel = cp.RawKernel(r'''
extern "C" __global__
void hello(int *out) {
    int tid = threadIdx.x;
    out[tid] = tid;
}
''', 'hello')

# Launch kernel with 1 block and 8 threads
kernel((1,), (num_threads,), (messages,))

# Copy back and print from host
cp.cuda.Stream.null.synchronize()

for tid in messages.get():
    print(f"Hello World from thread {tid}")


Hello World from thread 0
Hello World from thread 1
Hello World from thread 2
Hello World from thread 3
Hello World from thread 4
Hello World from thread 5
Hello World from thread 6
Hello World from thread 7


**PS 3**

Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with multiple blocks and multiple threads.

In [20]:
import cupy as cp

# Total configuration
threads_per_block = 4
num_blocks = 3
total_threads = threads_per_block * num_blocks

# Allocate array to store thread IDs
messages = cp.empty((total_threads,), dtype=cp.int32)

# Define CUDA kernel
kernel_code = r'''
extern "C" __global__
void hello(int *out) {
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    out[global_id] = global_id;
}
'''

# Compile kernel
hello_kernel = cp.RawKernel(kernel_code, 'hello')

# Launch kernel with multiple blocks & threads
hello_kernel((num_blocks,), (threads_per_block,), (messages,))

# Wait for GPU to finish
cp.cuda.Stream.null.synchronize()

# Print output from host
for i, tid in enumerate(messages.get()):
    print(f"Hello World from block {i // threads_per_block}, thread {i % threads_per_block}, global thread ID = {tid}")


Hello World from block 0, thread 0, global thread ID = 0
Hello World from block 0, thread 1, global thread ID = 1
Hello World from block 0, thread 2, global thread ID = 2
Hello World from block 0, thread 3, global thread ID = 3
Hello World from block 1, thread 0, global thread ID = 4
Hello World from block 1, thread 1, global thread ID = 5
Hello World from block 1, thread 2, global thread ID = 6
Hello World from block 1, thread 3, global thread ID = 7
Hello World from block 2, thread 0, global thread ID = 8
Hello World from block 2, thread 1, global thread ID = 9
Hello World from block 2, thread 2, global thread ID = 10
Hello World from block 2, thread 3, global thread ID = 11


**PS 4**

Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with 2D blocks and 2D threads.

In [21]:
import cupy as cp

# Define 2D grid and 2D block dimensions
threads_per_block = (3, 3)   # 3x3 threads = 9 threads per block
num_blocks = (2, 2)          # 2x2 blocks = 4 blocks total

# Total threads = 4 blocks × 9 threads = 36
total_threads = num_blocks[0] * num_blocks[1] * threads_per_block[0] * threads_per_block[1]

# Allocate output arrays
thread_ids = cp.empty((total_threads,), dtype=cp.int32)

# CUDA kernel
kernel_code = r'''
extern "C" __global__
void hello2D(int *out) {
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int gw = gridDim.x;

    // Compute a unique global thread ID for 2D grid & block
    int global_id = ((by * gw + bx) * (bw * bh)) + (ty * bw + tx);

    out[global_id] = global_id;
}
'''

# Compile the kernel
hello2D_kernel = cp.RawKernel(kernel_code, 'hello2D')

# Launch kernel (2D grid, 2D block)
hello2D_kernel(num_blocks, threads_per_block, (thread_ids,))

# Wait for GPU to finish
cp.cuda.Stream.null.synchronize()

# Print output
for gid in range(total_threads):
    bx = (gid // (threads_per_block[0] * threads_per_block[1])) % num_blocks[0]
    by = gid // ((threads_per_block[0] * threads_per_block[1]) * num_blocks[0])
    local_id = gid % (threads_per_block[0] * threads_per_block[1])
    tx = local_id % threads_per_block[0]
    ty = local_id // threads_per_block[0]
    print(f"Hello World from Block ({bx}, {by}), Thread ({tx}, {ty}), Global Thread ID = {gid}")


Hello World from Block (0, 0), Thread (0, 0), Global Thread ID = 0
Hello World from Block (0, 0), Thread (1, 0), Global Thread ID = 1
Hello World from Block (0, 0), Thread (2, 0), Global Thread ID = 2
Hello World from Block (0, 0), Thread (0, 1), Global Thread ID = 3
Hello World from Block (0, 0), Thread (1, 1), Global Thread ID = 4
Hello World from Block (0, 0), Thread (2, 1), Global Thread ID = 5
Hello World from Block (0, 0), Thread (0, 2), Global Thread ID = 6
Hello World from Block (0, 0), Thread (1, 2), Global Thread ID = 7
Hello World from Block (0, 0), Thread (2, 2), Global Thread ID = 8
Hello World from Block (1, 0), Thread (0, 0), Global Thread ID = 9
Hello World from Block (1, 0), Thread (1, 0), Global Thread ID = 10
Hello World from Block (1, 0), Thread (2, 0), Global Thread ID = 11
Hello World from Block (1, 0), Thread (0, 1), Global Thread ID = 12
Hello World from Block (1, 0), Thread (1, 1), Global Thread ID = 13
Hello World from Block (1, 0), Thread (2, 1), Global Threa

**PS 5**

Vector Addition using CUDA
Problem Statement: Write a CUDA C program that performs element-wise addition of two vectors A and B of size N. The result of the addition should be stored in vector C.
Details:
•	Initialize the vectors A and B with random numbers.
•	The output vector C[i] = A[i] + B[i], where i ranges from 0 to N-1.
•	Use CUDA kernels to perform the computation in parallel.
•	Write the code for both serial (CPU-based) and parallel (CUDA-based) implementations.
•	Measure the execution time of both the serial and CUDA implementations for different values of N (e.g., N = 10^5, 10^6, 10^7).
Task:
•	Calculate and report the speedup (i.e., the ratio of CPU execution time to GPU execution time).

In [41]:
%%writefile vectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>

// CUDA kernel for vector addition
__global__ void vectorAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// CPU implementation
void vectorAddCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
}

// Initialize vectors
void initializeVectors(float *A, float *B, int N) {
    for (int i = 0; i < N; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
}

int main() {
    int N_values[] = {100000, 1000000, 10000000};
    int num_cases = 3;

    for (int c = 0; c < num_cases; c++) {
        int N = N_values[c];
        size_t size = N * sizeof(float);

        printf("\n=============================\n");
        printf("Vector Size: %d\n", N);

        // Allocate host memory
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C_CPU = (float *)malloc(size);
        float *h_C_GPU = (float *)malloc(size);

        initializeVectors(h_A, h_B, N);

        // CPU computation
        clock_t start_cpu = clock();
        vectorAddCPU(h_A, h_B, h_C_CPU, N);
        clock_t end_cpu = clock();
        double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;

        // GPU computation
        float *d_A, *d_B, *d_C;
        cudaMalloc((void **)&d_A, size);
        cudaMalloc((void **)&d_B, size);
        cudaMalloc((void **)&d_C, size);

        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        cudaEvent_t start_gpu, stop_gpu;
        cudaEventCreate(&start_gpu);
        cudaEventCreate(&stop_gpu);

        cudaEventRecord(start_gpu);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        cudaEventRecord(stop_gpu);

        cudaMemcpy(h_C_GPU, d_C, size, cudaMemcpyDeviceToHost);
        cudaEventSynchronize(stop_gpu);

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
        gpu_time /= 1000.0;  // convert ms to seconds

        // Verify correctness
        int correct = 1;
        for (int i = 0; i < N; i++) {
            if (fabs(h_C_CPU[i] - h_C_GPU[i]) > 1e-5) {
                correct = 0;
                break;
            }
        }

        // Print results
        printf("CPU Time: %.6f s\n", cpu_time);
        printf("GPU Time: %.6f s\n", gpu_time);
        if (gpu_time > 0)
            printf("Speedup: %.2fx\n", cpu_time / gpu_time);
        else
            printf("Speedup: N/A\n");

        printf("Result Verification: %s\n", correct ? "SUCCESS ✅" : "FAIL ❌");

        // CSV-style output for graph plotting
        printf("CSV_RESULT,%d,%.6f,%.6f\n", N, cpu_time, gpu_time);

        // Free memory
        free(h_A); free(h_B); free(h_C_CPU); free(h_C_GPU);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    }

    return 0;
}


Overwriting vectorAdd.cu


In [43]:
!nvcc -arch=sm_75 vectorAdd.cu -o vectorAdd
!./vectorAdd



Vector Size: 100000
CPU Time: 0.000547 s
GPU Time: 0.000128 s
Speedup: 4.28x
Result Verification: SUCCESS ✅
CSV_RESULT,100000,0.000547,0.000128

Vector Size: 1000000
CPU Time: 0.005890 s
GPU Time: 0.000055 s
Speedup: 106.27x
Result Verification: SUCCESS ✅
CSV_RESULT,1000000,0.005890,0.000055

Vector Size: 10000000
CPU Time: 0.045206 s
GPU Time: 0.000467 s
Speedup: 96.74x
Result Verification: SUCCESS ✅
CSV_RESULT,10000000,0.045206,0.000467


**PS 6**

Matrix Addition using CUDA
Problem Statement: Write a CUDA C program to perform element-wise addition of two matrices A and B of size M x N. The result of the addition should be stored in matrix C.
Details:
•	Initialize the matrices A and B with random values.
•	The output matrix C[i][j] = A[i][j] + B[i][j] where i ranges from 0 to M-1 and j ranges from 0 to N-1.
•	Write code for both serial (CPU-based) and parallel (CUDA-based) implementations.
•	Measure the execution time of both implementations for various matrix sizes (e.g., 100x100, 500x500, 1000x1000).
Task:
•	Calculate the speedup using the execution times of the CPU and GPU implementations.

In [44]:
%%writefile matrixAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>

// CUDA kernel for matrix addition
__global__ void matrixAdd(float *A, float *B, float *C, int M, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = row * N + col;

    if (row < M && col < N) {
        C[idx] = A[idx] + B[idx];
    }
}

// CPU implementation of matrix addition
void matrixAddCPU(float *A, float *B, float *C, int M, int N) {
    for (int i = 0; i < M * N; i++) {
        C[i] = A[i] + B[i];
    }
}

// Initialize matrices with random numbers
void initializeMatrices(float *A, float *B, int M, int N) {
    for (int i = 0; i < M * N; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
}

int main() {
    int sizes[][2] = {{100,100}, {500,500}, {1000,1000}};
    int num_cases = 3;

    for (int c = 0; c < num_cases; c++) {
        int M = sizes[c][0];
        int N = sizes[c][1];
        size_t size = M * N * sizeof(float);

        printf("\n=============================\n");
        printf("Matrix Size: %dx%d\n", M, N);

        // Allocate host memory
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C_CPU = (float *)malloc(size);
        float *h_C_GPU = (float *)malloc(size);

        // Initialize matrices
        initializeMatrices(h_A, h_B, M, N);

        // --- CPU computation ---
        clock_t start_cpu = clock();
        matrixAddCPU(h_A, h_B, h_C_CPU, M, N);
        clock_t end_cpu = clock();
        double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;

        // --- GPU computation ---
        float *d_A, *d_B, *d_C;
        cudaMalloc((void **)&d_A, size);
        cudaMalloc((void **)&d_B, size);
        cudaMalloc((void **)&d_C, size);

        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

        cudaEvent_t start_gpu, stop_gpu;
        cudaEventCreate(&start_gpu);
        cudaEventCreate(&stop_gpu);

        cudaEventRecord(start_gpu);
        matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, M, N);
        cudaEventRecord(stop_gpu);

        cudaMemcpy(h_C_GPU, d_C, size, cudaMemcpyDeviceToHost);
        cudaEventSynchronize(stop_gpu);

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
        gpu_time /= 1000.0; // convert ms → seconds

        // Verify correctness
        int correct = 1;
        for (int i = 0; i < M * N; i++) {
            if (fabs(h_C_CPU[i] - h_C_GPU[i]) > 1e-5) {
                correct = 0;
                break;
            }
        }

        // Print results
        printf("CPU Time: %.6f s\n", cpu_time);
        printf("GPU Time: %.6f s\n", gpu_time);
        if (gpu_time > 0)
            printf("Speedup: %.2fx\n", cpu_time / gpu_time);
        else
            printf("Speedup: N/A\n");

        printf("Result Verification: %s\n", correct ? "SUCCESS ✅" : "FAIL ❌");

        // CSV-style output for Python graph
        printf("CSV_RESULT,%d,%.6f,%.6f\n", M*N, cpu_time, gpu_time);

        // Free memory
        free(h_A); free(h_B); free(h_C_CPU); free(h_C_GPU);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    }

    return 0;
}


Writing matrixAdd.cu


In [46]:
!nvcc -arch=sm_75 matrixAdd.cu -o matrixAdd
!./matrixAdd



Matrix Size: 100x100
CPU Time: 0.000054 s
GPU Time: 0.000085 s
Speedup: 0.64x
Result Verification: SUCCESS ✅
CSV_RESULT,10000,0.000054,0.000085

Matrix Size: 500x500
CPU Time: 0.001100 s
GPU Time: 0.000023 s
Speedup: 48.62x
Result Verification: SUCCESS ✅
CSV_RESULT,250000,0.001100,0.000023

Matrix Size: 1000x1000
CPU Time: 0.004353 s
GPU Time: 0.000061 s
Speedup: 71.82x
Result Verification: SUCCESS ✅
CSV_RESULT,1000000,0.004353,0.000061


**PS 7**

Dot Product of Two Vectors using CUDA
Problem Statement: Write a CUDA C program to compute the dot product of two vectors A and B of size N. The dot product is defined as:
Details:
•	Initialize the vectors A and B with random values.
•	Implement the dot product calculation using both serial (CPU) and parallel (CUDA) approaches.
•	Measure the execution time for both implementations with different vector sizes (e.g., N = 10^5, 10^6, 10^7).
•	Use atomic operations or shared memory reduction in the CUDA kernel to compute the final sum.
Task:
•	Calculate and report the speedup for different vector sizes.

In [56]:
%%writefile dotProduct.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>

// CUDA kernel for dot product using atomicAdd
__global__ void dotProductKernel(float *A, float *B, float *C, int N) {
    __shared__ float cache[256];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = 0;
    while (tid < N) {
        temp += A[tid] * B[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Each thread stores partial sum in shared memory
    cache[cacheIndex] = temp;
    __syncthreads();

    // Parallel reduction within block
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    // One thread per block adds partial sum to global memory
    if (cacheIndex == 0)
        atomicAdd(C, cache[0]);
}

// CPU version of dot product
float dotProductCPU(float *A, float *B, int N) {
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += A[i] * B[i];
    }
    return sum;
}

// Initialize vectors
void initializeVectors(float *A, float *B, int N) {
    for (int i = 0; i < N; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
}

int main() {
    int N_values[] = {100000, 1000000, 10000000};
    int num_cases = 3;

    for (int c = 0; c < num_cases; c++) {
        int N = N_values[c];
        size_t size = N * sizeof(float);

        printf("\n=============================\n");
        printf("Vector Size: %d\n", N);

        // Allocate host memory
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        initializeVectors(h_A, h_B, N);

        // --- CPU computation ---
        clock_t start_cpu = clock();
        float cpu_result = dotProductCPU(h_A, h_B, N);
        clock_t end_cpu = clock();
        double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;

        // --- GPU computation ---
        float *d_A, *d_B, *d_C;
        float h_C = 0.0f;

        cudaMalloc((void **)&d_A, size);
        cudaMalloc((void **)&d_B, size);
        cudaMalloc((void **)&d_C, sizeof(float));

        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_C, &h_C, sizeof(float), cudaMemcpyHostToDevice);

        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        cudaEvent_t start_gpu, stop_gpu;
        cudaEventCreate(&start_gpu);
        cudaEventCreate(&stop_gpu);

        cudaEventRecord(start_gpu);
        dotProductKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        cudaEventRecord(stop_gpu);

        cudaMemcpy(&h_C, d_C, sizeof(float), cudaMemcpyDeviceToHost);
        cudaEventSynchronize(stop_gpu);

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
        gpu_time /= 1000.0; // convert ms → s

        // Verify correctness
        int correct = fabs(cpu_result - h_C) / fabs(cpu_result) < 1e-5;

        // Print results
        printf("CPU Result: %.6f\n", cpu_result);
        printf("GPU Result: %.6f\n", h_C);
        printf("CPU Time: %.6f s\n", cpu_time);
        printf("GPU Time: %.6f s\n", gpu_time);
        if (gpu_time > 0)
            printf("Speedup: %.2fx\n", cpu_time / gpu_time);
        else
            printf("Speedup: N/A\n");

        // CSV-style output
        printf("CSV_RESULT,%d,%.6f,%.6f\n", N, cpu_time, gpu_time);

        // Free memory
        free(h_A);
        free(h_B);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    }
    return 0;
}


Overwriting dotProduct.cu


In [57]:
!nvcc -arch=sm_75 dotProduct.cu -o dotProduct
!./dotProduct



Vector Size: 100000
CPU Result: 25001.640625
GPU Result: 25001.757812
CPU Time: 0.000298 s
GPU Time: 0.000117 s
Speedup: 2.55x
CSV_RESULT,100000,0.000298,0.000117

Vector Size: 1000000
CPU Result: 249990.375000
GPU Result: 250029.671875
CPU Time: 0.002922 s
GPU Time: 0.000116 s
Speedup: 25.18x
CSV_RESULT,1000000,0.002922,0.000116

Vector Size: 10000000
CPU Result: 2471388.500000
GPU Result: 2500497.250000
CPU Time: 0.029231 s
GPU Time: 0.000991 s
Speedup: 29.49x
CSV_RESULT,10000000,0.029231,0.000991


**PS 8**

Matrix Multiplication using CUDA
Problem Statement: Write a CUDA C program to perform matrix multiplication. Given two matrices A (MxN) and B (NxP), compute the resulting matrix C (MxP) where:
Details:
•	Initialize the matrices A and B with random values.
•	Write code for both serial (CPU-based) and parallel (CUDA-based) implementations.
•	Measure the execution time of both implementations for various matrix sizes (e.g., 100x100, 500x500, 1000x1000).
Task:
•	Calculate the speedup by comparing the CPU and GPU execution times.

In [62]:
%%writefile matrixMul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

// ======================
// CUDA Kernel for Matrix Multiplication
// ======================
__global__ void matrixMul(float *A, float *B, float *C, int M, int N, int P) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // Row index
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // Column index

    if (row < M && col < P) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * P + col];
        }
        C[row * P + col] = sum;
    }
}

// ======================
// CPU Matrix Multiplication
// ======================
void matrixMulCPU(float *A, float *B, float *C, int M, int N, int P) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < P; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * P + j];
            }
            C[i * P + j] = sum;
        }
    }
}

// ======================
// Random Matrix Initialization
// ======================
void initializeMatrix(float *mat, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++) {
        mat[i] = (float)rand() / RAND_MAX;
    }
}

// ======================
// Main Function
// ======================
int main() {
    int sizes[][3] = {{100, 100, 100}, {500, 500, 500}, {1000, 1000, 1000}};
    int num_cases = 3;

    for (int c = 0; c < num_cases; c++) {
        int M = sizes[c][0];
        int N = sizes[c][1];
        int P = sizes[c][2];

        printf("\n====================================\n");
        printf("Matrix Size: %dx%d x %dx%d\n", M, N, N, P);

        size_t size_A = M * N * sizeof(float);
        size_t size_B = N * P * sizeof(float);
        size_t size_C = M * P * sizeof(float);

        float *h_A = (float *)malloc(size_A);
        float *h_B = (float *)malloc(size_B);
        float *h_C_CPU = (float *)malloc(size_C);
        float *h_C_GPU = (float *)malloc(size_C);

        initializeMatrix(h_A, M, N);
        initializeMatrix(h_B, N, P);

        // -------------------------
        // CPU Computation
        // -------------------------
        clock_t start_cpu = clock();
        matrixMulCPU(h_A, h_B, h_C_CPU, M, N, P);
        clock_t end_cpu = clock();
        double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;

        // -------------------------
        // GPU Computation
        // -------------------------
        float *d_A, *d_B, *d_C;
        cudaMalloc((void **)&d_A, size_A);
        cudaMalloc((void **)&d_B, size_B);
        cudaMalloc((void **)&d_C, size_C);

        cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);

        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((P + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

        cudaEvent_t start_gpu, stop_gpu;
        cudaEventCreate(&start_gpu);
        cudaEventCreate(&stop_gpu);

        cudaEventRecord(start_gpu);
        matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, M, N, P);
        cudaEventRecord(stop_gpu);

        cudaMemcpy(h_C_GPU, d_C, size_C, cudaMemcpyDeviceToHost);
        cudaEventSynchronize(stop_gpu);

        float gpu_time;
        cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
        gpu_time /= 1000.0f; // convert ms to seconds

        // -------------------------
        // Verification
        // -------------------------
        for (int i = 0; i < M * P; i++) {
            if (fabs(h_C_CPU[i] - h_C_GPU[i]) > 1e-3) {
                break;
            }
        }

        // -------------------------
        // Output Results
        // -------------------------
        printf("CPU Time: %.6f s\n", cpu_time);
        printf("GPU Time: %.6f s\n", gpu_time);
        if (gpu_time > 0)
            printf("Speedup: %.2fx\n", cpu_time / gpu_time);

        // Free memory
        free(h_A);
        free(h_B);
        free(h_C_CPU);
        free(h_C_GPU);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    }

    return 0;
}


Overwriting matrixMul.cu


In [63]:
!nvcc matrixMul.cu -o matrixMul
!./matrixMul



Matrix Size: 100x100 x 100x100
CPU Time: 0.003075 s
GPU Time: 0.007389 s
Speedup: 0.42x

Matrix Size: 500x500 x 500x500
CPU Time: 0.672368 s
GPU Time: 0.000002 s
Speedup: 269378.21x

Matrix Size: 1000x1000 x 1000x1000
CPU Time: 4.623642 s
GPU Time: 0.000002 s
Speedup: 1852420.69x
