<a href="https://colab.research.google.com/github/ja390/Parallel-Computing-Assignment3/blob/main/Cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!sudo apt-get update -qq

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
!sudo apt-get install -y openmpi-bin libopenmpi-dev > /dev/null

In [3]:
print("\nChecking nvcc (CUDA compiler):")
!nvcc --version


Checking nvcc (CUDA compiler):
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [4]:
print("\nChecking GPU:")
!nvidia-smi


Checking GPU:
Wed Dec  3 07:10:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                 

In [5]:
%%writefile kmeans_cuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <time.h>

#define MAX_ITER 100
#define K 3       // no of clusters
#define N 10000   // no of points

typedef struct {
    float x, y;
    int cluster;
} Point;

typedef struct {
    float x, y;
} Centroid;

// assign points to nearest centroid(kernel)
__global__ void assignClusters(Point *points, Centroid *centroids, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    float minDist = 1e9;
    int cluster = 0;
    for (int j = 0; j < K; j++) {
        float dx = points[idx].x - centroids[j].x;
        float dy = points[idx].y - centroids[j].y;
        float dist = sqrtf(dx*dx + dy*dy);
        if (dist < minDist) {
            minDist = dist;
            cluster = j;
        }
    }
    points[idx].cluster = cluster;
}

// compute partial sums for centroid update
__global__ void computePartialSums(Point *points, float *sumX, float *sumY, int *count, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    int c = points[idx].cluster;
    atomicAdd(&sumX[c], points[idx].x);
    atomicAdd(&sumY[c], points[idx].y);
    atomicAdd(&count[c], 1);
}

int main(int argc, char *argv[]) {
    int threads = 256; // default threads (per block)
    if (argc > 1) threads = atoi(argv[1]);

    // Initialize points
    Point *h_points = (Point*) malloc(N * sizeof(Point));
    srand(42);
    for (int i = 0; i < N; i++) {
        h_points[i].x = ((float)rand() / RAND_MAX) * 10.0f;
        h_points[i].y = ((float)rand() / RAND_MAX) * 10.0f;
        h_points[i].cluster = -1;
    }

    // Initialize centroids
    Centroid h_centroids[K];
    for (int j = 0; j < K; j++) {
        h_centroids[j].x = ((float)rand() / RAND_MAX) * 10.0f;
        h_centroids[j].y = ((float)rand() / RAND_MAX) * 10.0f;
    }

    // Device memory
    Point *d_points;
    Centroid *d_centroids;
    float *d_sumX, *d_sumY;
    int *d_count;

    cudaMalloc(&d_points, N * sizeof(Point));
    cudaMalloc(&d_centroids, K * sizeof(Centroid));
    cudaMalloc(&d_sumX, K * sizeof(float));
    cudaMalloc(&d_sumY, K * sizeof(float));
    cudaMalloc(&d_count, K * sizeof(int));

    cudaMemcpy(d_points, h_points, N * sizeof(Point), cudaMemcpyHostToDevice);
    cudaMemcpy(d_centroids, h_centroids, K * sizeof(Centroid), cudaMemcpyHostToDevice);

    int blocks = (N + threads - 1) / threads;
    float start = (float)clock() / CLOCKS_PER_SEC;

    for (int iter = 0; iter < MAX_ITER; iter++) {
        // Assign clusters
        assignClusters<<<blocks, threads>>>(d_points, d_centroids, N);
        cudaDeviceSynchronize();

        // Reset sums
        cudaMemset(d_sumX, 0, K * sizeof(float));
        cudaMemset(d_sumY, 0, K * sizeof(float));
        cudaMemset(d_count, 0, K * sizeof(int));

        // Compute partial sums
        computePartialSums<<<blocks, threads>>>(d_points, d_sumX, d_sumY, d_count, N);
        cudaDeviceSynchronize();

        // Copy back to host
        float h_sumX[K], h_sumY[K];
        int h_count[K];
        cudaMemcpy(h_sumX, d_sumX, K * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_sumY, d_sumY, K * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_count, d_count, K * sizeof(int), cudaMemcpyDeviceToHost);

        // Update centroids on host
        for (int j = 0; j < K; j++) {
            if (h_count[j] > 0) {
                h_centroids[j].x = h_sumX[j] / h_count[j];
                h_centroids[j].y = h_sumY[j] / h_count[j];
            }
        }

        // Copy centroids back to device
        cudaMemcpy(d_centroids, h_centroids, K * sizeof(Centroid), cudaMemcpyHostToDevice);
    }

    float end = (float)clock() / CLOCKS_PER_SEC;

    printf("CUDA K-Means with %d threads per block completed.\n", threads);
    printf("Execution time: %f seconds\n", end - start);
    printf("Final Centroids:\n");
    for (int j = 0; j < K; j++)
        printf("Centroid %d: (%.2f, %.2f)\n", j, h_centroids[j].x, h_centroids[j].y);

    cudaFree(d_points);
    cudaFree(d_centroids);
    cudaFree(d_sumX);
    cudaFree(d_sumY);
    cudaFree(d_count);
    free(h_points);

    return 0;
}


Writing kmeans_cuda.cu


In [6]:
!nvcc kmeans_cuda.cu -o kmeans_cuda -O2

In [7]:
!./kmeans_cuda 128

CUDA K-Means with 128 threads per block completed.
Execution time: 0.013239 seconds
Final Centroids:
Centroid 0: (0.13, 1.84)
Centroid 1: (9.74, 6.66)
Centroid 2: (4.91, 7.36)


In [8]:
!./kmeans_cuda 256

CUDA K-Means with 256 threads per block completed.
Execution time: 0.011545 seconds
Final Centroids:
Centroid 0: (0.13, 1.84)
Centroid 1: (9.74, 6.66)
Centroid 2: (4.91, 7.36)


In [9]:
!./kmeans_cuda 512

CUDA K-Means with 512 threads per block completed.
Execution time: 0.011359 seconds
Final Centroids:
Centroid 0: (0.13, 1.84)
Centroid 1: (9.74, 6.66)
Centroid 2: (4.91, 7.36)


In [10]:
!./kmeans_cuda 1024

CUDA K-Means with 1024 threads per block completed.
Execution time: 0.011126 seconds
Final Centroids:
Centroid 0: (0.13, 1.84)
Centroid 1: (9.74, 6.66)
Centroid 2: (4.91, 7.36)
