In [None]:
'''
Maximum number of threads per block: 1024
Maximum number of threads per SM: 2048
Shared memory per SM: 64 KB
Register file size per SM: 65536 registers

Number of Streaming Multiprocessors (SMs): 40
Maximum number of resident blocks per SM: 16
'''

In [1]:
import pandas as pd
import numpy as np

# Fix NumPy's global generator so each run reproduces the same datasets
np.random.seed(42)

# Row counts of the datasets to produce, and how many columns each one has
sizes = [10000, 100000, 1000000, 10000000]
num_features = 5

# The column labels do not depend on the dataset size, so build them once
feature_names = [f'feature_{i}' for i in range(num_features)]

# Draw one standard-normal table per size and write it to its own CSV file
for size in sizes:
    filename = f'random_data_{size}.csv'

    samples = np.random.randn(size, num_features)
    data = pd.DataFrame(samples, columns=feature_names)

    # index=False keeps the row index out of the file: the CSV holds features only
    data.to_csv(filename, index=False)

    print(f'Saved {size} samples to {filename}')

Saved 10000 samples to random_data_10000.csv
Saved 100000 samples to random_data_100000.csv
Saved 1000000 samples to random_data_1000000.csv
Saved 10000000 samples to random_data_10000000.csv


In [9]:
%%writefile kmeans_rnd.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <cuda.h>

// Block size passed to every kernel launch in main().
#define TPB 32   // Threads per block
// Number of assignment/update rounds main() runs; the loop has no convergence test.
#define MAX_ITER 10
// Size in bytes of the buffer readCSVData() hands to fgets() for each line.
#define MAX_LINE_LENGTH 1024

// Returns nonzero when the line holds nothing but whitespace / line endings.
static int isBlankLine(const char *line) {
    return line[strspn(line, " \t\r\n")] == '\0';
}

// Load a comma-separated file of numbers into a freshly allocated row-major
// float array.
//
// The first line of the file is treated as a header: it is used only to count
// the columns and is never stored. Every following non-blank line is one point.
// Lines are read through a MAX_LINE_LENGTH-byte buffer, so longer lines are not
// supported.
//
// Parameters:
//   filename   - path of the CSV file to read
//   data       - out: receives a heap buffer of num_points * num_dims floats
//                (point p, dimension j lives at index p * num_dims + j);
//                the caller must free() it
//   num_points - out: number of data rows stored
//   num_dims   - out: number of columns counted in the header
//
// Returns 0 on success. Returns -1 if the file cannot be opened, if it has no
// header or no data rows, or if the buffer cannot be allocated; the output
// parameters are written only on success.
int readCSVData(const char *filename, float **data, int *num_points, int *num_dims) {
    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Unable to open file");
        return -1;
    }

    char line[MAX_LINE_LENGTH];
    int n = 0, d = 0;

    // Header line: its comma-separated field count is the dimensionality.
    if (fgets(line, MAX_LINE_LENGTH, file) && !isBlankLine(line)) {
        for (char *token = strtok(line, ","); token; token = strtok(NULL, ",")) {
            d++;
        }
    }

    // First pass: count the data rows (blank lines are not points).
    while (fgets(line, MAX_LINE_LENGTH, file)) {
        if (!isBlankLine(line)) {
            n++;
        }
    }

    if (d == 0 || n == 0) {
        fprintf(stderr, "No data found in %s\n", filename);
        fclose(file);
        return -1;
    }

    // calloc checks the n * d product for overflow and zero-fills the buffer,
    // so a row with fewer than d fields leaves zeros rather than garbage.
    float *values = (float *)calloc((size_t)n * (size_t)d, sizeof(float));
    if (!values) {
        perror("Unable to allocate memory");
        fclose(file);
        return -1;
    }

    // Second pass: go back to the start and step over the header again, so the
    // column names are not parsed as a data row and row 0 is the first point.
    rewind(file);
    if (!fgets(line, MAX_LINE_LENGTH, file)) {
        free(values);
        fclose(file);
        return -1;
    }

    int point = 0;
    // Both bounds (point < n, dim < d) keep every store inside the buffer even
    // if a row carries extra fields or the file grew between the two passes.
    while (point < n && fgets(line, MAX_LINE_LENGTH, file)) {
        if (isBlankLine(line)) {
            continue;
        }
        int dim = 0;
        for (char *token = strtok(line, ","); token && dim < d; token = strtok(NULL, ",")) {
            values[(size_t)point * d + dim] = (float)atof(token);
            dim++;
        }
        point++;
    }

    fclose(file);

    *data = values;
    *num_points = point;
    *num_dims = d;

    return 0;
}

// Euclidean distance between two vectors of `dims` floats: the square root of
// the sum of squared per-component differences. Callable from device code only.
__device__ float distance(float *a, float *b, int dims) {
    float sum_sq = 0;
    int i = 0;
    while (i < dims) {
        const float diff = a[i] - b[i];
        sum_sq += diff * diff;
        ++i;
    }
    return sqrt(sum_sq);
}

// Assignment step: one thread per data point. Each thread scans all K centroids
// and stores in d_clust_assn[point] the index of the nearest one. Ties keep the
// lowest centroid index because only a strictly smaller distance replaces the
// current best.
//
//   d_datapoints - N points, D floats each, point p starting at index p * D
//   d_clust_assn - out: N cluster indices
//   d_centroids  - K centroids, D floats each, centroid c starting at index c * D
__global__ void kMeansClusterAssignment(float *d_datapoints, int *d_clust_assn, float *d_centroids, int N, int K, int D) {
    const int point = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data (last, partially filled block) do nothing.
    if (point < N) {
        float *coords = &d_datapoints[point * D];

        int best = 0;
        float best_dist = INFINITY;

        for (int c = 0; c < K; ++c) {
            const float candidate = distance(coords, &d_centroids[c * D], D);
            if (candidate < best_dist) {
                best = c;
                best_dist = candidate;
            }
        }

        d_clust_assn[point] = best;
    }
}

// Update step, part 1: accumulate per-cluster coordinate sums and member counts.
//
// Each block first reduces its own points into shared memory, then adds its
// partial results to the global accumulators. The kernel only ever ADDS to
// d_centroids and d_clust_sizes, so the caller must zero both before the launch
// and divide sums by counts afterwards.
//
// Dynamic shared memory required at launch:
//   K * D * sizeof(float)  (per-block sums)  +  K * sizeof(int)  (per-block counts)
//
//   d_datapoints  - N points, D floats each, point p starting at index p * D
//   d_clust_assn  - N cluster indices in [0, K)
//   d_centroids   - in/out: K * D running coordinate sums
//   d_clust_sizes - in/out: K running member counts
__global__ void kMeansCentroidUpdate(float *d_datapoints, int *d_clust_assn, float *d_centroids, int *d_clust_sizes, int N, int K, int D) {
    extern __shared__ float s_centroids[];
    // The counts live directly behind the K * D floats in the same allocation.
    int *s_counts = (int *)&s_centroids[K * D];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    // Zero the block-local accumulators. Block-stride loops are required here:
    // with a plain `if (tid < K)` only the first blockDim.x clusters would be
    // initialised whenever K exceeds the block size.
    for (int i = tid; i < K * D; i += blockDim.x) {
        s_centroids[i] = 0.0f;
    }
    for (int c = tid; c < K; c += blockDim.x) {
        s_counts[c] = 0;
    }
    __syncthreads();

    // Each in-range thread adds its point to the accumulator of its cluster.
    if (idx < N) {
        int cluster_id = d_clust_assn[idx];
        for (int i = 0; i < D; ++i) {
            atomicAdd(&s_centroids[cluster_id * D + i], d_datapoints[idx * D + i]);
        }
        atomicAdd(&s_counts[cluster_id], 1);
    }
    __syncthreads();

    // Flush the block's partial results to global memory, again striding over
    // all K clusters. Clusters this block never touched hold only zeros, so
    // skipping them avoids pointless global atomics without changing the result.
    for (int c = tid; c < K; c += blockDim.x) {
        if (s_counts[c] > 0) {
            for (int i = 0; i < D; ++i) {
                atomicAdd(&d_centroids[c * D + i], s_centroids[c * D + i]);
            }
            atomicAdd(&d_clust_sizes[c], s_counts[c]);
        }
    }
}

// Update step, part 2: one thread per cluster turns the accumulated coordinate
// sums in d_centroids into means by dividing each of the D components by the
// cluster's member count. Clusters with no members are left untouched.
__global__ void normalizeCentroids(float *d_centroids, int *d_clust_sizes, int K, int D) {
    const int cluster = blockIdx.x * blockDim.x + threadIdx.x;

    if (cluster < K) {
        const int members = d_clust_sizes[cluster];

        // Nothing to average for an empty cluster (and no division by zero).
        if (members > 0) {
            float *centroid = &d_centroids[cluster * D];
            for (int j = 0; j < D; ++j) {
                centroid[j] /= members;
            }
        }
    }
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns 1 when `err` is an error and 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Entry point: kmeans_rnd <input.csv> <K>
//
// Reads the points from the CSV file, seeds the K centroids with the first K
// points, runs MAX_ITER rounds of assignment + centroid update on the GPU and
// prints the elapsed time of those rounds as measured with CUDA events.
//
// Returns 0 on success and -1 on bad arguments, unreadable input, allocation
// failure or any CUDA error.
//
// NOTE: a cluster that receives no points in a round ends that round at the
// origin, because the sums are reset to zero before accumulation and
// normalizeCentroids() leaves empty clusters untouched.
int main(int argc, char **argv) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <input.csv> <K>\n", argv[0]);
        return -1;
    }

    const char *filename = argv[1];
    int K = atoi(argv[2]);
    if (K <= 0) {
        fprintf(stderr, "K must be a positive integer, got '%s'\n", argv[2]);
        return -1;
    }

    float *datapoints = NULL;
    int num_points = 0;
    int num_dims = 0;

    if (readCSVData(filename, &datapoints, &num_points, &num_dims) != 0) {
        fprintf(stderr, "Error reading data from file\n");
        return -1;
    }

    // The centroids are seeded from the first K points, so there must be at
    // least K of them; otherwise the seeding loop would read past the data.
    if (K > num_points) {
        fprintf(stderr, "K (%d) cannot exceed the number of points (%d)\n", K, num_points);
        free(datapoints);
        return -1;
    }

    // Everything the cleanup path touches is declared (and nulled) up front so
    // that `goto cleanup` never jumps over an initialisation.
    int status = -1;
    float *d_datapoints = NULL, *d_centroids = NULL;
    int *d_clust_assn = NULL, *d_clust_sizes = NULL;
    float *h_centroids = NULL;
    cudaEvent_t start_total = NULL, stop_total = NULL;
    float total_milliseconds = 0;

    // Sizes are computed in size_t so large inputs cannot overflow int.
    const size_t points_bytes = (size_t)num_points * num_dims * sizeof(float);
    const size_t centroids_bytes = (size_t)K * num_dims * sizeof(float);
    const size_t assn_bytes = (size_t)num_points * sizeof(int);
    const size_t sizes_bytes = (size_t)K * sizeof(int);
    // kMeansCentroidUpdate keeps K * D float sums plus K int counts per block.
    const size_t shared_mem_size = centroids_bytes + sizes_bytes;
    const int point_blocks = (num_points + TPB - 1) / TPB;
    const int centroid_blocks = (K + TPB - 1) / TPB;

    h_centroids = (float *)malloc(centroids_bytes);
    if (!h_centroids) {
        perror("Unable to allocate memory");
        goto cleanup;
    }

    // Deterministic initialisation: centroid c starts at data point c.
    for (int c = 0; c < K; ++c) {
        for (int d = 0; d < num_dims; ++d) {
            h_centroids[c * num_dims + d] = datapoints[c * num_dims + d];
        }
    }

    if (cudaFailed(cudaMalloc(&d_datapoints, points_bytes), "cudaMalloc(d_datapoints)") ||
        cudaFailed(cudaMalloc(&d_clust_assn, assn_bytes), "cudaMalloc(d_clust_assn)") ||
        cudaFailed(cudaMalloc(&d_centroids, centroids_bytes), "cudaMalloc(d_centroids)") ||
        cudaFailed(cudaMalloc(&d_clust_sizes, sizes_bytes), "cudaMalloc(d_clust_sizes)")) {
        goto cleanup;
    }

    if (cudaFailed(cudaMemcpy(d_centroids, h_centroids, centroids_bytes, cudaMemcpyHostToDevice), "copy of centroids to device") ||
        cudaFailed(cudaMemcpy(d_datapoints, datapoints, points_bytes, cudaMemcpyHostToDevice), "copy of data points to device")) {
        goto cleanup;
    }

    if (cudaFailed(cudaEventCreate(&start_total), "cudaEventCreate(start)") ||
        cudaFailed(cudaEventCreate(&stop_total), "cudaEventCreate(stop)") ||
        cudaFailed(cudaEventRecord(start_total), "cudaEventRecord(start)")) {
        goto cleanup;
    }

    for (int cur_iter = 0; cur_iter < MAX_ITER; ++cur_iter) {
        kMeansClusterAssignment<<<point_blocks, TPB>>>(d_datapoints, d_clust_assn, d_centroids, num_points, K, num_dims);
        if (cudaFailed(cudaGetLastError(), "kMeansClusterAssignment launch")) {
            goto cleanup;
        }

        // Reset centroids and sizes on device: the update kernel only accumulates.
        if (cudaFailed(cudaMemset(d_centroids, 0, centroids_bytes), "cudaMemset(d_centroids)") ||
            cudaFailed(cudaMemset(d_clust_sizes, 0, sizes_bytes), "cudaMemset(d_clust_sizes)")) {
            goto cleanup;
        }

        kMeansCentroidUpdate<<<point_blocks, TPB, shared_mem_size>>>(d_datapoints, d_clust_assn, d_centroids, d_clust_sizes, num_points, K, num_dims);
        if (cudaFailed(cudaGetLastError(), "kMeansCentroidUpdate launch")) {
            goto cleanup;
        }

        normalizeCentroids<<<centroid_blocks, TPB>>>(d_centroids, d_clust_sizes, K, num_dims);
        if (cudaFailed(cudaGetLastError(), "normalizeCentroids launch")) {
            goto cleanup;
        }

        // This blocking copy also surfaces any error raised while the kernels ran.
        if (cudaFailed(cudaMemcpy(h_centroids, d_centroids, centroids_bytes, cudaMemcpyDeviceToHost), "copy of centroids to host")) {
            goto cleanup;
        }

#ifdef KMEANS_DEBUG
        // Build with -DKMEANS_DEBUG to print the centroids after every round.
        printf("Iteration %d centroids:\n", cur_iter + 1);
        for (int i = 0; i < K; ++i) {
            printf("Centroid %d: ", i);
            for (int j = 0; j < num_dims; ++j) {
                printf("%f ", h_centroids[i * num_dims + j]);
            }
            printf("\n");
        }
#endif
    }

    if (cudaFailed(cudaEventRecord(stop_total), "cudaEventRecord(stop)") ||
        cudaFailed(cudaEventSynchronize(stop_total), "cudaEventSynchronize(stop)") ||
        cudaFailed(cudaEventElapsedTime(&total_milliseconds, start_total, stop_total), "cudaEventElapsedTime")) {
        goto cleanup;
    }

    printf("Total time: %f milliseconds\n", total_milliseconds);
    status = 0;

cleanup:
    // Events are only destroyed if they were created; cudaFree(NULL) and
    // free(NULL) are no-ops, so the remaining calls are safe on every path.
    if (start_total) {
        cudaEventDestroy(start_total);
    }
    if (stop_total) {
        cudaEventDestroy(stop_total);
    }

    cudaFree(d_datapoints);
    cudaFree(d_clust_assn);
    cudaFree(d_centroids);
    cudaFree(d_clust_sizes);

    free(h_centroids);
    free(datapoints);

    return status;
}


Overwriting kmeans_rnd.cu


In [10]:
# Compile kmeans_rnd.cu, then cluster the 10,000-row dataset with K = 10.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000.csv 10

Total time for K-means clustering: 0.000000 milliseconds


In [None]:
# Rebuild the binary, then cluster the 10,000-row dataset with K = 100.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000.csv 100

In [None]:
# Rebuild the binary, then cluster the 10,000-row dataset with K = 1000.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000.csv 1000

In [None]:
# Rebuild the binary, then cluster the 100,000-row dataset with K = 10.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_100000.csv 10

In [None]:
# Rebuild the binary, then cluster the 100,000-row dataset with K = 100.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_100000.csv 100

In [None]:
# Rebuild the binary, then cluster the 100,000-row dataset with K = 1000.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_100000.csv 1000

In [None]:
# Rebuild the binary, then cluster the 1,000,000-row dataset with K = 10.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_1000000.csv 10

In [None]:
# Rebuild the binary, then cluster the 1,000,000-row dataset with K = 100.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_1000000.csv 100

In [None]:
# Rebuild the binary, then cluster the 1,000,000-row dataset with K = 1000.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_1000000.csv 1000

In [None]:
# Rebuild the binary, then cluster the 10,000,000-row dataset with K = 10.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000000.csv 10

In [None]:
# Rebuild the binary, then cluster the 10,000,000-row dataset with K = 100.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000000.csv 100

In [None]:
# Rebuild the binary, then cluster the 10,000,000-row dataset with K = 1000.
!nvcc kmeans_rnd.cu -o kmeans_rnd
!./kmeans_rnd random_data_10000000.csv 1000