In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time

# Load the data from the CSV file
sizes = [10000, 100000, 1000000, 10000000]
num_features = 5
cluster_numbers = [10, 100, 1000]

def kmeans(X, num_clusters, max_iterations=10):
    # Initialize centroids randomly
    centroids = tf.Variable(tf.slice(tf.random.shuffle(X), [0, 0], [num_clusters, -1]))

    for _ in range(max_iterations):
        # Assign each point to the nearest centroid
        distances = tf.reduce_sum((tf.expand_dims(X, axis=1) - tf.expand_dims(centroids, axis=0)) ** 2, axis=2)
        assignments = tf.argmin(distances, axis=1)

        # Update centroids based on the mean of the assigned points
        new_centroids = tf.concat([tf.reduce_mean(tf.gather(X, tf.reshape(tf.where(tf.equal(assignments, c)), [1, -1])), axis=1) for c in range(num_clusters)], axis=0)
        centroids.assign(new_centroids)

    return centroids

results = []

# Define TensorFlow GPU options
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.5)

# Create TensorFlow session with GPU options
with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) as sess:
    for size in sizes:
        filename = f'random_data_{size}.csv'
        data = pd.read_csv(filename).values
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data)

        for num_clusters in cluster_numbers:
            start_time = time.time()

            # Convert data to TensorFlow tensor
            X_tensor = tf.convert_to_tensor(scaled_data, dtype=tf.float32)

            # Run KMeans clustering on GPU
            with tf.device('/device:GPU:0'):  # Use GPU
                centroids = kmeans(X_tensor, num_clusters)

            end_time = time.time()

            clustering_time = end_time - start_time

            results.append({'Data Size': size, 'Clusters': num_clusters, 'Time (seconds)': clustering_time})

results_df = pd.DataFrame(results)
print(results_df)

    Data Size  Clusters  Time (seconds)
0       10000        10        0.871729
1       10000       100        6.014540
2       10000      1000       49.041308
3      100000        10        0.471055
4      100000       100        4.323267
5      100000      1000       51.076926
6     1000000        10        0.604169
7     1000000       100        4.198499
8     1000000      1000       51.030580
9    10000000        10        2.929545
10   10000000       100       10.118102
11   10000000      1000       53.169064
