Generate the data

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run writes the same datasets.
np.random.seed(42)

# Row counts of the datasets to produce, and how many columns each one has.
sizes = [10000, 100000, 1000000, 10000000]
num_features = 5

# The column labels do not depend on the dataset size, so build them once.
feature_columns = [f'feature_{i}' for i in range(num_features)]

# All datasets are drawn in sequence from the single seeded stream, so the
# contents of each file depend on the sizes that were generated before it.
for size in sizes:
    filename = f'random_data_{size}.csv'

    # Draw a (size, num_features) block of standard-normal samples.
    data = pd.DataFrame(
        np.random.standard_normal(size=(size, num_features)),
        columns=feature_columns,
    )

    # Leave out the row index so the file contains only the feature columns.
    data.to_csv(filename, index=False)

    print(f'Saved {size} samples to {filename}')

Saved 10000 samples to random_data_10000.csv
Saved 100000 samples to random_data_100000.csv
Saved 1000000 samples to random_data_1000000.csv
Saved 10000000 samples to random_data_10000000.csv


KMeans from sklearn

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import time

# Benchmark scikit-learn's KMeans on the generated CSV datasets: for every
# (dataset size, cluster count) pair, time one fit and collect the result.

# One dict per benchmark run; turned into a DataFrame at the end.
results = []

# Dataset sizes to benchmark; each must already exist as random_data_<size>.csv
sizes = [10000, 100000, 1000000]
num_features = 5

# Cluster counts to try for every dataset
cluster_numbers = [10, 100, 1000]

for size in sizes:
    # Load the data from the CSV file
    filename = f'random_data_{size}.csv'
    data = pd.read_csv(filename)

    # Standardize the features. Loading and scaling happen before the timer
    # starts, so they are not included in the reported times.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    for num_clusters in cluster_numbers:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted during a long run.
        start_time = time.perf_counter()

        # max_iter=10 caps the iterations of each run, so this measures a
        # bounded amount of work rather than time to convergence.
        # NOTE(review): n_init is left at the library default, which may
        # differ between scikit-learn releases - pin it explicitly if timings
        # are compared across environments.
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, max_iter=10)
        kmeans.fit(scaled_data)

        end_time = time.perf_counter()

        # Elapsed time of model construction plus fit, in seconds
        clustering_time = end_time - start_time

        # Add the results to the list
        results.append({'Data Size': size, 'Clusters': num_clusters, 'Time (seconds)': clustering_time})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Print the table
print(results_df)



   Data Size  Clusters  Time (seconds)
0      10000        10        0.842065
1      10000       100        2.387318
2      10000      1000       41.758798
3     100000        10        4.466653
4     100000       100       16.375293
5     100000      1000      143.831143
6    1000000        10       10.407497
7    1000000       100      141.549031
8    1000000      1000     1593.802099


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import time

# Benchmark scikit-learn's KMeans on the largest generated dataset: for every
# cluster count, time one fit and collect the result.

# One dict per benchmark run; turned into a DataFrame at the end.
results = []

# Only the 10M-row dataset is benchmarked in this cell; it must already exist
# as random_data_<size>.csv
sizes = [10000000]
num_features = 5

# Cluster counts to try for the dataset
cluster_numbers = [10, 100, 1000]

for size in sizes:
    # Load the data from the CSV file
    filename = f'random_data_{size}.csv'
    data = pd.read_csv(filename)

    # Standardize the features. Loading and scaling happen before the timer
    # starts, so they are not included in the reported times.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    for num_clusters in cluster_numbers:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted, which matters for runs that last hours.
        start_time = time.perf_counter()

        # max_iter=10 caps the iterations of each run, so this measures a
        # bounded amount of work rather than time to convergence.
        # NOTE(review): n_init is left at the library default, which may
        # differ between scikit-learn releases - pin it explicitly if timings
        # are compared across environments.
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, max_iter=10)
        kmeans.fit(scaled_data)

        end_time = time.perf_counter()

        # Elapsed time of model construction plus fit, in seconds
        clustering_time = end_time - start_time

        # Add the results to the list
        results.append({'Data Size': size, 'Clusters': num_clusters, 'Time (seconds)': clustering_time})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Print the table
print(results_df)



   Data Size  Clusters  Time (seconds)
0   10000000        10      104.167836
1   10000000       100     1083.490873
2   10000000      1000    12539.156301
