In [1]:
'''
RAPIDS is an open-source suite of data science and analytics software and libraries developed by NVIDIA.
       It aims to bring the power of GPUs to data science workflows, enabling faster data processing and machine learning.
RAPIDS leverages NVIDIA CUDA for low-level, high-performance GPU parallel computing. The key components of RAPIDS include:
(1) cuML: The machine learning library within RAPIDS, offering GPU-accelerated implementations of various machine learning algorithms, similar to scikit-learn.
'''

'\nRAPIDS is an open-source suite of data science and analytics software and libraries developed by NVIDIA.\n       It aims to bring the power of GPUs to data science workflows, enabling faster data processing and machine learning.\nRAPIDS leverages NVIDIA CUDA for low-level, high-performance GPU parallel computing. The key components of RAPIDS include:\n(1) cuML: The machine learning library within RAPIDS, offering GPU-accelerated implementations of various machine learning algorithms, similar to scikit-learn.\n'

In [1]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 481, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 481 (delta 143), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (481/481), 133.58 KiB | 759.00 KiB/s, done.
Resolving deltas: 100% (245/245), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 2.3 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a Tesla T4 GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu12==24.4.*
  Downloading https://pypi.nvidi

In [2]:
import cuml
cuml.__version__

'24.04.00'

Generate the data

In [3]:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility
np.random.seed(42)

# List of different sizes for the datasets
sizes = [10000, 100000, 1000000, 10000000]
num_features = 5

# Loop through each size and generate, then save the data
for size in sizes:
    # Generate random data
    data = pd.DataFrame(np.random.randn(size, num_features), columns=[f'feature_{i}' for i in range(num_features)])

    # Save the data to a CSV file
    filename = f'random_data_{size}.csv'
    data.to_csv(filename, index=False)

    print(f'Saved {size} samples to {filename}')

Saved 10000 samples to random_data_10000.csv
Saved 100000 samples to random_data_100000.csv
Saved 1000000 samples to random_data_1000000.csv
Saved 10000000 samples to random_data_10000000.csv


Apply KMeans

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from cuml.cluster import KMeans
import numpy as np
import time

# Initialize an empty list to store the results
results = []

# List of different sizes for the datasets
sizes = [10000, 100000, 1000000, 10000000]
num_features = 5

# List of different numbers of clusters to try
cluster_numbers = [10, 100, 1000]

for size in sizes:
    # Load the data from the CSV file
    filename = f'random_data_{size}.csv'
    data = pd.read_csv(filename)

    # Standardize the features
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    for num_clusters in cluster_numbers:
        # Measure the start time
        start_time = time.time()

        # Apply K-means clustering with cuML
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, max_iter=10)
        kmeans.fit(scaled_data)

        # Measure the end time
        end_time = time.time()

        # Calculate the time taken for K-means clustering
        clustering_time = end_time - start_time

        # Add the results to the list
        results.append({'Data Size': size, 'Clusters': num_clusters, 'Time (seconds)': clustering_time})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Print the table
print(results_df)

    Data Size  Clusters  Time (seconds)
0       10000        10        1.278008
1       10000       100        0.072464
2       10000      1000        0.475428
3      100000        10        0.040021
4      100000       100        0.178397
5      100000      1000        1.900078
6     1000000        10        0.354767
7     1000000       100        1.561732
8     1000000      1000       14.652058
9    10000000        10        3.613173
10   10000000       100       16.218980
11   10000000      1000      144.056942
