# KMeans

In [1]:
# Cap each Arrow record batch at 1024 records for this Spark session.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably the
# SparkSession is supplied by the launch environment — confirm.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

In [2]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [3]:
from cuml.datasets import make_blobs

# Size and spread of the synthetic clustering problem.
n_rows = 100_000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0
dtype = "float32"

# make_blobs creates a random dataset of isotropic Gaussian blobs; its second
# return value is discarded.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
    dtype=dtype,
)

# Rebind `data` to the array returned by .get()
# (presumably a device-to-host copy yielding a NumPy array — confirm).
data = data.get()



### Convert dataset to Spark DataFrame

In [4]:
# list(data) splits the generated matrix along its first axis, so each cell of
# the single "features" column holds one sample.
pd_data = pd.DataFrame({"features": list(data)})
# Hand the pandas frame to Spark to obtain a Spark DataFrame.
df = spark.createDataFrame(pd_data)

In [5]:
# Display the schema Spark derived for the pandas-backed DataFrame.
df.schema

StructType([StructField('features', ArrayType(FloatType(), True), True)])

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) KMeans estimator objects, demonstrating the common API

In [6]:
def build_kmeans_estimator(
    estimator_class,
    k=200,
    tol=1.0e-20,
    max_iter=15,
    features_col="features",
    seed=None,
):
    """Instantiate and configure a KMeans estimator through the shared setter API.

    The same call works for any estimator class that exposes the setters
    ``setTol``, ``setK``, ``setFeaturesCol`` and ``setMaxIter``, which is what
    lets one helper build both the CPU and the GPU estimator.

    Args:
        estimator_class: Zero-argument callable returning the estimator to configure.
        k: Number of clusters passed to ``setK``.
        tol: Convergence tolerance passed to ``setTol``.
        max_iter: Iteration cap passed to ``setMaxIter``.
        features_col: Input column name passed to ``setFeaturesCol``.
        seed: Optional seed. When not ``None`` it is passed to ``setSeed`` so
            runs can be made repeatable; the estimator class must then provide
            that setter. Left unset by default, matching the previous behavior.

    Returns:
        The configured estimator instance.
    """
    estimator = (
        estimator_class()
        .setTol(tol)
        .setK(k)
        .setFeaturesCol(features_col)
        .setMaxIter(max_iter)
    )
    # Only touch the seed when explicitly requested so the default call is
    # exactly the original four-setter configuration.
    if seed is not None:
        estimator = estimator.setSeed(seed)
    return estimator

## Spark ML (CPU)

In [7]:
# Build the CPU estimator by passing Spark ML's KMeans class to the shared builder.
from pyspark.ml.clustering import KMeans
cpu_kmeans = build_kmeans_estimator(KMeans)

Convert the array-typed column into the VectorUDT DataFrame expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-type DataFrame above and a scalar-column format - see docs.)

In [8]:
from pyspark.ml.functions import array_to_vector

In [9]:
# Replace the array-typed "features" column with a vector column of the same name.
vector_df = df.select(array_to_vector("features").alias("features"))

### Fit

In [10]:
# Idle before and after the timed region
# (presumably to make this run easy to isolate in external monitoring — confirm).
time.sleep(5)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
cpu_kmeans_model = cpu_kmeans.fit(vector_df)
cpu_fit = time.perf_counter() - start_time
print(f"Fit took: {cpu_fit} sec")

time.sleep(5)

23/07/21 13:26:49 WARN TaskSetManager: Stage 0 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:26:56 WARN TaskSetManager: Stage 3 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:27:02 WARN TaskSetManager: Stage 4 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:27:03 WARN TaskSetManager: Stage 5 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:27:05 WARN TaskSetManager: Stage 6 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:27:06 WARN TaskSetManager: Stage 7 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:27:17 WARN TaskSetManager: Stage 8 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:2

Fit took: 116.45441722869873 sec


In [11]:
# Report how many iterations the CPU fit recorded in its training summary.
print(f"KMeans numIter: {cpu_kmeans_model.summary.numIter}")

KMeans numIter: 10


In [12]:
# Check the Python type of one fitted cluster center.
type(cpu_kmeans_model.clusterCenters()[0])

numpy.ndarray

In [13]:
# Convert every CPU center to a plain list, then order the centers
# lexicographically so they can be lined up against another run.
sorted_cpu_cluster_centers = [center.tolist() for center in cpu_kmeans_model.clusterCenters()]
sorted_cpu_cluster_centers.sort()
# Display the leading 10 coordinates of the first two centers.
[center[:10] for center in sorted_cpu_cluster_centers[:2]]

[[-9.925886767809509,
  9.39085782649087,
  8.889854954891518,
  -5.842879385244651,
  -1.2607862297140184,
  3.21836230549656,
  -9.877833088890451,
  -4.180136888730722,
  -3.705105688483989,
  -3.483956264912105],
 [-9.922630417553675,
  9.698699070998469,
  1.3664845470518505,
  -2.162207881150218,
  2.7953974895164917,
  9.486636707548461,
  2.898738367709121,
  4.7261793167834565,
  7.6061832257088895,
  2.3376862421660523]]

### Transform

In [14]:
# Name the prediction column "transformed" on the fitted CPU model, then score vector_df.
spark_transformed = cpu_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [15]:
# Count the rows whose predicted cluster index is non-negative.
spark_transformed.filter(spark_transformed.transformed >= 0).count()

23/07/21 13:28:50 WARN TaskSetManager: Stage 44 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100000

In [16]:
# Preview the first 10 scored rows.
spark_transformed.show(10)

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[7.60500764846801...|         12|
|[3.42933940887451...|        154|
|[-7.0488376617431...|        158|
|[4.56557607650756...|         37|
|[3.64227962493896...|        193|
|[-4.4982471466064...|        189|
|[4.66437816619873...|         70|
|[-7.5873112678527...|         74|
|[8.58247280120849...|        178|
|[5.78661394119262...|         17|
+--------------------+-----------+
only showing top 10 rows



23/07/21 13:28:57 WARN TaskSetManager: Stage 47 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.


## Spark RAPIDS ML (GPU)

In [17]:
# This import rebinds `KMeans` to the Spark RAPIDS ML class; the same builder
# used for the CPU estimator now produces the GPU one.
from spark_rapids_ml.clustering import KMeans
gpu_kmeans = build_kmeans_estimator(KMeans)

The estimator can be persisted and reloaded.

In [18]:
# Location used to persist the (unfitted) GPU estimator.
estimator_path = "/tmp/kmeans-estimator"

In [19]:
# Save the estimator, replacing anything already stored at estimator_path,
# then load it back through the KMeans class.
gpu_kmeans.write().overwrite().save(estimator_path)
gpu_kmeans_loaded = KMeans.load(estimator_path)

                                                                                

### Fit

In [20]:
# Idle before and after the timed region
# (presumably to make this run easy to isolate in external monitoring — confirm).
time.sleep(5)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
gpu_model = gpu_kmeans_loaded.fit(df)
gpu_fit = time.perf_counter() - start_time
print(f"Fit took: {gpu_fit} sec")

time.sleep(5)

23/07/21 13:29:03 WARN TaskSetManager: Stage 50 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:04 WARN TaskSetManager: Stage 51 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Fit took: 10.781142711639404 sec


In [21]:
# Read the K parameter back from the reloaded estimator.
gpu_kmeans_loaded.getK()

200

In [22]:
# Convert every GPU center to a plain list, then order the centers lexicographically.
sorted_clusters = [center.tolist() for center in gpu_model.clusterCenters()]
sorted_clusters.sort()

In [23]:
# Display the first 10 coordinates of the first two sorted GPU centers.
[vec[0:10] for vec in sorted_clusters[0:2]]

[[-9.922629356384277,
  9.698698997497559,
  1.3664847612380981,
  -2.16220760345459,
  2.795397996902466,
  9.486640930175781,
  2.8987386226654053,
  4.726180553436279,
  7.606183052062988,
  2.3376855850219727],
 [-9.85255241394043,
  0.16838404536247253,
  2.27518892288208,
  -8.421260833740234,
  -9.01246452331543,
  4.740985870361328,
  0.44005265831947327,
  -1.697755217552185,
  -3.0606205463409424,
  9.689518928527832]]

### Transform

In [24]:
# Location used to persist the fitted GPU model.
model_path = "/tmp/kmeans-model"

In [25]:
# Save the fitted GPU model, replacing anything already stored at model_path.
gpu_model.write().overwrite().save(model_path)

23/07/21 13:29:20 WARN TaskSetManager: Stage 55 contains a task of very large size (1938 KiB). The maximum recommended task size is 1000 KiB.


In [26]:
# Obtain a reader via the fitted model's read() and use it to load the
# persisted model back from model_path.
gpu_model_loaded = gpu_model.read().load(model_path)

In [27]:
# Go through the public clusterCenters() accessor, exactly as is done for the
# freshly fitted model, rather than reading the cluster_centers_ attribute;
# the displayed slice can then be compared directly with the pre-save output.
[
    center[0:10]
    for center in sorted(vec.tolist() for vec in gpu_model_loaded.clusterCenters())[0:2]
]

[[-9.922629356384277,
  9.698698997497559,
  1.3664847612380981,
  -2.16220760345459,
  2.795397996902466,
  9.486640930175781,
  2.8987386226654053,
  4.726180553436279,
  7.606183052062988,
  2.3376855850219727],
 [-9.85255241394043,
  0.16838404536247253,
  2.27518892288208,
  -8.421260833740234,
  -9.01246452331543,
  4.740985870361328,
  0.44005265831947327,
  -1.697755217552185,
  -3.0606205463409424,
  9.689518928527832]]

In [28]:
# Name the prediction column "transformed" on the reloaded GPU model and score
# the array-typed df directly.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [29]:
# Print the schema of the scored DataFrame.
transformed_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- transformed: integer (nullable = true)



In [30]:
# Count the scored rows.
transformed_df.count()

23/07/21 13:29:20 WARN TaskSetManager: Stage 58 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100000

In [31]:
# Preview the first 10 scored rows.
transformed_df.show(10)

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[7.6050076, -1.89...|         16|
|[3.4293394, 5.031...|        127|
|[-7.0488377, 5.49...|         28|
|[4.565576, 0.8187...|         83|
|[3.6422796, -0.92...|        157|
|[-4.498247, -6.47...|         38|
|[4.664378, -4.254...|         74|
|[-7.5873113, -5.9...|         94|
|[8.582473, 1.4493...|        132|
|[5.786614, 5.5428...|         11|
+--------------------+-----------+
only showing top 10 rows



23/07/21 13:29:21 WARN TaskSetManager: Stage 61 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: CPU MinMaxScaler + CPU KMeans

In [32]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from pyspark.ml.clustering import KMeans

In [33]:
# Scale "features" into "scaled_features" ahead of clustering.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Reuse the shared builder so tol, K and maxIter stay identical to the
# standalone runs; only the input and prediction column names are overridden
# for the pipeline.
cpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [34]:
# Chain the scaler and the CPU KMeans estimator into a two-stage pipeline.
pipe = Pipeline(stages=[scaler, cpu_kmeans])

In [35]:
# Idle before and after the timed region
# (presumably to make this run easy to isolate in external monitoring — confirm).
time.sleep(5)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_cpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_cpu_pipe} sec")

time.sleep(5)

23/07/21 13:29:26 WARN TaskSetManager: Stage 62 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:29 WARN TaskSetManager: Stage 65 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:32 WARN TaskSetManager: Stage 68 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:36 WARN TaskSetManager: Stage 69 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:37 WARN TaskSetManager: Stage 70 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:38 WARN TaskSetManager: Stage 71 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:29:39 WARN TaskSetManager: Stage 72 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/

Fit took: 133.2880301475525 sec


In [36]:
# Report the iteration count recorded by the pipeline's second stage (the KMeans model).
print(f"KMeans numIter: {pipe_model.stages[1].summary.numIter}")

KMeans numIter: 15


In [37]:
# Apply the fitted pipeline to vector_df and preview the first 10 rows.
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.78957140222302...|         33|
|[3.42933940887451...|[0.63245732071398...|         72|
|[-7.0488376617431...|[0.23820445581219...|         29|
|[4.56557607650756...|[0.67520946723735...|         38|
|[3.64227962493896...|[0.64046942908650...|         48|
|[-4.4982471466064...|[0.33417321173289...|         57|
|[4.66437816619873...|[0.67892700385240...|         88|
|[-7.5873112678527...|[0.21794379783209...|        142|
|[8.58247280120849...|[0.82634959696171...|        115|
|[5.78661394119262...|[0.72115235148728...|         17|
+--------------------+--------------------+-----------+
only showing top 10 rows



23/07/21 13:31:45 WARN TaskSetManager: Stage 124 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: CPU MinMaxScaler + GPU KMeans

In [38]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from spark_rapids_ml.clustering import KMeans

In [39]:
# Scale "features" into "scaled_features" ahead of clustering.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Reuse the shared builder so tol, K and maxIter stay identical to the
# standalone runs; only the input and prediction column names are overridden
# for the pipeline.
gpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [40]:
# Chain the scaler and the GPU KMeans estimator into a two-stage pipeline.
pipe = Pipeline(stages=[scaler, gpu_kmeans])

In [41]:
# Idle before and after the timed region
# (presumably to make this run easy to isolate in external monitoring — confirm).
time.sleep(5)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_gpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_gpu_pipe} sec")

time.sleep(5)

23/07/21 13:31:50 WARN TaskSetManager: Stage 125 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:31:52 WARN TaskSetManager: Stage 128 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:31:52 WARN TaskSetManager: Stage 129 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Fit took: 35.031697511672974 sec


In [42]:
# Apply the fitted pipeline to vector_df and preview the first 10 rows.
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.78957140222302...|         71|
|[3.42933940887451...|[0.63245732071398...|        199|
|[-7.0488376617431...|[0.23820445581219...|        138|
|[4.56557607650756...|[0.67520946723735...|          2|
|[3.64227962493896...|[0.64046942908650...|         24|
|[-4.4982471466064...|[0.33417321173289...|         61|
|[4.66437816619873...|[0.67892700385240...|         80|
|[-7.5873112678527...|[0.21794379783209...|         65|
|[8.58247280120849...|[0.82634959696171...|        112|
|[5.78661394119262...|[0.72115235148728...|          9|
+--------------------+--------------------+-----------+
only showing top 10 rows



23/07/21 13:32:30 WARN TaskSetManager: Stage 132 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: GPU MinMaxScaler + GPU KMeans

In [43]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from spark_rapids_ml.feature import MinMaxScaler
from spark_rapids_ml.clustering import KMeans

In [44]:
# Scale "features" into "scaled_features" ahead of clustering.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Reuse the shared builder so tol, K and maxIter stay identical to the
# standalone runs; only the input and prediction column names are overridden
# for the pipeline.
gpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [45]:
# Chain the scaler and the GPU KMeans estimator into a two-stage pipeline.
pipe = Pipeline(stages=[scaler, gpu_kmeans])

In [46]:
# Idle before and after the timed region
# (presumably to make this run easy to isolate in external monitoring — confirm).
time.sleep(5)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
gpu_gpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {gpu_gpu_pipe} sec")

time.sleep(5)

23/07/21 13:32:35 WARN TaskSetManager: Stage 133 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:32:35 WARN TaskSetManager: Stage 134 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:32:45 WARN TaskSetManager: Stage 137 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
23/07/21 13:32:46 WARN TaskSetManager: Stage 138 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Fit took: 46.01442575454712 sec


In [47]:
# Apply the fitted pipeline to vector_df and preview the first 10 rows.
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

23/07/21 13:33:27 WARN TaskSetManager: Stage 141 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
[Stage 141:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.79968563934500...|         28|
|[3.42933940887451...|[0.64055896078507...|        179|
|[-7.0488376617431...|[0.24125580283769...|         25|
|[4.56557607650756...|[0.68385875296301...|          2|
|[3.64227962493896...|[0.64867370251500...|        125|
|[-4.4982471466064...|[0.33845389755024...|         76|
|[4.66437816619873...|[0.68762391040973...|        145|
|[-7.5873112678527...|[0.22073561025631...|        115|
|[8.58247280120849...|[0.83693495472136...|         35|
|[5.78661394119262...|[0.73039015552055...|          3|
+--------------------+--------------------+-----------+
only showing top 10 rows



                                                                                

### Summary

In [48]:
# Print every recorded fit duration, one per line, with the label left-aligned
# in a 16-character column.
for label, seconds in (
    ("cpu_fit:", cpu_fit),
    ("gpu_fit:", gpu_fit),
    ("cpu_cpu_pipe:", cpu_cpu_pipe),
    ("cpu_gpu_pipe:", cpu_gpu_pipe),
    ("gpu_gpu_pipe:", gpu_gpu_pipe),
):
    print(f"{label:16} {seconds}")

cpu_fit:         116.45441722869873
gpu_fit:         10.781142711639404
cpu_cpu_pipe:    133.2880301475525
cpu_gpu_pipe:    35.031697511672974
gpu_gpu_pipe:    46.01442575454712
