# KMeans

In [1]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [2]:
from cuml.datasets import make_blobs

# Size and spread of the synthetic dataset.
n_rows = 100_000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0
dtype = "float32"

# make_blobs creates a random dataset of isotropic Gaussian blobs; the second
# return value (the per-sample blob labels) is not needed for clustering.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
    dtype=dtype,
)

# NOTE(review): .get() presumably copies the GPU (device) array back to a host
# NumPy array so pandas can consume it below -- confirm against the cuml docs.
data = data.get()



### Convert dataset to Spark DataFrame

In [3]:
# One row per sample: the single "features" column holds that sample's full
# feature vector, which Spark maps to an array column.
pd_data = pd.DataFrame({"features": [sample for sample in data]})
df = spark.createDataFrame(pd_data)

In [4]:
df.schema

StructType([StructField('features', ArrayType(FloatType(), True), True)])

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) KMeans estimator objects, demonstrating the common API

In [5]:
def build_kmeans_estimator(
    estimator_class,
    k=200,
    tol=1.0e-20,
    max_iter=15,
    features_col="features",
):
    """Instantiate and configure a KMeans estimator through the shared setter API.

    The same call works for any estimator class exposing ``setTol``, ``setK``,
    ``setFeaturesCol`` and ``setMaxIter`` (each returning the estimator), which
    is what lets the GPU and CPU implementations be built identically.

    Args:
        estimator_class: Zero-argument callable returning the estimator to configure.
        k: Number of clusters to fit.
        tol: Convergence tolerance passed to ``setTol``.
        max_iter: Maximum number of iterations passed to ``setMaxIter``.
        features_col: Name of the input features column.

    Returns:
        The configured (unfitted) estimator instance.
    """
    return (
        estimator_class()
        .setTol(tol)
        .setK(k)
        .setFeaturesCol(features_col)
        .setMaxIter(max_iter)
    )

## Spark RAPIDS ML (GPU)

In [6]:
# Spark RAPIDS ML (GPU) KMeans. NOTE: the bare name `KMeans` is re-imported from
# pyspark.ml.clustering further down, so re-run this cell before re-running any
# later cell that expects the GPU class (e.g. the KMeans.load call below).
from spark_rapids_ml.clustering import KMeans
gpu_kmeans = build_kmeans_estimator(KMeans)

The estimator can be persisted and reloaded.

In [7]:
estimator_path = "/tmp/kmeans-estimator"

In [8]:
# Round-trip the still-unfitted estimator through disk. overwrite() lets the
# save succeed when estimator_path already exists, so the cell can be re-run.
gpu_kmeans.write().overwrite().save(estimator_path)
# The reloaded copy (not gpu_kmeans) is the estimator that gets fitted below.
gpu_kmeans_loaded = KMeans.load(estimator_path)

                                                                                

### Fit

In [9]:
# Time the GPU fit with time.perf_counter(): it is a monotonic, high-resolution
# clock, so the elapsed value cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
gpu_model = gpu_kmeans_loaded.fit(df)
gpu_fit = time.perf_counter() - start_time
print(f"Fit took: {gpu_fit} sec")

23/07/20 16:12:35 WARN TaskSetManager: Stage 2 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:12:37 WARN TaskSetManager: Stage 3 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
[Stage 5:>                                                          (0 + 2) / 2]

Fit took: 13.370030879974365 sec


                                                                                

In [10]:
gpu_kmeans_loaded.getK()

200

In [11]:
sorted_clusters = sorted([vec.tolist() for vec in gpu_model.clusterCenters()])

In [12]:
[vec[0:10] for vec in sorted_clusters[0:2]]

[[-9.925885200500488,
  9.390859603881836,
  8.889856338500977,
  -5.8428778648376465,
  -1.2607864141464233,
  3.218362331390381,
  -9.877833366394043,
  -4.180136203765869,
  -3.7051053047180176,
  -3.4839558601379395],
 [-9.922629356384277,
  9.698697090148926,
  1.366484522819519,
  -2.162208080291748,
  2.795397996902466,
  9.486637115478516,
  2.898738384246826,
  4.726179599761963,
  7.606182098388672,
  2.337686538696289]]

### Transform

In [13]:
model_path = "/tmp/kmeans-model"

In [14]:
gpu_model.write().overwrite().save(model_path)

23/07/20 16:12:49 WARN TaskSetManager: Stage 7 contains a task of very large size (1937 KiB). The maximum recommended task size is 1000 KiB.


In [15]:
# Restore the model just saved to model_path. The reader is obtained from the
# fitted model object via read(), so no model-class import is needed here.
gpu_model_loaded = gpu_model.read().load(model_path)

In [16]:
# Inspect the reloaded model through the public clusterCenters() accessor, the
# same way the freshly fitted model was inspected above, so both previews go
# through one code path and can be compared directly.
[
    center[0:10]
    for center in sorted(vec.tolist() for vec in gpu_model_loaded.clusterCenters())[0:2]
]

[[-9.925885200500488,
  9.390859603881836,
  8.889856338500977,
  -5.8428778648376465,
  -1.2607864141464233,
  3.218362331390381,
  -9.877833366394043,
  -4.180136203765869,
  -3.7051053047180176,
  -3.4839558601379395],
 [-9.922629356384277,
  9.698697090148926,
  1.366484522819519,
  -2.162208080291748,
  2.795397996902466,
  9.486637115478516,
  2.898738384246826,
  4.726179599761963,
  7.606182098388672,
  2.337686538696289]]

In [17]:
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [18]:
transformed_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- transformed: integer (nullable = true)



In [19]:
transformed_df.count()

23/07/20 16:12:50 WARN TaskSetManager: Stage 10 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100000

In [20]:
transformed_df.show(10)

23/07/20 16:12:51 WARN TaskSetManager: Stage 13 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[7.6050076, -1.89...|        190|
|[3.4293394, 5.031...|         60|
|[-7.0488377, 5.49...|         25|
|[4.565576, 0.8187...|          7|
|[3.6422796, -0.92...|         48|
|[-4.498247, -6.47...|         21|
|[4.664378, -4.254...|         20|
|[-7.5873113, -5.9...|        173|
|[8.582473, 1.4493...|        144|
|[5.786614, 5.5428...|         68|
+--------------------+-----------+
only showing top 10 rows



## Spark ML (CPU)

In [21]:
# Spark ML (CPU) KMeans. NOTE: this import rebinds the name `KMeans`, which up
# to this point referred to the spark_rapids_ml (GPU) class; the estimator is
# built with exactly the same helper call as the GPU one.
from pyspark.ml.clustering import KMeans
cpu_kmeans = build_kmeans_estimator(KMeans)

Convert the array-typed `features` column into the VectorUDT DataFrame format expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-type DataFrame used above and a scalar-column format; see the docs.)

In [22]:
from pyspark.ml.functions import array_to_vector

In [23]:
vector_df = df.select(array_to_vector("features").alias("features"))

### Fit

In [24]:
# Time the CPU fit with time.perf_counter(): it is a monotonic, high-resolution
# clock, so the elapsed value cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
cpu_kmeans_model = cpu_kmeans.fit(vector_df)
cpu_fit = time.perf_counter() - start_time
print(f"Fit took: {cpu_fit} sec")

23/07/20 16:12:52 WARN TaskSetManager: Stage 14 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:12:56 WARN TaskSetManager: Stage 17 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:12:59 WARN TaskSetManager: Stage 18 contains a task of very large size (19578 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:12:59 WARN TaskSetManager: Stage 19 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:13:00 WARN TaskSetManager: Stage 20 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:13:01 WARN TaskSetManager: Stage 21 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:13:10 WARN TaskSetManager: Stage 22 contains a task of very large size (19578 KiB). The maximum recommended task size is 1000 KiB.

Fit took: 89.80326724052429 sec


                                                                                

In [25]:
type(cpu_kmeans_model.clusterCenters()[0])

numpy.ndarray

In [26]:
# Convert each center to a plain list so the centers can be ordered
# lexicographically, then preview the first 10 coordinates of the first two.
sorted_cpu_cluster_centers = sorted(
    center.tolist() for center in cpu_kmeans_model.clusterCenters()
)
[center[:10] for center in sorted_cpu_cluster_centers[:2]]

[[-9.925886767809509,
  9.39085782649087,
  8.889854954891518,
  -5.842879385244651,
  -1.2607862297140184,
  3.21836230549656,
  -9.877833088890451,
  -4.180136888730722,
  -3.705105688483989,
  -3.483956264912105],
 [-9.922630417553675,
  9.698699070998469,
  1.3664845470518505,
  -2.162207881150218,
  2.7953974895164917,
  9.486636707548461,
  2.898738367709121,
  4.7261793167834565,
  7.6061832257088895,
  2.3376862421660523]]

### Transform

In [27]:
spark_transformed = cpu_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [28]:
spark_transformed.filter(spark_transformed.transformed >= 0).count()

23/07/20 16:14:22 WARN TaskSetManager: Stage 61 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100000

In [29]:
spark_transformed.show(10)

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[7.60500764846801...|         49|
|[3.42933940887451...|        158|
|[-7.0488376617431...|        123|
|[4.56557607650756...|        180|
|[3.64227962493896...|         18|
|[-4.4982471466064...|        145|
|[4.66437816619873...|        197|
|[-7.5873112678527...|         74|
|[8.58247280120849...|         61|
|[5.78661394119262...|         12|
+--------------------+-----------+
only showing top 10 rows



23/07/20 16:14:28 WARN TaskSetManager: Stage 64 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: CPU MinMaxScaler + CPU KMeans

In [30]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from pyspark.ml.clustering import KMeans

In [31]:
# Stage 1: rescale the raw features into 'scaled_features'.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Stage 2: reuse the shared builder so tol / k / maxIter stay in sync with the
# standalone runs; only the column wiring differs inside the pipeline.
cpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [32]:
pipe = Pipeline(stages=[scaler, cpu_kmeans])

In [33]:
# Time the CPU scaler + CPU KMeans pipeline fit with the monotonic
# time.perf_counter() clock, which is unaffected by system clock adjustments.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_cpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_cpu_pipe} sec")

23/07/20 16:14:28 WARN TaskSetManager: Stage 65 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:30 WARN TaskSetManager: Stage 68 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:32 WARN TaskSetManager: Stage 71 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:35 WARN TaskSetManager: Stage 72 contains a task of very large size (19578 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:36 WARN TaskSetManager: Stage 73 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:36 WARN TaskSetManager: Stage 74 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:14:37 WARN TaskSetManager: Stage 75 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.

Fit took: 101.7600085735321 sec


                                                                                

In [34]:
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.78957140222302...|        171|
|[3.42933940887451...|[0.63245732071398...|        189|
|[-7.0488376617431...|[0.23820445581219...|        124|
|[4.56557607650756...|[0.67520946723735...|         26|
|[3.64227962493896...|[0.64046942908650...|         80|
|[-4.4982471466064...|[0.33417321173289...|        184|
|[4.66437816619873...|[0.67892700385240...|         18|
|[-7.5873112678527...|[0.21794379783209...|        192|
|[8.58247280120849...|[0.82634959696171...|        179|
|[5.78661394119262...|[0.72115235148728...|         33|
+--------------------+--------------------+-----------+
only showing top 10 rows



23/07/20 16:16:10 WARN TaskSetManager: Stage 127 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: CPU MinMaxScaler + GPU KMeans

In [35]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from spark_rapids_ml.clustering import KMeans

In [36]:
# Stage 1: rescale the raw features into 'scaled_features'.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Stage 2: reuse the shared builder so tol / k / maxIter stay in sync with the
# standalone runs; only the column wiring differs inside the pipeline.
gpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [37]:
pipe = Pipeline(stages=[scaler, gpu_kmeans])

In [38]:
# Time the CPU scaler + GPU KMeans pipeline fit with the monotonic
# time.perf_counter() clock, which is unaffected by system clock adjustments.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_gpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_gpu_pipe} sec")

23/07/20 16:16:11 WARN TaskSetManager: Stage 128 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:16:12 WARN TaskSetManager: Stage 131 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:16:12 WARN TaskSetManager: Stage 132 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
[Stage 134:>                                                        (0 + 2) / 2]

Fit took: 34.3448269367218 sec


                                                                                

In [39]:
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.78957140222302...|        198|
|[3.42933940887451...|[0.63245732071398...|         67|
|[-7.0488376617431...|[0.23820445581219...|        146|
|[4.56557607650756...|[0.67520946723735...|         33|
|[3.64227962493896...|[0.64046942908650...|         74|
|[-4.4982471466064...|[0.33417321173289...|        148|
|[4.66437816619873...|[0.67892700385240...|         17|
|[-7.5873112678527...|[0.21794379783209...|        105|
|[8.58247280120849...|[0.82634959696171...|        133|
|[5.78661394119262...|[0.72115235148728...|        107|
+--------------------+--------------------+-----------+
only showing top 10 rows



23/07/20 16:16:45 WARN TaskSetManager: Stage 135 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.


## Pipeline: GPU MinMaxScaler + GPU KMeans

In [40]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from spark_rapids_ml.feature import MinMaxScaler
from spark_rapids_ml.clustering import KMeans

In [41]:
# Stage 1: rescale the raw features into 'scaled_features'.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
# Stage 2: reuse the shared builder so tol / k / maxIter stay in sync with the
# standalone runs; only the column wiring differs inside the pipeline.
gpu_kmeans = (
    build_kmeans_estimator(KMeans)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
)

In [42]:
pipe = Pipeline(stages=[scaler, gpu_kmeans])

In [43]:
# Time the GPU scaler + GPU KMeans pipeline fit with the monotonic
# time.perf_counter() clock, which is unaffected by system clock adjustments.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
gpu_gpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {gpu_gpu_pipe} sec")

23/07/20 16:16:45 WARN TaskSetManager: Stage 136 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:16:45 WARN TaskSetManager: Stage 137 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:16:55 WARN TaskSetManager: Stage 140 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
23/07/20 16:16:56 WARN TaskSetManager: Stage 141 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Fit took: 43.08415985107422 sec


In [44]:
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(10)

23/07/20 16:17:29 WARN TaskSetManager: Stage 144 contains a task of very large size (19577 KiB). The maximum recommended task size is 1000 KiB.
[Stage 144:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.79968568302477...|         99|
|[3.42933940887451...|[0.64055903916338...|         26|
|[-7.0488376617431...|[0.24125596828650...|        150|
|[4.56557607650756...|[0.68385882189954...|         82|
|[3.64227962493896...|[0.64867377912384...|        175|
|[-4.4982471466064...|[0.33845404180440...|        197|
|[4.66437816619873...|[0.68762397852525...|        105|
|[-7.5873112678527...|[0.22073578017967...|         98|
|[8.58247280120849...|[0.83693499027869...|        148|
|[5.78661394119262...|[0.73039021431062...|         21|
+--------------------+--------------------+-----------+
only showing top 10 rows



                                                                                

### Summary

In [46]:
# One line per measurement: the label left-aligned in a 16-character field,
# followed by the elapsed seconds recorded by the corresponding fit cell.
print(
    "\n".join(
        f"{label:16} {seconds}"
        for label, seconds in (
            ("cpu_fit:", cpu_fit),
            ("gpu_fit:", gpu_fit),
            ("cpu_cpu_pipe:", cpu_cpu_pipe),
            ("cpu_gpu_pipe:", cpu_gpu_pipe),
            ("gpu_gpu_pipe:", gpu_gpu_pipe),
        )
    )
)

cpu_fit:         89.80326724052429
gpu_fit:         13.370030879974365
cpu_cpu_pipe:    101.7600085735321
cpu_gpu_pipe:    34.3448269367218
gpu_gpu_pipe:    43.08415985107422
