# KMeans

In [1]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [2]:
from cuml.datasets import make_blobs

# Size and spread of the synthetic dataset.
n_rows = 100000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0
dtype = 'float32'

# make_blobs creates a random dataset of isotropic gaussian blobs.
# Only the first returned value is kept; the second is bound to the throwaway name `_`.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
    dtype=dtype,
)

# NOTE(review): .get() presumably copies the GPU-resident array to a host NumPy array — confirm.
data = data.get()

### Convert dataset to Spark DataFrame

In [3]:
# One pandas row per sample; the single "features" column holds that sample's array.
pd_data = pd.DataFrame(data={"features": [row for row in data]})

# NOTE(review): `spark` is not created in the visible cells — presumably the
# SparkSession supplied by the kernel/pyspark shell; confirm.
df = spark.createDataFrame(data=pd_data)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [4]:
# Show the Spark schema of `df`, the DataFrame created from the pandas frame.
df.schema

StructType([StructField('features', ArrayType(FloatType(), True), True)])

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) KMeans estimator objects, demonstrating the common API

In [5]:
def build_kmeans_estimator(estimator_class, k=200, tol=1.0e-20, max_iter=15, features_col="features"):
    """Instantiate and configure a KMeans estimator through the Spark ML setter API.

    The same call works for any class exposing ``setTol``, ``setK``,
    ``setFeaturesCol`` and ``setMaxIter`` (each returning the estimator), which
    is what lets one helper configure both the GPU and the CPU estimator.

    Args:
        estimator_class: Estimator class to instantiate; called with no arguments.
        k: Number of clusters passed to ``setK``. Defaults to 200.
        tol: Convergence tolerance passed to ``setTol``. Defaults to 1.0e-20.
        max_iter: Iteration cap passed to ``setMaxIter``. Defaults to 15.
        features_col: Input column name passed to ``setFeaturesCol``.
            Defaults to "features".

    Returns:
        The value returned by the last setter in the chain, i.e. the
        configured estimator.
    """
    return (
            estimator_class()
            .setTol(tol)
            .setK(k)
            .setFeaturesCol(features_col)
            .setMaxIter(max_iter)
           )

## Spark RAPIDS ML (GPU)

In [6]:
# Import the Spark RAPIDS ML KMeans class and configure it through the shared helper.
from spark_rapids_ml.clustering import KMeans

gpu_kmeans = build_kmeans_estimator(estimator_class=KMeans)

Estimator can be persisted and reloaded.

In [7]:
# Path the estimator is written to and loaded back from in the next cell.
estimator_path = "/tmp/kmeans-estimator"

In [8]:
# Save the configured estimator to estimator_path (overwrite() is requested on the
# writer before saving), then load it back through the KMeans class.
gpu_kmeans.write().overwrite().save(estimator_path)
gpu_kmeans_loaded = KMeans.load(estimator_path)

                                                                                

### Fit

In [9]:
# Time the GPU fit of the reloaded estimator. perf_counter() is a monotonic,
# high-resolution clock intended for measuring durations, so the result is not
# affected by system clock adjustments the way time.time() differences can be.
start_time = time.perf_counter()
gpu_model = gpu_kmeans_loaded.fit(df)
gpu_fit = time.perf_counter() - start_time
print(f"Fit took: {gpu_fit} sec")

23/05/24 14:50:02 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:50:02 WARN TaskSetManager: Stage 2 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.
23/05/24 14:50:03 WARN GpuOverrides: 
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <Attrib

[Stage 5:>                                                          (0 + 1) / 1]

Fit took: 12.810078859329224 sec


                                                                                

In [10]:
# Read K back from the reloaded estimator.
gpu_kmeans_loaded.getK()

200

In [11]:
# Convert each center to a plain Python list and sort the centers so they can be
# compared across models.
sorted_clusters = sorted(center.tolist() for center in gpu_model.clusterCenters())

In [12]:
# Preview: first 10 coordinates of the first 2 sorted centers.
[center[:10] for center in sorted_clusters[:2]]

[[-9.925885200500488,
  9.390853881835938,
  8.889856338500977,
  -5.84288215637207,
  -1.260785698890686,
  3.2183611392974854,
  -9.87783432006836,
  -4.180136203765869,
  -3.705103874206543,
  -3.4839558601379395],
 [-9.922632217407227,
  9.698699951171875,
  1.3664839267730713,
  -2.1622085571289062,
  2.7953970432281494,
  9.4866361618042,
  2.8987371921539307,
  4.72617769241333,
  7.606184005737305,
  2.337686538696289]]

### Transform

In [13]:
# Path the fitted GPU model is written to and loaded back from in the following cells.
model_path = "/tmp/kmeans-model"

In [14]:
# Save the fitted GPU model to model_path (overwrite() is requested on the writer before saving).
gpu_model.write().overwrite().save(model_path)

23/05/24 14:50:15 WARN TaskSetManager: Stage 7 contains a task of very large size (1935 KiB). The maximum recommended task size is 1000 KiB.


In [15]:
# Load the model saved at model_path; the reader is obtained from the fitted
# model instance via read().
gpu_model_loaded = gpu_model.read().load(model_path)

In [16]:
# Read the centers through clusterCenters(), the accessor this notebook uses for
# both the fitted GPU model and the Spark ML model, instead of reaching into the
# cluster_centers_ attribute. The preview is expected to match the one shown for
# the model before the save/load round trip.
loaded_sorted_clusters = sorted(vec.tolist() for vec in gpu_model_loaded.clusterCenters())
[vec[0:10] for vec in loaded_sorted_clusters[0:2]]

[[-9.925885200500488,
  9.390853881835938,
  8.889856338500977,
  -5.84288215637207,
  -1.260785698890686,
  3.2183611392974854,
  -9.87783432006836,
  -4.180136203765869,
  -3.705103874206543,
  -3.4839558601379395],
 [-9.922632217407227,
  9.698699951171875,
  1.3664839267730713,
  -2.1622085571289062,
  2.7953970432281494,
  9.4866361618042,
  2.8987371921539307,
  4.72617769241333,
  7.606184005737305,
  2.337686538696289]]

In [17]:
# Name the prediction column "transformed" on the reloaded model, then apply it to `df`.
transformed_df = (
    gpu_model_loaded
    .setPredictionCol("transformed")
    .transform(df)
)

In [18]:
# Print the schema of the transformed DataFrame.
transformed_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- transformed: integer (nullable = true)



In [19]:
# Count the rows of the transformed DataFrame.
transformed_df.count()

23/05/24 14:50:16 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:50:16 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:50:16 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:50:16 WARN GpuOverrides: 
      ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
        @Expression <AttributeReference> features#0 could run 

                                                                                

100000

In [20]:
# Display the first 10 rows, including the "transformed" prediction column.
transformed_df.show(n=10)

23/05/24 14:50:18 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
      ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
        @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:50:18 WARN TaskSetManager: Stage 13 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


[Stage 13:>                                                         (0 + 1) / 1]

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[7.605007648, -1....|         91|
|[3.429339409, 5.0...|        147|
|[-7.048837662, 5....|        169|
|[4.565576077, 0.8...|         73|
|[3.642279625, -0....|         47|
|[-4.498247147, -6...|        178|
|[4.664378166, -4....|         94|
|[-7.587311268, -5...|         48|
|[8.582472801, 1.4...|         28|
|[5.786613941, 5.5...|        132|
+--------------------+-----------+
only showing top 10 rows



                                                                                

## Spark ML (CPU)

In [21]:
# Import the Spark ML (CPU) implementation under an alias so the name `KMeans`
# is not rebound away from the Spark RAPIDS ML class imported earlier. Without
# the alias, re-running an earlier cell such as `KMeans.load(estimator_path)`
# would silently use the CPU class.
from pyspark.ml.clustering import KMeans as SparkMLKMeans

cpu_kmeans = build_kmeans_estimator(SparkMLKMeans)

Convert the array-typed DataFrame to the VectorUDT DataFrame expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-type DataFrame used above and a scalar-column format — see the docs.)

In [22]:
from pyspark.ml.functions import array_to_vector

In [23]:
# Build the vector-typed copy of `df`: convert the array column with
# array_to_vector and keep the column name "features".
vector_df = df.select(array_to_vector("features").alias("features"))

### Fit

In [24]:
# Time the Spark ML (CPU) fit. perf_counter() is a monotonic, high-resolution
# clock intended for measuring durations, so the result is not affected by
# system clock adjustments the way time.time() differences can be.
start_time = time.perf_counter()
cpu_kmeans_model = cpu_kmeans.fit(vector_df)
cpu_fit = time.perf_counter() - start_time
print(f"Fit took: {cpu_fit} sec")

23/05/24 14:50:23 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <CreateExternalRow> createexternalrow(newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow
    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke
      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance
     

                                                                                

23/05/24 14:50:29 WARN TaskSetManager: Stage 15 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:50:30 WARN TaskSetManager: Stage 16 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:50:31 WARN TaskSetManager: Stage 17 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:50:31 WARN TaskSetManager: Stage 18 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:50:49 WARN TaskSetManager: Stage 19 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:50:50 WARN TaskSetManager: Stage 20 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:51:25 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/24 14:51:25 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

23/05/24 14:51:27 WARN TaskSetManager: Stage 23 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:51:35 WARN TaskSetManager: Stage 26 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:51:42 WARN TaskSetManager: Stage 29 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:51:50 WARN TaskSetManager: Stage 32 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:51:58 WARN TaskSetManager: Stage 35 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:52:05 WARN TaskSetManager: Stage 38 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:52:12 WARN TaskSetManager: Stage 41 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:52:20 WARN TaskSetManager: Stage 44 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:52:27 WARN TaskSetManager: Stage 47 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:52:33 WARN GpuOverrides: 
      !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
        @Expression <Alias> UDF(UDF(features#0)) AS prediction#58 could run on GPU
          !Expression <ScalaUDF> UDF(UDF(features#0)) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.clustering.KMeansModel$$Lambda$4038/2042610068 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; param expression ScalaUDF UDF(features#0) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
            !Expression <ScalaUDF> UDF(features#0) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3697/1027037449 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(features#0) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7
              @Expression <AttributeRe



Fit took: 143.9151623249054 sec


                                                                                

In [25]:
# Inspect the Python type of a single cluster center returned by the Spark ML model.
type(cpu_kmeans_model.clusterCenters()[0])

numpy.ndarray

In [26]:
# Sort the CPU model's centers (as plain lists) the same way as the GPU centers,
# then preview the first 10 coordinates of the first 2.
sorted_cpu_cluster_centers = sorted(center.tolist() for center in cpu_kmeans_model.clusterCenters())
[center[:10] for center in sorted_cpu_cluster_centers[:2]]

[[-9.925886767809509,
  9.39085782649087,
  8.889854954891518,
  -5.842879385244651,
  -1.2607862297140184,
  3.21836230549656,
  -9.877833088890451,
  -4.180136888730722,
  -3.705105688483989,
  -3.483956264912105],
 [-9.852554267680151,
  0.16838401963078295,
  2.2751893251840416,
  -8.421260306115688,
  -9.012462378958979,
  4.740987099604259,
  0.440052602359294,
  -1.6977547910321629,
  -3.060619864473211,
  9.689517942876272]]

### Transform

In [27]:
# Name the prediction column "transformed" on the CPU model, then apply it to
# the vector-typed DataFrame.
spark_transformed = (
    cpu_kmeans_model
    .setPredictionCol("transformed")
    .transform(vector_df)
)

In [28]:
# Count the rows whose predicted cluster id is non-negative.
spark_transformed.where(spark_transformed["transformed"] >= 0).count()

23/05/24 14:52:47 WARN GpuOverrides: 
        !Exec <FilterExec> cannot run on GPU because not all expressions can be replaced
          @Expression <GreaterThanOrEqual> (UDF(UDF(features#0)) >= 0) could run on GPU
            !Expression <ScalaUDF> UDF(UDF(features#0)) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.clustering.KMeansModel$$Lambda$4038/2042610068 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; param expression ScalaUDF UDF(features#0) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
              !Expression <ScalaUDF> UDF(features#0) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3697/1027037449 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(features#0) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7
                @Expression 

                                                                                

100000

In [29]:
# Display the first 10 rows of the CPU transform result.
spark_transformed.show(n=10)

23/05/24 14:53:00 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [features#45]
    @Expression <Alias> cast(features#45 as string) AS features#113 could run on GPU
      !Expression <Cast> cast(features#45 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported
        !Expression <AttributeReference> features#45 cannot run on GPU because expression AttributeReference featu

## Pipeline: CPU MinMaxScaler + GPU KMeans

Note: cuML has a [MinMaxScaler](https://docs.rapids.ai/api/cuml/nightly/api.html#cuml.preprocessing.MinMaxScaler), but it needs to be exposed as a [Spark ML Transformer](https://spark.apache.org/docs/latest/ml-pipeline.html#transformers) (not sure of potential impact on performance).

In [30]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from spark_rapids_ml.clustering import KMeans

In [31]:
# First stage: MinMaxScaler reads "features" and writes "scaled_features".
scaler = MinMaxScaler(
    outputCol='scaled_features',
    inputCol='features',
)

# Second stage: KMeans clusters the scaled column and writes its predictions to
# "transformed". Column wiring is set first, then the hyperparameters.
gpu_kmeans = (
    KMeans()
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
    .setK(200)
    .setMaxIter(15)
    .setTol(1.0e-20)
)

In [32]:
# Two-stage pipeline: the scaler is listed first, followed by the KMeans estimator.
pipe = Pipeline(stages=[scaler, gpu_kmeans])

In [33]:
# Time the fit of the scaler + GPU KMeans pipeline. perf_counter() is a
# monotonic, high-resolution clock intended for measuring durations, so the
# result is not affected by system clock adjustments the way time.time()
# differences can be.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_gpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_gpu_pipe} sec")

23/05/24 14:53:01 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because unsupported data types in input: StructType(StructField(max,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false),StructField(min,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false)) [summary#121]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [max#123, min#124]; not all expressions can be replaced
  !Expression <Alias> summary#121.max AS max#123 cannot run on GPU because input expression GetStructField summary#121.max (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias summary#121.max AS max#123 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7
    !Expression <GetStructField> summary#121.max cannot run on GPU because input expression AttributeReference summary#121 (child org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported, child org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expressio



23/05/24 14:53:04 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because unsupported data types in input: StructType(StructField(max,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false),StructField(min,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false)) [summary#121]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [max#123, min#124]; not all expressions can be replaced
  !Expression <Alias> summary#121.max AS max#123 cannot run on GPU because input expression GetStructField summary#121.max (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias summary#121.max AS max#123 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7
    !Expression <GetStructField> summary#121.max cannot run on GPU because input expression AttributeReference summary#121 (child org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported, child org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expressio

                                                                                

23/05/24 14:53:04 WARN TaskSetManager: Stage 59 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.
23/05/24 14:53:04 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
  @Expression <Alias> UDF(UDF(UDF(features#0))) AS cuml_values#143 could run on GPU
    !Expression <ScalaUDF> UDF(UDF(UDF(features#0))) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3658/1023321880 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; param expression ScalaUDF UDF(UDF(features#0)) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
      !Expression <ScalaUDF> UDF(UDF(features#0)) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.feature.MinMaxScalerModel$$Lambda$4233/1510773388 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expre

[Stage 62:>                                                         (0 + 1) / 1]

Fit took: 18.478280305862427 sec


                                                                                

In [34]:
# Run the fitted pipeline over the vector-typed DataFrame and display the first 10 rows.
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(n=10)

23/05/24 14:53:19 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [features#45]
    @Expression <Alias> cast(features#45 as string) AS features#178 could run on GPU
      !Expression <Cast> cast(features#45 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported
        !Expression <AttributeReference> features#45 cannot run on GPU because expression AttributeReference featu

[Stage 63:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-----------+
|            features|     scaled_features|transformed|
+--------------------+--------------------+-----------+
|[7.60500764846801...|[0.78957140222302...|        194|
|[3.42933940887451...|[0.63245732071398...|         80|
|[-7.0488376617431...|[0.23820445581219...|         63|
|[4.56557607650756...|[0.67520946723735...|          6|
|[3.64227962493896...|[0.64046942908650...|        198|
|[-4.4982471466064...|[0.33417321173289...|         61|
|[4.66437816619873...|[0.67892700385240...|        132|
|[-7.5873112678527...|[0.21794379783209...|         83|
|[8.58247280120849...|[0.82634959696171...|         97|
|[5.78661394119262...|[0.72115235148728...|        169|
+--------------------+--------------------+-----------+
only showing top 10 rows



                                                                                

## Pipeline: CPU MinMaxScaler + CPU KMeans

In [35]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.functions import array_to_vector

from pyspark.ml.clustering import KMeans

In [36]:
# First stage: MinMaxScaler reads "features" and writes "scaled_features".
scaler = MinMaxScaler(
    outputCol='scaled_features',
    inputCol='features',
)

# Second stage: KMeans clusters the scaled column and writes its predictions to
# "transformed". Column wiring is set first, then the hyperparameters.
cpu_kmeans = (
    KMeans()
    .setFeaturesCol("scaled_features")
    .setPredictionCol("transformed")
    .setK(200)
    .setMaxIter(15)
    .setTol(1.0e-20)
)

In [37]:
# Two-stage pipeline: the scaler is listed first, followed by the KMeans estimator.
pipe = Pipeline(stages=[scaler, cpu_kmeans])

In [38]:
# Time the fit of the scaler + CPU KMeans pipeline. perf_counter() is a
# monotonic, high-resolution clock intended for measuring durations, so the
# result is not affected by system clock adjustments the way time.time()
# differences can be.
start_time = time.perf_counter()
pipe_model = pipe.fit(vector_df)
cpu_cpu_pipe = time.perf_counter() - start_time
print(f"Fit took: {cpu_cpu_pipe} sec")

23/05/24 14:53:24 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: StructType(StructField(max,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false),StructField(min,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false)) [summary#190]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [max#192, min#193]
  !Expression <Alias> summary#190.max AS max#192 cannot run on GPU because expression Alias summary#190.max AS max#192 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression GetStructField summary#190.max (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
    !Expression <GetStructField> summary#190.max cannot run on GPU because expression GetStructField summary#190.max produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression AttributeReference summary#190 (child org.apache.spark.ml.linalg



23/05/24 14:53:27 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: StructType(StructField(max,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false),StructField(min,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,false)) [summary#190]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [max#192, min#193]
  !Expression <Alias> summary#190.max AS max#192 cannot run on GPU because expression Alias summary#190.max AS max#192 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression GetStructField summary#190.max (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
    !Expression <GetStructField> summary#190.max cannot run on GPU because expression GetStructField summary#190.max produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression AttributeReference summary#190 (child org.apache.spark.ml.linalg

                                                                                

23/05/24 14:53:27 WARN TaskSetManager: Stage 67 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:32 WARN TaskSetManager: Stage 68 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:33 WARN TaskSetManager: Stage 69 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:34 WARN TaskSetManager: Stage 70 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:34 WARN TaskSetManager: Stage 71 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:52 WARN TaskSetManager: Stage 72 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:53:52 WARN TaskSetManager: Stage 73 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:54:29 WARN TaskSetManager: Stage 76 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:54:36 WARN TaskSetManager: Stage 79 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:54:43 WARN TaskSetManager: Stage 82 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:54:51 WARN TaskSetManager: Stage 85 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:54:58 WARN TaskSetManager: Stage 88 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:55:05 WARN TaskSetManager: Stage 91 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:55:12 WARN TaskSetManager: Stage 94 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:55:19 WARN TaskSetManager: Stage 97 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:55:27 WARN TaskSetManager: Stage 100 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/05/24 14:55:33 WARN GpuOverrides: 
      !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
        @Expression <Alias> UDF(UDF(UDF(features#0))) AS transformed#220 could run on GPU
          !Expression <ScalaUDF> UDF(UDF(UDF(features#0))) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.clustering.KMeansModel$$Lambda$4038/2042610068 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; param expression ScalaUDF UDF(UDF(features#0)) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)
            !Expression <ScalaUDF> UDF(UDF(features#0)) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.feature.MinMaxScalerModel$$Lambda$4233/1510773388 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(UDF(features#0)) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3



Fit took: 141.9701817035675 sec


                                                                                

In [39]:
# Run the fitted pipeline over the vector-typed DataFrame and display the first 10 rows.
pipe_transformed = pipe_model.transform(vector_df)
pipe_transformed.show(n=10)

23/05/24 14:55:46 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [features#45, scaled_features#252]
    @Expression <Alias> cast(features#45 as string) AS features#271 could run on GPU
      !Expression <Cast> cast(features#45 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported
        !Expression <AttributeReference> features#45 cannot run on GPU because expression Att

## Test (numpy)

In [40]:
from spark_rapids_ml.clustering import KMeans

In [41]:
# Rebinds `gpu_kmeans` (first created near the top of the notebook through
# build_kmeans_estimator) to a KMeans constructed with the `fake_pipe` flag.
# The tol / k / featuresCol / maxIter values are the same ones that
# build_kmeans_estimator applies; only the prediction column is extra here.
# NOTE(review): `fake_pipe` is not explained in the visible notebook; presumably it
# makes the estimator emulate the scaler + KMeans pipeline timed above, using the
# NumPy code path by default -- confirm against the spark_rapids_ml build in use.
gpu_kmeans = (
    KMeans(fake_pipe=True)
    .setTol(1.0e-20)  # tiny tolerance: presumably so the fit runs all maxIter iterations -- confirm
    .setK(200)  # matches n_clusters_data used to generate the synthetic blobs
    .setFeaturesCol("features")  # array<float> column of `df`
    .setPredictionCol("transformed")
    .setMaxIter(15)
)

In [42]:
# Wall-clock timing of one fit of the `fake_pipe` (numpy) estimator on `df`.
start_time = time.time()
# NOTE(review): this rebinds `pipe_model`, the name cell 39 used for
# `pipe_model.transform(vector_df)`; re-running cell 39 after this cell would
# silently use this model instead.
pipe_model = gpu_kmeans.fit(df)
# Elapsed seconds. This value is overwritten by the repeated numpy test (cell 48)
# before the Summary cell prints `fake_pipe_numpy`.
fake_pipe_numpy = time.time() - start_time
print(f"Fit took: {fake_pipe_numpy} sec")

23/05/24 14:55:46 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:55:46 WARN TaskSetManager: Stage 106 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.
23/05/24 14:55:47 WARN GpuOverrides: 
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <Attr

[Stage 109:>                                                        (0 + 1) / 1]

Fit took: 10.451662302017212 sec


                                                                                

## Test (cupy)

In [43]:
from spark_rapids_ml.clustering import KMeans

In [44]:
# Rebinds `gpu_kmeans` to the `fake_pipe` estimator with `use_cupy=True`; every
# other setting is identical to the numpy variant built in cell 41.
# NOTE(review): `use_cupy` is not explained in the visible notebook; going by the
# section title it presumably switches the fake-pipeline array work from NumPy to
# CuPy -- confirm against the spark_rapids_ml build in use.
gpu_kmeans = (
    KMeans(fake_pipe=True, use_cupy=True)
    .setTol(1.0e-20)  # tiny tolerance: presumably so the fit runs all maxIter iterations -- confirm
    .setK(200)  # matches n_clusters_data used to generate the synthetic blobs
    .setFeaturesCol("features")  # array<float> column of `df`
    .setPredictionCol("transformed")
    .setMaxIter(15)
)

In [45]:
# Wall-clock timing of one fit of the `fake_pipe` (cupy) estimator on `df`.
start_time = time.time()
# NOTE(review): rebinds `pipe_model` again (see cell 39, which reads this name
# for `transform(vector_df)`); the previously fitted model is no longer reachable.
pipe_model = gpu_kmeans.fit(df)
# Elapsed seconds; printed again by the Summary cell as `fake_pipe_cupy`.
fake_pipe_cupy = time.time() - start_time
print(f"Fit took: {fake_pipe_cupy} sec")

23/05/24 14:55:57 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:55:57 WARN TaskSetManager: Stage 110 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.
23/05/24 14:55:57 WARN GpuOverrides: 
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <Attr

[Stage 113:>                                                        (0 + 1) / 1]

Fit took: 6.068034648895264 sec


                                                                                

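## Test (numpy, repeated run)

This repeats the numpy test above with identical settings. Its timing overwrites `fake_pipe_numpy` and is the value reported in the Summary.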

In [46]:
from spark_rapids_ml.clustering import KMeans

In [47]:
# Rebuilds the numpy `fake_pipe` estimator with exactly the same settings as
# cell 41, so the numpy fit can be timed a second time after the cupy run.
# NOTE(review): `fake_pipe` is not explained in the visible notebook -- confirm its
# semantics against the spark_rapids_ml build in use.
gpu_kmeans = (
    KMeans(fake_pipe=True)
    .setTol(1.0e-20)  # tiny tolerance: presumably so the fit runs all maxIter iterations -- confirm
    .setK(200)  # matches n_clusters_data used to generate the synthetic blobs
    .setFeaturesCol("features")  # array<float> column of `df`
    .setPredictionCol("transformed")
    .setMaxIter(15)
)

In [48]:
# Second wall-clock timing of the `fake_pipe` (numpy) fit on `df`.
start_time = time.time()
# NOTE(review): rebinds `pipe_model` once more (cell 39 reads this name for
# `transform(vector_df)`).
pipe_model = gpu_kmeans.fit(df)
# Overwrites the `fake_pipe_numpy` recorded by the first numpy test (cell 42);
# this second measurement is the one the Summary cell prints.
fake_pipe_numpy = time.time() - start_time
print(f"Fit took: {fake_pipe_numpy} sec")

23/05/24 14:56:03 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> features#0 could run on GPU

23/05/24 14:56:03 WARN TaskSetManager: Stage 114 contains a task of very large size (24467 KiB). The maximum recommended task size is 1000 KiB.
23/05/24 14:56:03 WARN GpuOverrides: 
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <Attr

[Stage 117:>                                                        (0 + 1) / 1]

Fit took: 6.01736307144165 sec


                                                                                

### Summary

In [49]:
# Recap of the fit timings collected in this notebook, in seconds, printed one
# value per line in this order:
#   1. gpu_fit         - Spark RAPIDS ML KMeans fit (timed in the "Fit" cell near the top)
#   2. cpu_fit         - timed outside this section; presumably the Spark ML (CPU) KMeans fit
#   3. cpu_gpu_pipe    - timed outside this section; presumably a CPU-scaler + GPU-KMeans pipeline
#   4. cpu_cpu_pipe    - appears to be the pipeline fit whose "Fit took" output precedes cell 39
#   5. fake_pipe_numpy - most recent numpy `fake_pipe` fit (the repeated run replaces the first)
#   6. fake_pipe_cupy  - cupy `fake_pipe` fit
print(
    gpu_fit,
    cpu_fit,
    cpu_gpu_pipe,
    cpu_cpu_pipe,
    fake_pipe_numpy,
    fake_pipe_cupy,
    sep="\n",
)

12.810078859329224
143.9151623249054
18.478280305862427
141.9701817035675
6.01736307144165
6.068034648895264
