In [2]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

## Set seed
## Single fixed seed for the notebook; it is handed to KMeans via setSeed when the model is configured.
seed: int = 1

In [3]:
## Start the Spark session used by every cell below (an already-running session is reused)
spark = (
    SparkSession.builder
    .appName('kmExample')
    .getOrCreate()
)

In [4]:
## Load Data
## The input is a libsvm file whose rows all carry 3-dimensional feature vectors.
## Declaring numFeatures up front avoids the extra pass over the input that Spark
## otherwise performs (and warns about) to infer the vector size.
df = (
    spark.read.format('libsvm')
    .option('numFeatures', '3')
    .load('gs://spark-training-data/datasets/sample_kmeans_data.txt')
)
df.show(5)
df.printSchema() ## Confirm proper schema: label (double) and features (vector)

21/12/07 03:33:16 WARN org.apache.spark.ml.source.libsvm.LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
+-----+--------------------+
only showing top 5 rows

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



                                                                                

In [5]:
## Keep only the feature vectors; the label column is not passed on to the clustering steps
final_data = df.select('features')
final_data.show(n=5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
+--------------------+
only showing top 5 rows



                                                                                

In [14]:
## Configure the KMeans estimator: read vectors from 'features', 3 clusters, seeded initialization
kmeans = KMeans(featuresCol='features', k=3, seed=seed)

In [15]:
## Train the estimator on the feature-only DataFrame to obtain the fitted model
kmeans_model = kmeans.fit(dataset=final_data)

In [16]:
## Evaluate the fitted model through its training summary.
## trainingCost is the within-set sum of squared errors (WSSSE) on the training data.
training_summary = kmeans_model.summary
wssse = training_summary.trainingCost
wssse

0.07499999999994544

In [17]:
## Get Centers
## clusterCenters() returns a list with one coordinate array per cluster;
## the position in the list corresponds to the cluster id shown in the prediction column.
centers = kmeans_model.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]

In [18]:
## Append a 'prediction' column holding each row's assigned cluster id, then display the rows
predictions_df = kmeans_model.transform(dataset=final_data)
predictions_df.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

