In [1]:
# Order matters here: findspark.init() runs before the pyspark import on purpose.
# Per findspark's documentation, init() locates the local Spark installation and
# makes the pyspark package importable for the line that follows.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that is already
# running), registered under the application name 'kmeans1'.
spark = (
    SparkSession.builder
    .appName('kmeans1')
    .getOrCreate()
)

In [3]:
# Read the sample dataset, stored in libsvm format, into a DataFrame.
# The path is relative to the notebook's working directory.
data = (
    spark.read
    .format('libsvm')
    .load('../data/sample_kmeans_data.txt')
)

In [6]:
# Print the column names and types of the loaded DataFrame
# (a double 'label' column and a vector 'features' column).
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
# Preview the feature vectors on their own, without the label column.
(
    data
    .select('features')
    .show()
)

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [9]:
# Keep only the 'features' column as the clustering input; the 'label' column
# from the loaded file is left out.
final_data = data.select('features')

In [10]:
# Sanity check: the clustering input should contain just the feature vectors.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



### Fit KMeans Model

In [5]:
from pyspark.ml.clustering import KMeans

In [11]:
# Configure the estimator: two clusters, with a fixed seed so that the
# centroid initialisation (and therefore the result) is repeatable.
kmeans = KMeans(k=2, seed=42)

In [12]:
# Fit the configured estimator on the feature vectors. The returned model is
# what the following cells query for the cost, the centers and the assignments.
model = kmeans.fit(final_data)

In [13]:
# Note: this evaluates to the Param descriptor itself (parent, name and doc
# string), not to the column name configured for it. To read the configured
# value, use model.getOrDefault(model.predictionCol).
model.predictionCol

Param(parent='KMeans_9699acf1cb65', name='predictionCol', doc='prediction column name')

### Calculate Within Set Sum of Squared Errors (WSSSE)

In [16]:
# Within Set Sum of Squared Errors (WSSSE): the sum of squared distances from
# every training point to its nearest cluster center.
#
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in Spark 3.0,
# so calling it on a current Spark raises AttributeError. The training summary
# of a freshly fitted model exposes the same quantity as `trainingCost`, so read
# it from there and fall back to computeCost() only on older Spark versions
# whose summary does not have that attribute yet.
try:
    wssse = model.summary.trainingCost
except AttributeError:
    wssse = model.computeCost(final_data)

In [17]:
# Display the WSSSE as the cell's output.
wssse

0.11999999999994547

### Where Are the Centroids?

In [19]:
# Fetch the fitted cluster centers from the model (one entry per cluster).
centers = model.clusterCenters()

In [20]:
# Display the centers: a list with one 3-element array per cluster.
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

### Which Cluster Is Each Sample In?

In [21]:
# Apply the fitted model to the input rows. The result keeps 'features' and adds
# a 'prediction' column holding the index of the cluster assigned to each row.
results = model.transform(final_data)

In [23]:
# Show every sample next to the cluster index it was assigned to.
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

