**From the documentation**

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans

In [4]:
data = spark.read.format('libsvm').load('./datasets/sample_kmeans_data.txt')
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [7]:
data.count()

6

In [8]:
final_data = data.select('features')

In [9]:
kmeans = KMeans(k=2, seed=1, featuresCol='features')

In [10]:
model = kmeans.fit(final_data)

In [13]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [12]:
predictions = model.transform(final_data)

In [14]:
evaluator = ClusteringEvaluator()

In [15]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.9997530305375207


In [16]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [17]:
predictions.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



**Real dataset**

In [18]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [19]:
data = spark.read.csv('./datasets/seeds_dataset.csv', inferSchema=True, header=True)

In [20]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [21]:
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [22]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [23]:
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [25]:
assembler = VectorAssembler(inputCols=data.columns, outputCol='features')

In [36]:
final_data = assembler.transform(data)

In [39]:
from pyspark.ml.feature import StandardScaler

In [40]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [41]:
scaler_model = scaler.fit(final_data)

In [42]:
final_data = scaler_model.transform(final_data)

In [43]:
final_data.select('features').show(truncate=False)

+---------------------------------------------------------------------+
|features                                                             |
+---------------------------------------------------------------------+
|[15.26,14.84,0.871,5.763,3.312,2.221,5.22]                           |
|[14.88,14.57,0.8811,5.553999999999999,3.333,1.018,4.956]             |
|[14.29,14.09,0.905,5.291,3.3369999999999997,2.699,4.825]             |
|[13.84,13.94,0.8955,5.324,3.3789999999999996,2.259,4.805]            |
|[16.14,14.99,0.9034,5.6579999999999995,3.562,1.355,5.175]            |
|[14.38,14.21,0.8951,5.386,3.312,2.4619999999999997,4.956]            |
|[14.69,14.49,0.8799,5.563,3.259,3.5860000000000003,5.218999999999999]|
|[14.11,14.1,0.8911,5.42,3.302,2.7,5.0]                               |
|[16.63,15.46,0.8747,6.053,3.465,2.04,5.877000000000001]              |
|[16.44,15.25,0.888,5.8839999999999995,3.505,1.969,5.5329999999999995]|
|[15.26,14.85,0.8696,5.7139999999999995,3.242,4.543,5.314]      

In [44]:
final_data.select('scaledFeatures').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|scaledFeatures                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------+
|[5.244527953320284,11.363299389287777,36.860833906302894,13.007165541092315,8.76852883087142,1.4771618831975104,10.62097073949694] |
|[5.113930271651758,11.156554723849252,37.28826722714521,12.53544983779745,8.824126386864265,0.6770602418257837,10.08381819634997]  |
|[4.911160186955888,10.789008651958541,38.29971835270278,11.94185543604363,8.834716397529569,1.7950742560783792,9.817276593500525]  |
|[4.756505037611581,10.674150504492696,37.89767711032634,12.01633686287966,8.945911509515255,1.5024352517528932,9.776583219019694]  |
|[5.546964689815818,11.478157536753622,38.23200614346043,12.77

In [45]:
kmeans = KMeans(featuresCol='scaledFeatures', k=3)

In [46]:
model = kmeans.fit(final_data)

In [47]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()

In [49]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.6018627534901196


In [48]:
predictions = model.transform(final_data)

In [50]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 4.87257659 10.88120146 37.27692543 12.3410157   8.55443412  1.81649011
 10.32998598]
[ 6.31670546 12.37109759 37.39491396 13.91155062  9.748067    2.39849968
 12.2661748 ]
[ 4.06105916 10.13979506 35.80536984 11.82133095  7.50395937  3.27184732
 10.42126018]


In [52]:
predictions.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

