# TP3 KMeans


Nemanja Kostadinovic

### Exercice 1

In [1]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorIndexer, VectorAssembler

from pyspark.sql.functions import col

In [2]:
dataset = spark.read.format("libsvm").load("/usr/local/Cellar/apache-spark/3.0.1/libexec/data/mllib/sample_kmeans_data.txt")

In [3]:
dataset

DataFrame[label: double, features: vector]

In [4]:
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

In [5]:
predictions = model.transform(dataset)

In [6]:
evaluator = ClusteringEvaluator()

In [7]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.9997530305375207


In [8]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [9]:
for k in range(2,8):
    kmeans = KMeans().setK(k).setSeed(1)
    model = kmeans.fit(dataset)
    predictions2 = model.transform(dataset)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette score = " + str(silhouette)+" for k="+str(k))

Silhouette score = 0.9997530305375207 for k=2
Silhouette score = 0.9997530305375207 for k=3
Silhouette score = 0.9997530305375207 for k=4
Silhouette score = 0.9997530305375207 for k=5
Silhouette score = 0.9997530305375207 for k=6
Silhouette score = 0.9997530305375207 for k=7


In [10]:
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



## Exercice 2

In [12]:
training = spark.read.option("header", "false").option("sep",",").csv("./data/3D_spatial_network.txt")

In [13]:
training.show()

+---------+---------+----------+----------------+
|      _c0|      _c1|       _c2|             _c3|
+---------+---------+----------+----------------+
|144552912|9.3498486|56.7408757|17.0527715677876|
|144552912|9.3501884|56.7406785| 17.614840244389|
|144552912|9.3505485|56.7405445|  18.08353563951|
|144552912|9.3508058|56.7404845|18.2794652530352|
|144552912|9.3510534|56.7404863|18.4229736146099|
|144552912|9.3514747|56.7405022|19.1248885940143|
|144552912|9.3521273|56.7405585|19.5905926656897|
|144552912|9.3524201|56.7405974|19.6217636955693|
|144552912|9.3525839| 56.740629|19.6599309194984|
|144552912|9.3527255|56.7406626|19.4906695590218|
|144552912|9.3530759|56.7408002|19.2302682047961|
|144552912| 9.353404|56.7409042|18.2332801889404|
|144552912|9.3537287|56.7409995|17.6008423536018|
|144552912| 9.354179|56.7410594|17.6443793914607|
|144552912| 9.354581|56.7411511|18.3974824566501|
|144552912|9.3547868|56.7412058|18.8645551315113|
|144552912|9.3551147|56.7412693|18.2477297711073|


In [14]:
training.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [15]:
training = training.select(*(col(c).cast("float").alias(c) for c in training.columns))

In [16]:
assembler = VectorAssembler(inputCols = training.columns, outputCol = 'features')
df = assembler.transform(training).select('features')
df.show(6)

+--------------------+
|            features|
+--------------------+
|[1.44552912E8,9.3...|
|[1.44552912E8,9.3...|
|[1.44552912E8,9.3...|
|[1.44552912E8,9.3...|
|[1.44552912E8,9.3...|
|[1.44552912E8,9.3...|
+--------------------+
only showing top 6 rows



In [17]:
df.columns

['features']

In [18]:
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(df)

In [19]:
predictions = model.transform(df)

In [20]:
evaluator = ClusteringEvaluator()

In [21]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.8412354480618449


In [22]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[1.15861486e+08 9.71354127e+00 5.70727889e+01 2.29653651e+01]
[4.03179970e+07 9.79035850e+00 5.71188476e+01 1.96904355e+01]


In [23]:
for k in range(2,8):
    kmeans = KMeans().setK(k).setSeed(1)
    model = kmeans.fit(dataset)
    predictions2 = model.transform(dataset)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette score = " + str(silhouette)+" for k="+str(k))

Silhouette score = 0.8412354480618449 for k=2
Silhouette score = 0.8412354480618449 for k=3
Silhouette score = 0.8412354480618449 for k=4
Silhouette score = 0.8412354480618449 for k=5
Silhouette score = 0.8412354480618449 for k=6
Silhouette score = 0.8412354480618449 for k=7
