In [18]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
import pandas as pd


if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Algorithms")\
        .getOrCreate()

In [19]:
df = spark.read.csv('distance2.csv')
# Convert columns to float
df = df.select(*(col(c).cast("float").alias(c) for c in df.columns))

In [20]:
df.describe()

DataFrame[summary: string, _c0: string, _c1: string]

In [21]:
assembler = VectorAssembler(
    inputCols=["_c0","_c1"],
    outputCol="features")

df = assembler.transform(df)
df = df.drop("_c0")
df = df.drop("_c1")
df.show()

+-------------+
|     features|
+-------------+
|  [363.0,1.0]|
|  [363.0,2.0]|
| [333.0,33.0]|
| [333.0,40.0]|
| [333.0,20.0]|
|[333.0,350.0]|
|[333.0,150.0]|
+-------------+



In [22]:
df.describe()

DataFrame[summary: string]

In [23]:
gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(df)
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+--------------------------------------+---------------------------------------------------------------------------------+
|mean                                  |cov                                                                              |
+--------------------------------------+---------------------------------------------------------------------------------+
|[341.41970120031567,87.02808233175415]|181.699667706839    -720.1208974693747  
-720.1208974693747  14240.213137177681  |
|[341.7227384174395,83.26281973803401] |185.59598702410253  -713.195688847124  
-713.195688847124   13567.58420783319    |
+--------------------------------------+---------------------------------------------------------------------------------+



In [24]:
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
     print(center)

Silhouette with squared euclidean distance = 0.7651516099155794
Cluster Centers: 
[343.  41.]
[333. 350.]


In [25]:
bkm = BisectingKMeans().setK(2).setSeed(1)
model = bkm.fit(df)
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.7125317898292932
Cluster Centers: 
[345.   19.2]
[333. 250.]


In [26]:
spark.stop()