# Spark ML - Clustering

## Prepare the Spark session

In [None]:
# Make the local Spark installation importable before touching pyspark
import findspark

findspark.init()

# Components needed to configure and build the session
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application, run it on every local core, then get (or reuse) the session
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import pandas as pd
from plotnine import *
from plotnine import options as plot_options
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import (
    KMeans
)
from pyspark.ml.evaluation import ClusteringEvaluator

## Read a sample CSV

In [None]:
# Load the housing CSV: comma-separated, first row is the header, column types inferred
data = spark.read.csv('./data/housing.csv', sep=',', header=True, inferSchema=True)

## Prepare features

In [None]:
# Use only the columns VectorAssembler can consume (numeric and boolean types).
# Passing every column would make the assembler fail as soon as the CSV
# contains a string column (e.g. a categorical label).
from pyspark.sql.types import BooleanType, NumericType

features = [
    field.name
    for field in data.schema.fields
    if isinstance(field.dataType, (BooleanType, NumericType))
]

## Vectorize inputs

In [None]:
# Assembler that packs the selected input columns into one 'features' vector column
assembler = VectorAssembler(outputCol='features', inputCols=features)

# Add the vector column to the data set
vectorized_data = assembler.transform(data)

# Preview the first five rows of the result
vectorized_data.show(n=5)

## K-Means

### Create the model

In [None]:
# Create a KMeans clustering with five clusters.
# The model reads the 'features' vector column and writes the assigned
# cluster index to the 'cluster' column.
# The seed is fixed so that centroid initialisation, and therefore the
# cluster ids shown in the plots below, is reproducible across runs.
kmeans = KMeans(
    featuresCol='features',
    predictionCol='cluster',
    k=5,
    seed=42,
)

### Train the model

In [None]:
# Fit the KMeans estimator on the vectorized data set
trained_kmeans = kmeans.fit(dataset=vectorized_data)

### Evaluate the model

In [None]:
# Evaluator that scores the 'cluster' assignments using the 'features' vectors
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='cluster')

# Assign every row to a cluster with the fitted model
predictions = trained_kmeans.transform(vectorized_data)

In [None]:
# Score the clustering, requesting the silhouette metric through a param override
metric_override = {evaluator.metricName: 'silhouette'}
silhouette = evaluator.evaluate(predictions, metric_override)

# Report the score
print(f'Silhouette: {silhouette}')

In [None]:
# Count the rows assigned to each cluster and collect the result into pandas
cluster_counts = predictions.groupBy('cluster').count()
frequencies = cluster_counts.toPandas()

# Bar chart of the cluster sizes
(
    ggplot(frequencies, aes(x='cluster', y='count', fill='cluster'))
    + geom_bar(stat='identity')
)

In [None]:
# Describe each cluster by the mean of every feature, collected into pandas
feature_means = [F.mean(name).alias(name) for name in features]
cluster_explanation = predictions.groupby('cluster').agg(*feature_means).toPandas()
cluster_explanation

In [None]:
# Display the explanation of the clusters
# Turn the cluster id into a string label and reshape to long format
# (one row per cluster/variable pair) so each feature gets its own facet.
cluster_explanation = cluster_explanation.assign(cluster = cluster_explanation.cluster.astype(str))
cluster_explanation = cluster_explanation.melt(id_vars='cluster')

# Use a tall figure for the single-column facet layout. The figure size is a
# global plotnine option, so restore it in a `finally` block: otherwise an
# error while drawing would leave every later plot at the (4, 30) size.
current_fig_size = plot_options.figure_size
plot_options.figure_size = (4, 30)
try:
    (
        ggplot(cluster_explanation, aes(x='cluster', y='value', fill='cluster')) +
            geom_bar(stat='identity') +
            facet_wrap('~variable', scales='free', ncol=1)
    ).draw()
finally:
    plot_options.figure_size = current_fig_size

## Close the Spark session

In [None]:
# Stop the Spark session created at the beginning of the notebook
spark.stop()