# Spark ML - Pipelines

## Prepare the Spark session

In [None]:
# Import findspark
import findspark

# Configure the environment.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable, which is why it runs before
# the pyspark imports below -- keep this ordering.
findspark.init()

# Import the Spark components required for the session creation
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration (application name + local master using all
# available cores) and obtain the session, reusing one if it already exists
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import pandas as pd
from plotnine import *
from plotnine import options as plot_options
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    OneHotEncoder,
    VectorAssembler
)
from pyspark.ml.clustering import (
    KMeans
)
from pyspark.ml.evaluation import ClusteringEvaluator

## Read a sample CSV

In [None]:
# Load the sample data set: comma-separated, column names taken from the
# header row, column types inferred by Spark
data = (
    spark.read
    .option('sep', ',')
    .option('header', True)
    .option('inferSchema', True)
    .csv('./data/housing.csv')
)

## Create the individual pipeline steps

In [None]:
# One-hot encode the 'chas' and 'rad' columns into 'chas_enc' and 'rad_enc'
encoder = OneHotEncoder(
    inputCols=['chas', 'rad'],
    outputCols=['chas_enc', 'rad_enc'],
)

In [None]:
# Create the assembler
# The raw categorical columns are excluded from the feature vector and
# replaced by their one-hot encoded counterparts. Without appending the
# encoder outputs, the encoder stage of the pipeline would run but its
# result would never reach the clustering model.
encoded_inputs = encoder.getInputCols()
columns = [col for col in data.columns if col not in encoded_inputs]
columns += encoder.getOutputCols()
assembler = VectorAssembler(inputCols=columns, outputCol='features')

In [None]:
# K-means with 5 clusters: reads the assembled 'features' vector and writes
# the assigned cluster index to the 'cluster' column
kmeans = KMeans(
    featuresCol='features',
    predictionCol='cluster',
    k=5,
)

## Pipeline

In [None]:
# Chain the stages in execution order: encode -> assemble -> cluster
pipeline = Pipeline().setStages([encoder, assembler, kmeans])

In [None]:
# Fit every stage of the pipeline on the loaded data set
trained_pipeline = pipeline.fit(dataset=data)

In [None]:
# Create the evaluator, pointed at the pipeline's feature and cluster columns
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='cluster')

# Retrieve the predictions of the pipeline
predictions = trained_pipeline.transform(dataset=data)

In [None]:
# Score the clustering, explicitly requesting the silhouette metric
silhouette = evaluator.evaluate(
    dataset=predictions,
    params={evaluator.metricName: 'silhouette'},
)

# Report the resulting score
print(f'Silhouette: {silhouette}')

## Close the Spark session

In [None]:
# Stop the Spark session created at the start of the notebook
spark.stop()