# Spark ML - Tuning

## Prepare the Spark session

In [None]:
# Import findspark
import findspark

# Configure the environment
findspark.init()

# Import the Spark components required for the session creation
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Describe the application (name + local master using all cores) in one
# chained expression, then start the session or reuse a running one
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import pandas as pd
from plotnine import *
from plotnine import options as plot_options
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (
    LinearRegression
)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import (
    ParamGridBuilder,
    TrainValidationSplit,
    CrossValidator
)

## Read a sample CSV

In [None]:
# Load the sample CSV: comma separated, first row is the header, and the
# column types are inferred by Spark
data = spark.read.csv(
    './data/housing.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

## Target vs. features

In [None]:
# Column to predict; every other column of the data set is used as a feature
target = 'medv'
features = [name for name in data.columns if name != target]

## Vectorize inputs

In [None]:
# Assembler that packs all feature columns into a single 'features' vector
assembler = VectorAssembler(outputCol='features', inputCols=features)

# Append the vector column to the data set
vectorized_data = assembler.transform(data)

# Preview the first rows of the transformed data
vectorized_data.show(n=5)

## Split train-test

In [None]:
# Split train-test (90-10). A fixed seed makes the split reproducible, so the
# metrics of the tuning strategies below are comparable between runs.
train, test = vectorized_data.randomSplit([0.9, 0.1], seed=42)
print(f'Train size: {train.count()}')
print(f'Test size: {test.count()}')

## Create the estimator

In [None]:
# Settings of the base linear regressor: no regularization, intercept fitted,
# features not standardized
lr_settings = dict(
    featuresCol='features',
    labelCol=target,
    predictionCol='prediction',
    regParam=0,
    elasticNetParam=0,
    fitIntercept=True,
    standardization=False,
)

# Create the linear regressor from those settings
lr = LinearRegression(**lr_settings)

## Create the evaluator

In [None]:
# Evaluator that scores the 'prediction' column against the target using RMSE
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol=target,
    predictionCol='prediction',
)

## Create the parameter grid

In [None]:
# Build every combination of the candidate values (3 x 3 x 2 = 18 settings)
params = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 0.5, 1.0])
    .addGrid(lr.elasticNetParam, [0, 0.5, 1.0])
    .addGrid(lr.fitIntercept, [False, True])
    .build()
)

## TrainValidationSplit

### Create the model

In [None]:
# Train-validation split tuner: trainRatio=0.7 of the data given to fit() is
# used for training each candidate, the remainder for validating it
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=params,
    evaluator=evaluator,
    trainRatio=0.7,
)

### Train and analyze the model

In [None]:
# Train the model: fit the train-validation split tuner on the training set
trained_tvs = tvs.fit(train)

In [None]:
# Print each parameter combination next to the validation metric it obtained
for combination, metric in zip(params, trained_tvs.validationMetrics):
    combination_values = {param.name: value for param, value in combination.items()}
    print(f'{combination_values} --> {metric}')

In [None]:
# Retrieve the best model selected by the train-validation split
best_lr = trained_tvs.bestModel

In [None]:
# Print every parameter of the best model together with its value
best_param_map = best_lr.extractParamMap()
for param in best_param_map:
    print(f'{param.name}: {best_param_map[param]}')

In [None]:
# Retrieve the whole set of coefficients + intercept
coefs = list(best_lr.coefficients.toArray()) + [best_lr.intercept]

# The p-values cannot be assumed to line up with the rows above:
#  - Spark raises when the training summary cannot provide them (they are only
#    available for the "normal" solver); the exception class differs between
#    PySpark versions, hence the broad catch.
#  - The trailing intercept entry only exists when fitIntercept is True, and
#    the tuning grid also tries fitIntercept=False.
try:
    pvalues = list(best_lr.summary.pValues)
except Exception as error:
    print(f'p-values are not available for the best model ({type(error).__name__})')
    pvalues = []

# Pad with NaN so that every column has exactly one entry per row
pvalues += [float('nan')] * (len(coefs) - len(pvalues))

pd.DataFrame({
    'features': features + ['Intercept'],
    'coefficients': coefs,
    'pvalues': pvalues
})

### Evaluate the model

In [None]:
# Create the evaluator (no metric is fixed here; one is passed per evaluate call)
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

# Retrieve predictions in train and test with the best model
train_preds, test_preds = (best_lr.transform(split) for split in (train, test))

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: name})
    for name in metric_names
]

# Evaluate in test
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: name})
    for name in metric_names
]

# Display model metrics
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display train results: actual target vs. prediction, with the identity line
# (slope 1, intercept 0) in red as the perfect-prediction reference.
# The x axis uses `target` so the plot follows the column selected above.
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display test results: actual target vs. prediction, with the identity line
# (slope 1, intercept 0) in red as the perfect-prediction reference.
# The x axis uses `target` so the plot follows the column selected above.
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

## CrossValidator

### Create the model

In [None]:
# Cross-validation tuner: each parameter combination is scored over 3 folds
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=3,
)

### Train and analyze the model

In [None]:
# Train our model: fit the cross-validator on the training set
trained_cv = cv.fit(train)

In [None]:
# Print each parameter combination next to its average metric across folds
for combination, metric in zip(params, trained_cv.avgMetrics):
    combination_values = {param.name: value for param, value in combination.items()}
    print(f'{combination_values} --> {metric}')

In [None]:
# Retrieve the best model selected by the cross-validator
best_lr = trained_cv.bestModel

In [None]:
# Print the configuration of the best model, one parameter per line
best_param_map = best_lr.extractParamMap()
for param in best_param_map:
    print(f'{param.name}: {best_param_map[param]}')

In [None]:
# Retrieve the whole list of coefficients + intercept
coefs = list(best_lr.coefficients.toArray()) + [best_lr.intercept]

# The p-values cannot be assumed to line up with the rows above:
#  - Spark raises when the training summary cannot provide them (they are only
#    available for the "normal" solver); the exception class differs between
#    PySpark versions, hence the broad catch.
#  - The trailing intercept entry only exists when fitIntercept is True, and
#    the tuning grid also tries fitIntercept=False.
try:
    pvalues = list(best_lr.summary.pValues)
except Exception as error:
    print(f'p-values are not available for the best model ({type(error).__name__})')
    pvalues = []

# Pad with NaN so that every column has exactly one entry per row
pvalues += [float('nan')] * (len(coefs) - len(pvalues))

pd.DataFrame({
    'features': features + ['Intercept'],
    'coefficients': coefs,
    'pvalues': pvalues
})

### Evaluate the model

In [None]:
# Create the evaluator (no metric is fixed here; one is passed per evaluate call)
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

# Retrieve predictions in train and test with the best model
train_preds, test_preds = (best_lr.transform(split) for split in (train, test))

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: name})
    for name in metric_names
]

# Evaluate in test
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: name})
    for name in metric_names
]

# Display model metrics
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display train results: actual target vs. prediction, with the identity line
# (slope 1, intercept 0) in red as the perfect-prediction reference.
# The x axis uses `target` so the plot follows the column selected above.
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display test results: actual target vs. prediction, with the identity line
# (slope 1, intercept 0) in red as the perfect-prediction reference.
# The x axis uses `target` so the plot follows the column selected above.
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

### Close the Spark session

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()