# Spark ML - Regression

## Prepare the Spark session

In [None]:
# Import findspark
import findspark

# Configure the environment. This call is placed before the pyspark imports
# below; findspark is expected to locate the Spark installation and make
# pyspark importable (see the findspark documentation to confirm)
findspark.init()

# Import the Spark components required for the session creation
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration fluently (application name + local master) and
# obtain the session from it, reusing an already running one if present
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import pandas as pd
from plotnine import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.evaluation import RegressionEvaluator

## Read a sample CSV

In [None]:
# Load the sample housing data: comma-separated, first row holds the column
# names, and column types are inferred from the values
data = spark.read.csv(
    './data/housing.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

## Target vs. features

In [None]:
# Column to predict; every other column of the data set is used as a feature
target = 'medv'
features = [column for column in data.columns if column != target]

## Vectorize inputs

In [None]:
# Assembler that packs all feature columns into one vector column named
# 'features', the column the regressors below read their inputs from
assembler = VectorAssembler(
    inputCols=features,
    outputCol='features',
)

# Add the vector column to the data set
vectorized_data = assembler.transform(data)

# Preview the first 5 rows of the transformed data
vectorized_data.show(n=5)

## Split train-test

In [None]:
# Split train test (70-30). The seed is fixed so that, for the same input
# data, the same rows land in each partition on every run; otherwise the
# sizes and all metrics computed below would change between executions
train, test = vectorized_data.randomSplit([0.7, 0.3], seed=42)
print(f'Train size: {train.count()}')
print(f'Test size: {test.count()}')

## Linear regression

### Create the model

In [None]:
# Configure a linear regression without regularization: regParam=0 turns the
# penalty off, so elasticNetParam (0 == L2 / Ridge; 1 == L1 / Lasso) only
# becomes relevant once regParam is raised above zero
lr = LinearRegression(
    labelCol=target,
    featuresCol='features',
    predictionCol='prediction',
    fitIntercept=True,
    standardization=False,
    regParam=0,
    elasticNetParam=0,
)

### Train and analyze the model

In [None]:
# Train the model: fit the linear regression on the training partition
trained_lr = lr.fit(train)

In [None]:
# Coefficients in feature order, with the intercept appended as the last entry
coefs = [*trained_lr.coefficients.toArray(), trained_lr.intercept]

# p-values taken from the training summary; they are lined up with the
# coefficients above, intercept last (per the Spark docs for fitIntercept=True)
pvalues = trained_lr.summary.pValues

# Display the results of the model, one row per term
pd.DataFrame({
    'features': [*features, 'Intercept'],
    'coefficients': coefs,
    'pvalues': pvalues,
})

### Evaluate the model

In [None]:
# Score both partitions with the fitted linear model (train first, then test)
train_preds, test_preds = [trained_lr.transform(part) for part in (train, test)]

# Evaluator comparing the 'prediction' column against the target column
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train: one evaluator call per metric, overriding metricName
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Evaluate in test, same metrics and order
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Display model metrics side by side to compare train against test
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display train results: actual vs. predicted values, with the red y = x line
# marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# train_results
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display test results: actual vs. predicted values, with the red y = x line
# marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# test_results
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

## Decision tree

### Create the model

In [None]:
# Configure the decision tree regressor: depth limited to 5 levels and no
# minimum leaf size beyond a single instance
tree = DecisionTreeRegressor(
    labelCol=target,
    featuresCol='features',
    predictionCol='prediction',
    minInstancesPerNode=1,
    maxDepth=5,
)

### Train and analyze the model

In [None]:
# Train the model: fit the decision tree on the training partition
trained_tree = tree.fit(train)

In [None]:
# Feature importances of the fitted tree, paired with the feature names
importance = trained_tree.featureImportances.toArray()
results = pd.DataFrame({'features': features, 'importance': importance})

# Horizontal bar chart whose axis limits list the features by ascending
# importance
feature_order = results.sort_values('importance')['features']
(
    ggplot(results, aes(x='features', y='importance')) +
        geom_bar(stat='identity') +
        coord_flip() +
        scale_x_discrete(limits=feature_order)
)

### Evaluate the model

In [None]:
# Score both partitions with the fitted decision tree (train first, then test)
train_preds, test_preds = [trained_tree.transform(part) for part in (train, test)]

# Evaluator comparing the 'prediction' column against the target column
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train: one evaluator call per metric, overriding metricName
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Evaluate in test, same metrics and order
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Display model metrics side by side to compare train against test
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display results in train: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# train_results
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display results in test: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# test_results
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

## Random forest

### Create the model

In [None]:
# Create a random forest regressor: 100 trees, each at most 8 levels deep,
# with at least 5 training instances per node. The seed is fixed so the
# randomized parts of training (row and feature sampling) are repeatable
# and the importances and metrics below can be compared between runs
rf = RandomForestRegressor(
    featuresCol='features',
    labelCol=target,
    predictionCol='prediction',
    numTrees=100,
    maxDepth=8,
    minInstancesPerNode=5,
    seed=42,
)

### Train and analyze the model

In [None]:
# Train the model: fit the random forest on the training partition
trained_rf = rf.fit(train)

In [None]:
# Feature importances of the fitted forest, paired with the feature names
importance = trained_rf.featureImportances.toArray()
results = pd.DataFrame({'features': features, 'importance': importance})

# Horizontal bar chart whose axis limits list the features by ascending
# importance
feature_order = results.sort_values('importance')['features']
(
    ggplot(results, aes(x='features', y='importance')) +
        geom_bar(stat='identity') +
        coord_flip() +
        scale_x_discrete(limits=feature_order)
)

### Evaluate the model

In [None]:
# Score both partitions with the fitted random forest (train first, then test)
train_preds, test_preds = [trained_rf.transform(part) for part in (train, test)]

# Evaluator comparing the 'prediction' column against the target column
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train: one evaluator call per metric, overriding metricName
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Evaluate in test, same metrics and order
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Display model metrics side by side to compare train against test
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display results in train: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# train_results
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display results in test: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# test_results
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

## Gradient Boosted Tree

### Create the model

In [None]:
# Configure the gradient-boosted trees regressor: 20 boosting iterations,
# trees at most 8 levels deep, at least 5 training instances per node
gbt = GBTRegressor(
    labelCol=target,
    featuresCol='features',
    predictionCol='prediction',
    minInstancesPerNode=5,
    maxDepth=8,
    maxIter=20,
)

### Train and analyze the model

In [None]:
# Train the model: fit the gradient-boosted trees on the training partition
trained_gbt = gbt.fit(train)

In [None]:
# Feature importances of the fitted GBT model, paired with the feature names
importance = trained_gbt.featureImportances.toArray()
results = pd.DataFrame({'features': features, 'importance': importance})

# Horizontal bar chart whose axis limits list the features by ascending
# importance
feature_order = results.sort_values('importance')['features']
(
    ggplot(results, aes(x='features', y='importance')) +
        geom_bar(stat='identity') +
        coord_flip() +
        scale_x_discrete(limits=feature_order)
)

### Evaluate the model

In [None]:
# Score both partitions with the fitted GBT model (train first, then test)
train_preds, test_preds = [trained_gbt.transform(part) for part in (train, test)]

# Evaluator comparing the 'prediction' column against the target column
evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')

In [None]:
# Metrics to compute, in the order they are unpacked below
metric_names = ('rmse', 'mse', 'r2', 'mae')

# Evaluate in train: one evaluator call per metric, overriding metricName
train_rmse, train_mse, train_r2, train_mae = [
    evaluator.evaluate(train_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Evaluate in test, same metrics and order
test_rmse, test_mse, test_r2, test_mae = [
    evaluator.evaluate(test_preds, {evaluator.metricName: metric})
    for metric in metric_names
]

# Display model metrics side by side to compare train against test
print(f'RMSE (Train/Test): {train_rmse} / {test_rmse}')
print(f'MSE (Train/Test): {train_mse} / {test_mse}')
print(f'R2 (Train/Test): {train_r2} / {test_r2}')
print(f'MAE (Train/Test): {train_mae} / {test_mae}')

In [None]:
# Display results in train: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# train_results
train_results = train_preds.select(target, 'prediction').toPandas()
(
    ggplot(train_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

In [None]:
# Display results in test: actual vs. predicted values, with the red y = x
# line marking perfect predictions. The x aesthetic uses `target` (not a
# hard-coded column name) so it always matches the column selected into
# test_results
test_results = test_preds.select(target, 'prediction').toPandas()
(
    ggplot(test_results, aes(x=target, y='prediction')) +
        geom_point() +
        geom_abline(slope=1, intercept=0, colour='red')
)

## Close the Spark session

In [None]:
# Stop the Spark session created at the start of the notebook
spark.stop()