# Spark ML - Preprocessing

## Prepare the Spark session

In [None]:
# Import findspark
import findspark

# Configure the environment through findspark.
# NOTE(review): this call sits before the pyspark imports below; presumably
# that ordering is required so pyspark can be located - keep it as is.
findspark.init()

# Import the Spark components required for the session creation
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration (application name and master URL) in one chained
# expression, then get or create the session from it
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import numpy as np
import pandas as pd
from plotnine import *
from pyspark.ml.feature import (
    StringIndexer, 
    OneHotEncoder, 
    QuantileDiscretizer,
    VectorAssembler,
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    Imputer,
    PCA
)
from pyspark.sql import functions as F

## Read a sample CSV

In [None]:
# Load the sample housing CSV: comma separated, with a header row, and with
# the schema inferred from the data
data = spark.read.csv(
    './data/housing.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

## Split train - test

In [None]:
# Single random split into train (weight 0.7) and test (weight 0.3) datasets.
# A fixed seed keeps the split repeatable between runs instead of drawing a
# new random seed every time this cell is executed.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Row count of the full dataset
data.count()

In [None]:
# Row count of the train split
train.count()

In [None]:
# Row count of the test split
test.count()

## Categorical encoding - Indexing

In [None]:
# Derive a string categorical column from `chas`:
# 'NO' where chas equals 0, 'YES' for everything else
cat_data = data.withColumn(
    'category_chas',
    F.when(data['chas'] == 0, 'NO').otherwise('YES'),
)
cat_data.show(5)

In [None]:
# Indexer reading `category_chas` and writing `category_chas_indexed`
indexer = StringIndexer(
    inputCol='category_chas',
    outputCol='category_chas_indexed',
)

In [None]:
# Fit the indexer on the data that carries the categorical column;
# the fitted model is kept in `trained_indexer`
trained_indexer = indexer.fit(cat_data)

In [None]:
# Apply the fitted indexer to the same data, producing the
# `category_chas_indexed` column
transformed_data = trained_indexer.transform(cat_data)

In [None]:
# Show every distinct (original label, assigned index) pair
(
    transformed_data
    .select('category_chas', 'category_chas_indexed')
    .distinct()
    .show()
)

## Categorical encoding - OneHotEncoding

In [None]:
# One encoder handles both columns; input and output names are listed in
# the same order in the two lists
encoding_inputs = ['category_chas_indexed', 'rad']
encoding_outputs = ['category_chas_encoded', 'rad_encoded']
encoder = OneHotEncoder(
    inputCols=encoding_inputs,
    outputCols=encoding_outputs,
)

In [None]:
# Fit the encoder on the indexed data; the fitted model is kept in
# `trained_encoder`
trained_encoder = encoder.fit(transformed_data)

In [None]:
# Apply the fitted encoder to the indexed data, adding the encoded columns
encoded_data = trained_encoder.transform(transformed_data)

In [None]:
# Show the distinct label / index / encoded-vector combinations for `chas`
(
    encoded_data
    .select('category_chas', 'category_chas_indexed', 'category_chas_encoded')
    .distinct()
    .show()
)

In [None]:
# Show the distinct value / encoded-vector combinations for `rad`
(
    encoded_data
    .select('rad', 'rad_encoded')
    .distinct()
    .show()
)

## Discretization

In [None]:
# Discretizer that bins `crim` into 5 buckets, written to `crim_binned`
discretizer = QuantileDiscretizer(
    inputCol='crim',
    outputCol='crim_binned',
    numBuckets=5,
)

In [None]:
# Fit the discretizer on the full dataset; the fitted model is kept in
# `trained_discretizer`
trained_discretizer = discretizer.fit(data)

In [None]:
# Apply the fitted discretizer to our data, adding the `crim_binned` column
discretized_data = trained_discretizer.transform(data)

In [None]:
# For each bucket, show the smallest and largest `crim` value it holds,
# ordered by bucket
(
    discretized_data
    .groupBy('crim_binned')
    .agg(F.min('crim'), F.max('crim'))
    .orderBy('crim_binned')
    .show()
)

## Columns to vectors

In [None]:
# Assembler that packs every column of `data` into one `features` vector.
# NOTE(review): `data.columns` is the complete column list, so any
# label/target column ends up inside `features` too - confirm this is wanted.
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

In [None]:
# Apply the assembler to our data (it is used directly, without a fit step),
# adding the `features` column
vectorized_data = assembler.transform(data)

In [None]:
# Show the first 5 assembled feature vectors
(
    vectorized_data
    .select('features')
    .show(5)
)

## Feature scaling - Standard

In [None]:
# Standard scaler over `features`, with both mean centering and
# standard-deviation scaling switched on
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

In [None]:
# Fit the standard scaler on the vectorized data.
# NOTE(review): this fits on the full dataset rather than on `train`; fine
# for a demo, but confirm before reusing the pattern for model training.
trained_scaler = scaler.fit(vectorized_data)

In [None]:
# Apply the fitted standard scaler to our data, adding `scaledFeatures`
scaled_data = trained_scaler.transform(vectorized_data)

In [None]:
# Compare the raw and standard-scaled vectors for the first 5 rows
(
    scaled_data
    .select('features', 'scaledFeatures')
    .show(5)
)

## Feature scaling - MinMax

In [None]:
# Min-max scaler over `features`; rebinds `scaler`, replacing the
# previously created scaler
scaler = MinMaxScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [None]:
# Fit the min-max scaler on the vectorized data (rebinds `trained_scaler`)
trained_scaler = scaler.fit(vectorized_data)

In [None]:
# Apply the fitted min-max scaler to our data (rebinds `scaled_data`)
scaled_data = trained_scaler.transform(vectorized_data)

In [None]:
# Compare the raw and min-max-scaled vectors for the first 5 rows
(
    scaled_data
    .select('features', 'scaledFeatures')
    .show(5)
)

## Feature scaling - MaxAbs

In [None]:
# Max-abs scaler over `features`; rebinds `scaler`, replacing the
# previously created scaler
scaler = MaxAbsScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [None]:
# Fit the max-abs scaler on the vectorized data (rebinds `trained_scaler`)
trained_scaler = scaler.fit(vectorized_data)

In [None]:
# Apply the fitted max-abs scaler to our data (rebinds `scaled_data`)
scaled_data = trained_scaler.transform(vectorized_data)

In [None]:
# Compare the raw and max-abs-scaled vectors for the first 5 rows
(
    scaled_data
    .select('features', 'scaledFeatures')
    .show(5)
)

## Missing values - Impute

In [None]:
# Introduce missing values: copy `crim` into `missing_col`, but store null
# wherever crim is below 0.02
missing_data = data.withColumn(
    'missing_col',
    F.when(data['crim'] < 0.02, None).otherwise(data['crim']),
)
missing_data.show(5)

In [None]:
# Count the rows whose `missing_col` is null
missing_data.filter(missing_data['missing_col'].isNull()).count()

In [None]:
# Imputer using the 'mean' strategy: reads `missing_col`, writes
# `imputed_col`
imputer = Imputer(
    strategy='mean',
    inputCols=['missing_col'],
    outputCols=['imputed_col'],
)

In [None]:
# Fit the imputer on the data with missing values; the fitted model is kept
# in `trained_imputer`
trained_imputer = imputer.fit(missing_data)

In [None]:
# Apply the fitted imputer to our data, adding the `imputed_col` column
imputed_data = trained_imputer.transform(missing_data)

In [None]:
# Check the final results: first 5 rows, including `missing_col` and
# `imputed_col`
imputed_data.show(5)

In [None]:
# Check the number of missing values in the original and imputed columns:
# each null flag is cast to an integer and summed per column
imputed_data.select(*[
    F.sum(imputed_data[column].isNull().cast('Int')).alias(column)
    for column in ('missing_col', 'imputed_col')
]).show()

## Dimensionality reduction - PCA

In [None]:
# PCA keeping k=5 components: reads `features`, writes `pcaFeatures`
pca = PCA(
    k=5,
    inputCol='features',
    outputCol='pcaFeatures',
)

In [None]:
# Fit the PCA on the vectorized data; the fitted model is kept in
# `trained_pca`
trained_pca = pca.fit(vectorized_data)

In [None]:
# Apply the fitted PCA to our data, adding the `pcaFeatures` column
pca_data = trained_pca.transform(vectorized_data)

In [None]:
# Show the projected vectors for the first 5 rows
(
    pca_data
    .select('pcaFeatures')
    .show(5)
)

In [None]:
# Check explained variance per component (read from the fitted PCA model)
trained_pca.explainedVariance

In [None]:
# Visualize PCA results as a bar chart of explained variance per component.
# The labels are derived from the length of the variance array rather than a
# hard-coded range, so the two DataFrame columns always have the same length
# even if `k` is changed when the PCA is created.
values = trained_pca.explainedVariance.toArray()
labels = [f'PC{pc}' for pc in range(1, len(values) + 1)]
pca_results = pd.DataFrame({'comps': labels, 'variances': values})
ggplot(pca_results, aes(x='comps', y='variances')) + geom_bar(stat='identity')

## Close the session

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()