# Hello MLlib

In [10]:
# Prerequisites
from pyspark.sql import SparkSession

In [11]:
# Spark session and context: get (or create) a session on the "local" master
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext
print("Spark Version: ", spark.version)

Spark Version:  3.4.1


### Data Ingestion

In [12]:
# Path of the Parquet dataset to load
file_path = "data/sf-airbnb/sf-airbnb-clean.parquet/"

df_airbnb = spark.read.parquet(file_path)

# Preview a few columns, then report the total number of rows
(df_airbnb
 .select("neighbourhood_cleansed", "room_type", "bedrooms",
         "bathrooms", "number_of_reviews", "price")
 .show(5))
print("Total number of rows: ", df_airbnb.count())

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows

Total number of rows:  7146


### Data Cleansing

In [6]:
# List the column names of the loaded DataFrame
df_airbnb.columns

['host_is_superhost',
 'cancellation_policy',
 'instant_bookable',
 'host_total_listings_count',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'minimum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'price',
 'bedrooms_na',
 'bathrooms_na',
 'beds_na',
 'review_scores_rating_na',
 'review_scores_accuracy_na',
 'review_scores_cleanliness_na',
 'review_scores_checkin_na',
 'review_scores_communication_na',
 'review_scores_location_na',
 'review_scores_value_na']

In [7]:
# NOTE: describe() returns a new DataFrame. Without an action such as .show(),
# the cell only displays that DataFrame's schema, not the statistics themselves.
df_airbnb.describe()

DataFrame[summary: string, host_is_superhost: string, cancellation_policy: string, instant_bookable: string, host_total_listings_count: string, neighbourhood_cleansed: string, latitude: string, longitude: string, property_type: string, room_type: string, accommodates: string, bathrooms: string, bedrooms: string, beds: string, bed_type: string, minimum_nights: string, number_of_reviews: string, review_scores_rating: string, review_scores_accuracy: string, review_scores_cleanliness: string, review_scores_checkin: string, review_scores_communication: string, review_scores_location: string, review_scores_value: string, price: string, bedrooms_na: string, bathrooms_na: string, beds_na: string, review_scores_rating_na: string, review_scores_accuracy_na: string, review_scores_cleanliness_na: string, review_scores_checkin_na: string, review_scores_communication_na: string, review_scores_location_na: string, review_scores_value_na: string]

In [8]:
# NOTE: summary() returns a new DataFrame. Without an action such as .show(),
# the cell only displays that DataFrame's schema, not the statistics themselves.
df_airbnb.summary()

DataFrame[summary: string, host_is_superhost: string, cancellation_policy: string, instant_bookable: string, host_total_listings_count: string, neighbourhood_cleansed: string, latitude: string, longitude: string, property_type: string, room_type: string, accommodates: string, bathrooms: string, bedrooms: string, beds: string, bed_type: string, minimum_nights: string, number_of_reviews: string, review_scores_rating: string, review_scores_accuracy: string, review_scores_cleanliness: string, review_scores_checkin: string, review_scores_communication: string, review_scores_location: string, review_scores_value: string, price: string, bedrooms_na: string, bathrooms_na: string, beds_na: string, review_scores_rating_na: string, review_scores_accuracy_na: string, review_scores_cleanliness_na: string, review_scores_checkin_na: string, review_scores_communication_na: string, review_scores_location_na: string, review_scores_value_na: string]

### Split data into training and test sets

In [13]:
# Split with weights 0.8 / 0.2 into training and test sets, using a fixed seed
df_train, df_test = df_airbnb.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)
print(f"training set size = {df_train.count()}, test set size = {df_test.count()}")

training set size = 5780, test set size = 1366


### Prepare features with Transformers

Linear regression requires all the input features to be contained within a single vector in a DataFrame.

Transformers in Spark take a DataFrame as input and return a new DataFrame with one or more columns appended to it.


In [14]:
from pyspark.ml.feature import VectorAssembler

# Simple transformer experiment: put the single "bedrooms" column
# into a vector column named "features"
vec_assembler = VectorAssembler(
    inputCols=["bedrooms"],
    outputCol="features",
)
df_vec_train = vec_assembler.transform(df_train)
(df_vec_train
 .select("bedrooms", "features", "price")
 .show(10))

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



### Use an Estimator to Build a model

An estimator takes in a DataFrame and returns a Model. Estimators learn parameters from data, have an estimator_name.fit() method, and are eagerly evaluated (i.e., kick off Spark jobs), whereas transformers are lazily evaluated.

Other examples of estimators include Imputer, DecisionTreeClassifier, and RandomForestRegressor.


In [15]:
from pyspark.ml.regression import LinearRegression

# Estimator: reads the "features" vector column and predicts the "price" label
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)
# fit() trains on the assembled training data and returns the fitted model
lr_model = lr.fit(df_vec_train)

```lr.fit()``` returns a LinearRegressionModel (```lr_model```), which is a transformer. I.e., the output of an estimator’s ```fit()``` method is a transformer. Once the estimator has learned the parameters, the transformer can apply these parameters to new data points to generate predictions.

In [11]:
# Inspect learned parameters: slope (m) of the single feature and intercept (b),
# both rounded to three decimals
m, b = (
    round(value, 3)
    for value in (lr_model.coefficients[0], lr_model.intercept)
)

print(f"The linear regression formula is: price = {m}*bedrooms + {b}")

The linear regression formula is: price = 123.676*bedrooms + 47.51


### Creating a Pipeline

Often data preparation pipelines will have multiple steps. The Pipeline API streamlines the process: it manages the stages the data passes through and takes care of running them in order.


In [13]:
from pyspark.ml import Pipeline

# Create a Pipeline: the vector assembler followed by the linear regression,
# fitted together on the raw training DataFrame
pipeline = Pipeline(
    stages=[
        vec_assembler,
        lr,
    ]
)
pipeline_model = pipeline.fit(df_train)

Pipeline API also automatically knows which stages are transformers vs. estimators

# Run the test data through the fitted pipeline and preview the predictions
df_pred = pipeline_model.transform(df_test)
(df_pred
 .select("bedrooms", "features", "price", "prediction")
 .show(10))

### One Hot Encoding

To convert categorical values into numeric values, use one-hot encoding (OHE).

Spark internally uses a SparseVector when the majority of the entries are 0, as is often the case after OHE, so it does not waste space storing 0 values. This makes it more memory efficient than Pandas.


In [21]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Convert categorical values to one-hot encoded numerical values.
# Every string-typed column of the training set is treated as categorical.
categorical_cols = [field for (field, dataType) in df_train.dtypes if dataType == "string"]

print("Categorilcal Columns: ", categorical_cols)

# Derived column names: "<col>Index" for the indexer output, "<col>OHE" for the encoder output
index_output_cols = [x + "Index" for x in categorical_cols]
print("index_output_cols: ", index_output_cols)
ohe_output_cols = [x + "OHE" for x in categorical_cols]
print("ohe_output_cols: ", ohe_output_cols)

# handleInvalid="skip" filters out rows with invalid (unseen or NULL) values
string_indexer = StringIndexer(inputCols=categorical_cols,
                               outputCols=index_output_cols,
                               handleInvalid="skip")

# One-hot encode each index column produced by the StringIndexer
ohe_encoder = OneHotEncoder(inputCols=index_output_cols,
                            outputCols=ohe_output_cols)

# Numeric features: every double-typed column except the label column "price".
# Logical `and` is used (not bitwise `&`) because both operands are plain Python booleans.
numeric_cols = [field for (field, dataType) in df_train.dtypes
                if dataType == "double" and field != "price"]

# Final feature vector = one-hot encoded categoricals followed by the numeric columns
assembler_inputs = ohe_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")


Categorilcal Columns:  ['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type']
index_output_cols:  ['host_is_superhostIndex', 'cancellation_policyIndex', 'instant_bookableIndex', 'neighbourhood_cleansedIndex', 'property_typeIndex', 'room_typeIndex', 'bed_typeIndex']
ohe_output_cols:  ['host_is_superhostOHE', 'cancellation_policyOHE', 'instant_bookableOHE', 'neighbourhood_cleansedOHE', 'property_typeOHE', 'room_typeOHE', 'bed_typeOHE']
handleInvalid: how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels). (default: error, current: skip)
inputCol: input column name. (undefined)
inputCols: input column names. (current: ['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'neighbourhood_cleansed', 'proper

The difficulty with the above approach is that one needs to tell StringIndexer explicitly which features should be treated as categorical features.

Another approach is to use RFormula. The syntax is inspired by the R programming language. With RFormula, one provides labels and which features to include. It supports a limited subset of the R operators, including ~, ., :, +, and -.

The downside of RFormula automatically combining the StringIndexer and OneHotEncoder is that one-hot encoding is not required or recommended for all algorithms. E.g., tree-based algorithms can handle categorical variables directly and one can just use the StringIndexer for the categorical features.


In [24]:
# Using RFormula: "price ~ ." puts "price" on the label side of the formula
# and "." on the feature side
from pyspark.ml.feature import RFormula

r_formula = RFormula(
    formula="price ~ .",
    featuresCol="features",
    labelCol="price",
    handleInvalid="skip",
)

### Do Linear Regression with all the Features

In [29]:
# Linear regression on the assembled "features" vector with "price" as the label
lr = LinearRegression(
    featuresCol="features",
    labelCol="price",
)

# Stages: index the categoricals -> one-hot encode -> assemble the vector -> regress
pipeline = Pipeline(
    stages=[
        string_indexer,
        ohe_encoder,
        vec_assembler,
        lr,
    ]
)
pipeline_model = pipeline.fit(df_train)

# Predict on the test set and preview the result
df_pred = pipeline_model.transform(df_test)
(df_pred
 .select("features", "price", "prediction")
 .show(15))
# Tip: call .show(n=15, truncate=False) instead to print the full feature vectors.

+--------------------+------+-------------------+
|            features| price|         prediction|
+--------------------+------+-------------------+
|(98,[0,3,6,22,43,...|  85.0|  55.24365707389188|
|(98,[0,3,6,22,43,...|  45.0| 23.357685914717877|
|(98,[0,3,6,22,43,...|  70.0| 28.474464479034395|
|(98,[0,3,6,12,42,...| 128.0|  -91.6079079594947|
|(98,[0,3,6,12,43,...| 159.0|  95.05688229945372|
|(98,[0,3,6,12,43,...| 250.0| 263.54004583288406|
|(98,[0,3,6,11,42,...|  99.0| 152.64999694892595|
|(98,[0,3,6,31,42,...|  95.0| 180.93447481340718|
|(98,[0,3,6,28,42,...| 100.0|-52.840740912944966|
|(98,[0,3,6,28,43,...|2010.0| 261.05611673671683|
|(98,[0,3,6,33,43,...| 270.0| 164.99137394276022|
|(98,[0,3,6,33,43,...| 500.0| 376.95687662805994|
|(98,[0,3,6,33,42,...| 125.0|  78.23687803734356|
|(98,[0,3,6,13,42,...| 210.0| 362.62605262643956|
|(98,[0,3,6,18,43,...|  60.0| 107.34159362908304|
+--------------------+------+-------------------+
only showing top 15 rows



### Evaluate Model

##### RMSE = Root Mean-Square Error

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

# Calculate RMSE by comparing the "prediction" column against the "price" label
regression_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = regression_evaluator.evaluate(df_pred)

print("RMSE = ", rmse)

RMSE =  220.56321700343753


##### R-squared

R2 = 1 - SSres/SStot

If the model perfectly predicts every data point, then SSres = 0, making R2 = 1. If the model predicts only as well as the baseline (always predicting the average value), then SSres = SStot and R2 = 0.

If the model performs worse than always predicting ȳ (the average value), then SSres is larger than SStot and R2 is actually negative, which means the model is either really bad or something is wrong.



In [32]:
# Calculate R-squared.
# The metric is overridden for this call only (via the params map), so the shared
# regression_evaluator keeps its own configured metric instead of being
# permanently switched to "r2" by setMetricName().
r2 = regression_evaluator.evaluate(df_pred, {regression_evaluator.metricName: "r2"})

print("R-squared = ", r2)

R-squared =  0.16043316698848087


### Saving the Model

In [33]:
# Save the fitted pipeline model
pipeline_path = "/tmp/lr-pipeline-model"

(pipeline_model
 .write()
 .overwrite()  # overwrite whatever already exists at the target path
 .save(pipeline_path))

When loading back a saved model, one needs to specify the type of model one is loading back in (e.g., a LinearRegressionModel or a LogisticRegressionModel?). For this reason, it is recommended that one always puts one's transformers/estimators into a Pipeline, so that one can always load a PipelineModel and only needs to change the file path to the model.



In [34]:
# Reading back a Pipeline model
from pyspark.ml.pipeline import PipelineModel

# pipeline_path is the location the fitted pipeline model was saved to above
saved_pipeline_model = PipelineModel.load(pipeline_path)

After loading, one can apply the model to new data points. However, one can’t use the weights from this model as initialization parameters for training a new model (as opposed to starting with random weights), as Spark has no concept of “warm starts.” If the dataset changes slightly, one has to retrain the entire linear regression model from scratch.


## Hyperparameter Tuning with Tree-Based Models

##### DecisionTreeRegressors

For decision trees, one doesn’t have to worry about standardizing or scaling input features, because this has no impact on the splits, but one has to be careful about how to prepare categorical features.

Tree-based methods can naturally handle categorical variables. In spark.ml, one just needs to pass the categorical columns to the StringIndexer, and the decision tree can take care of the rest. 

In [37]:
# Prepare DecisionTreeRegressor
from pyspark.ml.regression import DecisionTreeRegressor

# maxBins is set explicitly in the constructor (same style as the random forest
# cell) rather than mutating the estimator after the pipeline has been built.
# NOTE(review): 40 presumably has to be at least the number of distinct values of the
# largest categorical feature - confirm against the data.
dt = DecisionTreeRegressor(labelCol="price", maxBins=40)

# Filter for numeric columns, exclude label column "price".
# Logical `and` is used (not bitwise `&`) because both operands are plain Python booleans.
numeric_cols = [field for (field, dataType) in df_train.dtypes
                if dataType == "double" and field != "price"]

# Combine output of StringIndexer (defined earlier in notebook) and numeric columns
assembler_inputs = index_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# Combine stages into a pipeline: index categoricals -> assemble vector -> decision tree
stages = [string_indexer, vec_assembler, dt]
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df_train)


maxBins determines the number of bins into which continuous features are discretized, or split. This discretization step is crucial for performing distributed training. There is no maxBins parameter in scikit-learn because all of the data and the model reside on a single machine. In Spark, however, workers have all the columns of the data, but only a subset of the rows. Thus, when communicating about which features and values to split on, we need to be sure they’re all talking about the same split values, which we get from the common discretization set up at training time.

##### Extract Decision Tree rules

In [38]:
# Decision Tree rules
# The last pipeline stage is the fitted decision tree (dt was the final stage when fitting)
dt_model = pipeline_model.stages[-1]
# toDebugString holds the textual if/else rules of the fitted tree
print(dt_model.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_ff0abbfa2ac7, depth=5, numNodes=47, numFeatures=33
  If (feature 12 <= 2.5)
   If (feature 12 <= 1.5)
    If (feature 5 in {1.0,2.0})
     If (feature 4 in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 104.23992784125075
      Else (feature 3 not in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 250.7111111111111
     Else (feature 4 not in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,27.0,33.0,35.0})
       Predict: 151.94179894179894
      Else (feat

### Determine Most Important Features

In [40]:
import pandas as pd

# Pair each assembler input column with the importance at the same position
importance_rows = [
    (column_name, importance)
    for column_name, importance in zip(vec_assembler.getInputCols(),
                                       dt_model.featureImportances)
]
feature_importances = pd.DataFrame(importance_rows, columns=["feature", "importance"])

# Display with the most important features first
feature_importances.sort_values("importance", ascending=False)



Unnamed: 0,feature,importance
12,bedrooms,0.283406
1,cancellation_policyIndex,0.167893
2,instant_bookableIndex,0.140081
4,property_typeIndex,0.128179
15,number_of_reviews,0.126233
3,neighbourhood_cleansedIndex,0.0562
9,longitude,0.03881
14,minimum_nights,0.029473
13,beds,0.015218
5,room_typeIndex,0.010905


### Random Forests

Random forests are an ensemble of decision trees with two key tweaks: 
- Bootstrapping samples by rows: Bootstrapping is a technique for simulating new data by sampling with replacement from the original data. Each decision tree is trained on a different bootstrap sample of the dataset, which produces slightly different decision trees, and their predictions are then aggregated. This technique is known as bootstrap aggregating, or bagging. In a typical random forest implementation, each tree samples the same number of data points with replacement from the original data set, and that number can be controlled through the subsamplingRate parameter.<br /><br />
- Random feature selection by columns: The main drawback with bagging is that the trees are all highly correlated, and thus learn similar patterns in your data. To mitigate this problem, each time one wants to make a split, one only considers a random subset of the columns (1/3 of the features for RandomForestRegressor and $\sqrt{\text{number of features}}$ for RandomForestClassifier). Due to this randomness, one typically wants each tree to be quite shallow. Each of the trees learns something different about your data set, and combining this collection of “weak” learners into an ensemble makes the forest much more robust than a single decision tree.


In [41]:
# Instantiate Random Forest Regressor
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(
    labelCol="price",
    maxBins=40,
    seed=42,
)

Random forests truly demonstrate the power of distributed machine learning with Spark, as each tree can be built independently of the other trees

### k-fold Cross-Validation

Split our training data into k subsets, or “folds” (e.g., three). Then, for a given hyperparameter configuration, train the model on k–1 folds and evaluate on the remaining fold, repeating this process k times.


In [43]:
# k-fold validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

pipeline = Pipeline(stages=[string_indexer, vec_assembler, rf])

# Hyperparameter grid: maxDepth in {2, 4, 6} combined with numTrees in {10, 100}
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.numTrees, [10, 100])
    .build()
)

# Evaluation criterion: RMSE between the "price" label and the "prediction" column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)

# Perform 3-fold cross-validation of the whole pipeline over the grid
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

cv_model = cv.fit(df_train)

Inspect results:

In [44]:
# Pair every evaluated parameter map with its average metric across the folds
[(param_map, avg_metric)
 for param_map, avg_metric in zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics)]

[({Param(parent='RandomForestRegressor_f2123dc9428f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_f2123dc9428f', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  291.1822640924783),
 ({Param(parent='RandomForestRegressor_f2123dc9428f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_f2123dc9428f', name='numTrees', doc='Number of trees to train (>= 1).'): 100},
  286.7714750274078),
 ({Param(parent='RandomForestRegressor_f2123dc9428f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4,
   Param(parent='RandomForestRegressor_f2123dc9428f', name

### Optimizing Pipelines

Add parallelism parameter

In [45]:
# Set the cross-validator's parallelism to 4, then refit on the training set
cv.setParallelism(4)
cv_model = cv.fit(df_train)

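Another trick: put the cross-validator inside the pipeline (instead of the pipeline inside the cross-validator), so that the StringIndexer and VectorAssembler stages are fitted once rather than being re-run for every fold and parameter combination.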

In [46]:
# The cross-validator wraps only the random forest and is itself
# used as the last stage of the pipeline
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=42,
)

pipeline = Pipeline(
    stages=[
        string_indexer,
        vec_assembler,
        cv,
    ]
)
pipeline_model = pipeline.fit(df_train)