## References

##### General
[MLlib](https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier)

[MLlib Classification](https://people.eecs.berkeley.edu/~jegonzal/pyspark/_modules/pyspark/ml/classification.html)

[Feature Engineering](https://docs.databricks.com/applications/machine-learning/preprocess-data/mllib.html) 

[Feature Transformers](https://spark.apache.org/docs/latest/ml-features)

[Feature Importance](https://spark.apache.org/docs/2.1.0/api/python/pyspark.ml.html?highlight=featureimportance)

##### Tree Algorithms
[Decision Tree](https://spark.apache.org/docs/1.5.2/ml-decision-tree.html)

[Gradient Boosted Trees](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.GBTClassifier.html)

[Xgboost](https://databricks.github.io/spark-deep-learning/_modules/sparkdl/xgboost/xgboost.html)

## Libraries

In [0]:
# general
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# sql 
from pyspark.sql import functions as f
from pyspark.sql import SQLContext

# ml pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.feature import OneHotEncoder

# models
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression

from sparkdl.xgboost import XgboostClassifier

# metrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report

sqlContext = SQLContext(sc)

## Helper Functions

In [0]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
  '''
  Locates the indices from the pipeline transformations for each feature using the schema
  Maps through each index to find its feature importance
  
  output:                Pandas dataframe
  '''
  list_extract = []
  for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
      list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
  varlist = pd.DataFrame(list_extract)
  varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
  return(varlist.sort_values('score', ascending = False))

In [0]:
def eval_metrics(prediction):
  '''
  Calculates roc_auc and pr_auc when given the predictions
  '''
  
  # initialize evaluators
  eval_roc_auc = BinaryClassificationEvaluator(metricName='areaUnderROC')
  eval_pr_auc = BinaryClassificationEvaluator(metricName='areaUnderPR')
  
  # calculate metrics
  roc_auc = eval_roc_auc.evaluate(prediction)
  pr_auc = eval_pr_auc.evaluate(prediction)
  
  # return metrics
  metrics = {'roc_auc':roc_auc, 'pr_auc': pr_auc}
  return metrics

In [0]:
def cv_scores(cvModel):
  '''
  Shows scores for each iteration during cross validation and param grid search
  '''
  params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]

  scores = pd.DataFrame.from_dict([
      {cvModel.getEvaluator().getMetricName(): metric, **ps} 
      for ps, metric in zip(params, cvModel.avgMetrics)
  ])
  
  return scores

## Import data

In [0]:
data_6m = spark.read.option("header", "true").parquet(f"dbfs:/tmp/out/final_air_weather_6m.parquet")

In [0]:
print('shape: ', (data_6m.count(), len(data_6m.columns)))

## Undersampling - Optional

In [0]:
delayed_df = data_6m.filter(f.col('DEP_DEL15')==1)
ontime_df = data_6m.filter(f.col('DEP_DEL15')==0)
sampleRatio = 1.0* delayed_df.count() / data_6m.count()
new_ontime_df = ontime_df.sample(False, sampleRatio)
new_data_6m = delayed_df.unionAll(new_ontime_df)

print(new_data_6m.count(), len(new_data_6m.columns))

## Prep Data

In [0]:
def data_pipeline(data, label_col): 
  '''
  This funcion transforms input data into two columns: label and features.
  StringIndexer and one-hot encoding are applied to categorical features.  
  VectorAssembler combines both categorical and numeric features into one column.
  '''
  
  # stages in pipeline
  stages = []

  # convert label into label indices using the StringIndexer
  label_stringIdx = StringIndexer(inputCol=label_col, outputCol="label")
  stages += [label_stringIdx]

  # one hot encode categorical variables
  categoricalColumns =  [i[0] for i in data.dtypes if i[1]=='string']

  for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + ",classVec"])
    stages += [stringIndexer, encoder]

  # grabs numeric features - excluding our output
  numericCols = [i[0] for i in data.drop(label_col).dtypes if i[1]!='string']

  # transform all features into a single column called features using VectorAssembler
  assemblerInputs = [c + ",classVec" for c in categoricalColumns] + numericCols
  assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
  stages += [assembler]

  # puts data through all the feature transformations
  partialPipeline = Pipeline().setStages(stages)
  pipelineModel = partialPipeline.fit(data)
  preppedDataDF = pipelineModel.transform(data)

  # rename
  dataset = preppedDataDF
  
  return dataset

In [0]:
# prepped dataset for train/test split
dataset_6m = data_pipeline(data_6m, "DEP_DEL15")

In [0]:
new_dataset_6m = data_pipeline(new_data_6m, "DEP_DEL15")

## Train/Test Split

In [0]:
# train test split
(trainingData_6m, testData_6m) = dataset_6m.randomSplit([0.8, 0.2], seed=2021)

In [0]:
# train test split - for undersampling
(trainingData_6m, testData_6m) = new_dataset_6m.randomSplit([0.8, 0.2], seed=2021)

In [0]:
# train distribution
print('train')
print(trainingData_6m.filter(f.col('label')==0).count()/trainingData_6m.count())
print(trainingData_6m.filter(f.col('label')==1).count()/trainingData_6m.count())

# test distribution
print('test')
print(testData_6m.filter(f.col('label')==0).count()/testData_6m.count())
print(testData_6m.filter(f.col('label')==1).count()/testData_6m.count())

## Logistic Regression

In [0]:
lr = LogisticRegression(labelCol="label", featuresCol="features", regParam=0.001, elasticNetParam=0)

In [0]:
# create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.01, 0.001])
             .addGrid(lr.elasticNetParam, [0, 0.5, 1])
             .build())

# create 5-fold CrossValidator
evaluator = MulticlassClassificationEvaluator(metricName='f1')
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
 
# run cross validations - 2 minutes
lr_cvModel = cv.fit(trainingData_6m)

# model
print()
print("regParam = ", lr_cvModel.bestModel.regParam)
print("elasticParam = ", lr_cvModel.bestModel.elasticNetParam)

In [0]:
# scores during training
cv_scores(lr_cvModel)

Unnamed: 0,f1,regParam,elasticNetParam
0,0.623005,0.1,0.0
1,0.43927,0.1,0.5
2,0.40715,0.1,1.0
3,0.633189,0.01,0.0
4,0.624669,0.01,0.5
5,0.615927,0.01,1.0
6,0.634189,0.001,0.0
7,0.63488,0.001,0.5
8,0.634408,0.001,1.0


In [0]:
# use test set to measure the accuracy of the model on new data
lrPred = lr_cvModel.bestModel.transform(testData_6m)

# evaluate predictions
lrScore = eval_metrics(lrPred)
print(lrScore)

# classification report
print(classification_report(testData_6m.select(f.col('label')).toPandas(), lrPred.select(f.col('prediction')).toPandas()))

## Decision Tree

##### Train

In [0]:
# create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

In [0]:
# create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [5, 10, 20])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

# create 5-fold CrossValidator
evaluator = MulticlassClassificationEvaluator(metricName='f1')
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
 
# run cross validations - 2 minutes
dt_cvModel = cv.fit(trainingData_6m)

# tree
print()
print("numNodes = ", dt_cvModel.bestModel.numNodes)
print("depth = ", dt_cvModel.bestModel.depth)

In [0]:
# scores during training
cv_scores(dt_cvModel)

Unnamed: 0,f1,maxDepth,maxBins
0,0.630247,5,20
1,0.628356,5,40
2,0.632747,5,80
3,0.657078,10,20
4,0.65598,10,40
5,0.655706,10,80
6,0.641049,20,20
7,0.639394,20,40
8,0.638734,20,80


##### Score

In [0]:
# use test set to measure the accuracy of the model on new data
dtPred = dt_cvModel.bestModel.transform(testData_6m)

# evaluate predictions
dtScore = eval_metrics(dtPred)
print(dtScore)

# classification report
print(classification_report(testData_6m.select(f.col('label')).toPandas(), dtPred.select(f.col('prediction')).toPandas()))

##### Feature Importance

In [0]:
# get feature importance with helper function
ExtractFeatureImp(dt_cvModel.bestModel.featureImportances, new_dataset_6m, "features").head(10)

Unnamed: 0,idx,name,score
13,237,CLOUD_BASE_HEIGHT,0.162027
8,232,TMP_TEMP,0.136012
2,226,CRS_DEP_TIME,0.117774
15,0,"OP_CARRIER,classVec_DL",0.094788
9,233,DEW_TEMP,0.080795
0,224,MONTH,0.049125
7,231,VIS_DIST,0.039958
1,225,DAY_OF_WEEK,0.031692
3,227,CRS_ARR_TIME,0.031359
27,12,"ORIGIN,classVec_ATL",0.030273


## Random Forest

##### Train

In [0]:
# create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [0]:
# create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [5, 10])
             .addGrid(rf.maxBins, [20, 40, 80])
             .addGrid(rf.numTrees, [10, 20, 30])
             .build())

# create 5-fold CrossValidator
evaluator = MulticlassClassificationEvaluator(metricName='f1')
rf_cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
 
# run cross validations - 10 minutes
rf_cvModel = rf_cv.fit(trainingData_6m)

# RF
print()
print("num trees = ", rf_cvModel.bestModel.getNumTrees)
print("maxdepth = ", rf_cvModel.bestModel.getOrDefault('maxDepth'))
print("maxbins = ", rf_cvModel.bestModel.getMaxBins())

In [0]:
# scores during training
cv_scores(rf_cvModel)

Unnamed: 0,f1,maxDepth,maxBins,numTrees
0,0.587836,5,20,10
1,0.579393,5,20,20
2,0.59324,5,20,30
3,0.591456,5,40,10
4,0.582066,5,40,20
5,0.590128,5,40,30
6,0.593355,5,80,10
7,0.581344,5,80,20
8,0.590644,5,80,30
9,0.639977,10,20,10


##### Score

In [0]:
# make predictions on test data 
rfPred = rf_cvModel.bestModel.transform(testData_6m)

# evaluate predictions
rfScore = eval_metrics(rfPred)
print(rfScore)

# classification report
print(classification_report(testData_6m.select(f.col('label')).toPandas(), rfPred.select(f.col('prediction')).toPandas()))

##### Feature Importance

In [0]:
# get feature importance with helper function
ExtractFeatureImp(rf_cvModel.bestModel.featureImportances, new_dataset_6m, "features").head(10)

Unnamed: 0,idx,name,score
13,237,CLOUD_BASE_HEIGHT,0.14389
2,226,CRS_DEP_TIME,0.111233
8,232,TMP_TEMP,0.092173
9,233,DEW_TEMP,0.081657
15,0,"OP_CARRIER,classVec_DL",0.063212
3,227,CRS_ARR_TIME,0.062272
7,231,VIS_DIST,0.056023
0,224,MONTH,0.041549
27,12,"ORIGIN,classVec_ATL",0.040412
10,234,SLP_PRESSURE,0.033812


## Gradient Boosted Tree

##### Train

In [0]:
# create an initial RandomForest model.
gb = GBTClassifier(labelCol="label", featuresCol="features", seed=2021)

In [0]:
# create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(gb.maxDepth, [5, 10])
             .addGrid(gb.maxBins, [20, 80])
             .addGrid(gb.maxIter, [10, 20, 50])
             .build())

# create 5-fold CrossValidator
evaluator = MulticlassClassificationEvaluator(metricName='f1')
gb_cv = CrossValidator(estimator=gb, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
 
# run cross validations - 5hrs
gb_cvModel = gb_cv.fit(trainingData_6m)

# GBT
print()
print("num trees = ", gb_cvModel.bestModel.getMaxIter())
print("maxdepth = ", gb_cvModel.bestModel.getMaxDepth())

##### Score

In [0]:
# make predictions on test data
gbPred = gb_cvModel.bestModel.transform(testData_6m)

# evaluate predictions
gbScore = eval_metrics(gbPred)
print(gbScore)

# classification report
print(classification_report(testData_6m.select(f.col('label')).toPandas(), gbPred.select(f.col('prediction')).toPandas()))

##### Feature Importance

In [0]:
# get feature importance with helper function
ExtractFeatureImp(gb_cvModel.bestModel.featureImportances, new_dataset_6m, "features").head(10)

Unnamed: 0,idx,name,score
9,233,DEW_TEMP,0.085205
13,237,CLOUD_BASE_HEIGHT,0.084095
8,232,TMP_TEMP,0.083123
10,234,SLP_PRESSURE,0.074958
2,226,CRS_DEP_TIME,0.062432
6,230,WND_SPD,0.059636
14,238,ALTIMETER_SET,0.050972
1,225,DAY_OF_WEEK,0.050899
3,227,CRS_ARR_TIME,0.046449
0,224,MONTH,0.042926


## XGBoost

##### Train

In [0]:
# initiate model
xgb = XgboostClassifier(labelCol="label", featuresCol="features", missing=0)

In [0]:
# create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(xgb.max_depth, [1, 5, 10])
             .addGrid(xgb.n_estimators, [10, 100, 200, 400])
             .build())

# create 5-fold CrossValidator
evaluator = MulticlassClassificationEvaluator(metricName='f1')
xgb_cv = CrossValidator(estimator=xgb, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
 
# run cross validations - 25 minutes
xgb_cvModel = xgb_cv.fit(trainingData_6m)

# RF
print()
print("maxdepth = ", xgb_cvModel.bestModel.getOrDefault('max_depth'))
print("n_estimators = ", xgb_cvModel.bestModel.getOrDefault('n_estimators'))

In [0]:
# scores during training
cv_scores(xgb_cvModel)

Unnamed: 0,f1,max_depth,n_estimators
0,0.608854,1,10
1,0.649231,1,100
2,0.65285,1,200
3,0.654781,1,400
4,0.656952,5,10
5,0.679286,5,100
6,0.680005,5,200
7,0.679487,5,400
8,0.680527,10,10
9,0.67723,10,100


#### Score

In [0]:
# use test set to measure the accuracy of the model on new data
xgbPred = xgb_cvModel.bestModel.transform(testData_6m)

# evaluate predictions
xgbScore = eval_metrics(xgbPred)
print(xgbScore)

# classification report
print(classification_report(testData_6m.select(f.col('label')).toPandas(), xgbPred.select(f.col('prediction')).toPandas()))

## Save models

In [0]:
# save models
dt_cvModel.bestModel.write().overwrite().save("dbfs:/tmp/out/sl_dt_cv")
rf_cvModel.bestModel.write().overwrite().save("dbfs:/tmp/out/sl_rf_cv")
gb_cvModel.bestModel.write().overwrite().save("dbfs:/tmp/out/sl_gb_cv")
xgbb_cvModel.bestModel.write().overwrite().save("dbfs:/tmp/out/sl_xgb_cv")