# Gradient Boost

In [0]:
import pyspark.pandas as ps
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType
from pyspark.sql.functions import col, current_date, datediff, skewness, kurtosis, max, length, coalesce

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler, Imputer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

from sklearn.metrics import classification_report, precision_recall_fscore_support

In [0]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import numpy as np
import time

In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.
blob_container  = "261-final-project"       # The name of your container created in https://portal.azure.com
storage_account = "ansonbquon"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "final_project"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "project_key"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket

# the 261 course blob storage is mounted here.
mids261_mount_path      = "dbfs:/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

# see what's in the blob storage root folder 
display(dbutils.fs.ls(f"{team_blob_url}/PRD/1ymodel/"))

## Build Pipeline

## Load training and testing data

In [0]:
# Use of 2015 train dataset and test dataset 
validationData = spark.read.parquet(f'{team_blob_url}/PRD/1ymodel/df_test_processed')
trainingData = spark.read.parquet(f'{team_blob_url}/PRD/1ymodel/df_train_processed')

trainingData.limit(5).display()

In [0]:
print("Training Data size: ",(trainingData.count(), len(trainingData.columns)))
print("Validation Data size: ",(validationData.count(), len(validationData.columns)))

## Baseline features

In [0]:
label = 'depDel15'

features_columns = [
    'distance',
    'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    'dayOfWeekSine',
    'originIndex',
    'originTypeIndex',
    'originPageRank',
    'destPageRank',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    'arrDel15PrevFIndex',
    'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    'originTypePrevFIndex',
    'distancePrevF',
    'daysToNearestHoliday'
]

len(features_columns)

19

## Pre-process data and build pipeline

In [0]:
# define pipeline - a sequence of transformations to be applied to the data

# StringIndexer that will convert categorical string labels into numerical indices
gb_label = StringIndexer(inputCol=label, outputCol='label')
# VectorAssembler that will combine multiple feature columns into a single vector column
gb_features = VectorAssembler(inputCols=features_columns, outputCol="features")

# Chain indexers and GBT in a Pipeline

# specifies a sequence of transformations to be applied to the data
pipeline = Pipeline(stages=[gb_label, gb_features])

# processes the data through each stage of the pipeline, fitting the indexer to learn how to index the categorical features and preparing the assembler to combine features
model = pipeline.fit(trainingData)

# applies the fitted pipeline model to the input data to transform it according to the defined stages.
transformed_training_data = model.transform(trainingData)
transformed_validation_data = model.transform(validationData)

## Hyperparameters

In [0]:
# GBT Hyper-parameters
gbt_max_iterations = 10
max_decision_tree_depth = 10


## Training and Testing

In [0]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=gbt_max_iterations, maxBins=320, maxDepth=max_decision_tree_depth)
gbt_model = gbt.fit(transformed_training_data)

In [0]:
# Test the GBT model
predictions = gbt_model.transform(transformed_validation_data)
result = predictions.select("label","prediction")


## Evaluation Metrics

In [0]:

predicted_labels = result.select("prediction").toPandas().to_numpy().astype(int)
true_labels = result.select("label").toPandas().to_numpy().astype(int)

TP = np.sum((predicted_labels == 1) & (true_labels == 1))
FP = np.sum((predicted_labels == 1) & (true_labels == 0))
FN = np.sum((predicted_labels == 0) & (true_labels == 1))

# Calculate precision and recall
precision = TP / (TP + FP) 
recall = TP / (TP + FN)

# Calculate F-beta score
beta = 2
fbeta_score = (1+(beta*beta)) * (precision * recall) / ((precision * (beta*beta)) + recall)

print("Baseline performance on the validation set")
print("Precision: ",precision)
print("Recall: ", recall)
print("F-beta Score: ",fbeta_score)

Baseline performance on the validation set
Precision:  0.7128742857776021
Recall:  0.74422962728752
F-beta Score:  0.7377398199090754


## Early Stopping

In [0]:
# set patience parameter, to avoid stop too early, if the model continue traininng result to evaluation metric worse than the last best one, early stop the training iteration
max_patience = 3

In [0]:
def get_fbeta_score(result):
  predicted_labels = result.select("prediction").toPandas().to_numpy().astype(int)
  true_labels = result.select("label").toPandas().to_numpy().astype(int)

  TP = np.sum((predicted_labels == 1) & (true_labels == 1))
  FP = np.sum((predicted_labels == 1) & (true_labels == 0))
  FN = np.sum((predicted_labels == 0) & (true_labels == 1))

  # Calculate precision and recall
  precision = TP / (TP + FP) 
  recall = TP / (TP + FN)

  # Calculate F-beta score
  beta = 2
  fbeta_score = (1+(beta*beta)) * (precision * recall) / ((precision * (beta*beta)) + recall)
  return fbeta_score


best_fbeta = 0.0
best_iteration = 0

# count extra loop the model run after getting the best result
patience_counter = 0

for idx in range(gbt_max_iterations):
  # set number of iterations in each run
  gbt.setMaxIter(idx+1)
  # Train
  gbt_model = gbt.fit(transformed_training_data)
  # Validate
  predictions = gbt_model.transform(transformed_validation_data)
  # Evaluate
  fbeta = get_fbeta_score(predictions)
  print("iteration: ", idx+1, "F-beta score: ",fbeta)
  # Early stop
  if fbeta > best_fbeta:
    best_fbeta = fbeta
    best_iteration = idx + 1
    patience_counter = 0
  else:
    patience_counter += 1

  if patience_counter > max_patience:
    # early stop
    print("early stop")
    break
  
print("best iteration: ", best_iteration, " and F-beta score: ", best_fbeta)




## Features importance

In [0]:
feature_importances = gbt_model.featureImportances

for feature, importance in zip(features_columns, feature_importances):
  print(feature, "{:.2f}".format(importance))

distance 0.02
quarterIndex 0.01
crsDepTimeSine 0.10
monthSine 0.04
dayOfWeekSine 0.02
originIndex 0.22
originTypeIndex 0.00
originPageRank 0.00
destPageRank 0.02
timeBtwFlightsPlanned 0.12
depDel15PrevFIndex 0.17
depDelayGroupPrevFIndex 0.08
depTimeBlkPrevFIndex 0.03
arrDel15PrevFIndex 0.00
arrDelayGroupPrevFIndex 0.01
arrTimeBlkPrevFIndex 0.04
originTypePrevFIndex 0.00
distancePrevF 0.10
daysToNearestHoliday 0.02


The top 3 features are: originIndex, depDel15PrevFIndex and timeBtwFlightsPlanned

## Features selections
### 1. Baseline (baseline features)

In [0]:
label = 'depDel15'

features_columns = [
    'distance',
    'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    'dayOfWeekSine',
    'originIndex',
    'originTypeIndex',
    'originPageRank',
    'destPageRank',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    'arrDel15PrevFIndex',
    'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    'originTypePrevFIndex',
    'distancePrevF',
    'daysToNearestHoliday'
]

len(features_columns)

19

In [0]:
def evaluation_metrics(result):
    predicted_labels = result.select("prediction").toPandas().to_numpy().astype(int)
    true_labels = result.select("label").toPandas().to_numpy().astype(int)

    TP = np.sum((predicted_labels == 1) & (true_labels == 1))
    FP = np.sum((predicted_labels == 1) & (true_labels == 0))
    FN = np.sum((predicted_labels == 0) & (true_labels == 1))

    # Calculate precision and recall
    precision = TP / (TP + FP) 
    recall = TP / (TP + FN)

    # Calculate F-beta score
    beta = 2
    fbeta_score = (1+(beta*beta)) * (precision * recall) / ((precision * (beta*beta)) + recall)    

    print("Baseline performance on the validation set")
    print("Precision: ",precision)
    print("Recall: ", recall)
    print("F-beta Score: ",fbeta_score)

    return precision, recall, fbeta_score

In [0]:
def runGBT(maxIter, maxTreeDepth):

    # define pipeline - a sequence of transformations to be applied to the data

    # StringIndexer that will convert categorical string labels into numerical indices
    gb_label = StringIndexer(inputCol=label, outputCol='label')
    # VectorAssembler that will combine multiple feature columns into a single vector column
    gb_features = VectorAssembler(inputCols=features_columns, outputCol="features")

    # Chain indexers and GBT in a Pipeline

    # specifies a sequence of transformations to be applied to the data
    pipeline = Pipeline(stages=[gb_label, gb_features])

    # processes the data through each stage of the pipeline, fitting the indexer to learn how to index the categorical features and preparing the assembler to combine features
    model = pipeline.fit(trainingData)

    # applies the fitted pipeline model to the input data to transform it according to the defined stages.
    transformed_training_data = model.transform(trainingData)
    transformed_validation_data = model.transform(validationData)

    print("Number of features used in Training: ",len(features_columns))

    # Train a GBT model.
    start = time.time()

    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=maxIter, maxBins=320, maxDepth=maxTreeDepth)
    gbt_model = gbt.fit(transformed_training_data)

    # Training dataset prediction evaluation results
    predictions = gbt_model.transform(transformed_training_data)
    result = predictions.select("label","prediction")
    print("Evaluation Result - Training Data")
    evaluation_metrics(result)
    print("\n")

    # Test the GBT model
    predictions = gbt_model.transform(transformed_validation_data)
    result = predictions.select("label","prediction")
    print("Evaluation Result - Testing Data")
    evaluation_metrics(result)

    print(f'Time to complete: {time.time() - start} seconds.')
    print("\n")

    feature_importances = gbt_model.featureImportances

    for feature, importance in zip(features_columns, feature_importances):
        print(feature, "{:.2f}".format(importance))

    return

In [0]:
# GBT Hyper-parameters
gbt_max_iterations = 10
max_decision_tree_depth = 5

runGBT(maxIter=gbt_max_iterations, maxTreeDepth=max_decision_tree_depth)





Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Number of features used in Training:  19


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6830794599852896
Recall:  0.7582355791769162
F-beta Score:  0.741909788758745


Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7120185978974696
Recall:  0.7487449005560225
F-beta Score:  0.7410996368943435
Time to complete: 59.13782739639282 seconds.


distance 0.01
quarterIndex 0.00
crsDepTimeSine 0.16
monthSine 0.06
dayOfWeekSine 0.01
originIndex 0.20
originTypeIndex 0.00
originPageRank 0.00
destPageRank 0.00
timeBtwFlightsPlanned 0.13
depDel15PrevFIndex 0.17
depDelayGroupPrevFIndex 0.09
depTimeBlkPrevFIndex 0.01
arrDel15PrevFIndex 0.00
arrDelayGroupPrevFIndex 0.00
arrTimeBlkPrevFIndex 0.03
originTypePrevFIndex 0.00
distancePrevF 0.12
daysToNearestHoliday 0.01


### 2. No Weather

In [0]:
label = 'depDel15'

features_columns = [
    'distance',
    'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    'dayOfWeekSine',
    'dayOfMonthIndex',
    'originIndex',
    'originTypeIndex',
    'originPageRank',
    'destPageRank',
    'opCarrierFlNumBinIndex1',
    # 'hourlyAltimeterSetting',
    # 'hourlyDewPointTemperature',
    # 'hourlyDryBulbTemperature',
    # 'hourlyPrecipitation',
    # 'hourlyPressureChange',
    # 'hourlyPressureTendency',
    # 'hourlyRelativeHumidity',
    # 'hourlySeaLevelPressure',
    # 'hourlyStationPressure',
    # 'hourlyVisibility',
    # 'hourlyWetBulbTemperature',
    # 'hourlyWindDirection',
    # 'hourlyWindGustSpeed',
    # 'hourlyWindSpeed',
    'daysToNearestHoliday',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    'arrDel15PrevFIndex',
    'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    'originTypePrevFIndex',
    'distancePrevF',
    'daysToNearestHoliday'
]

len(features_columns)

22

In [0]:
# GBT Hyper-parameters
gbt_max_iterations = 10
max_decision_tree_depth = 10

runGBT(maxIter=gbt_max_iterations, maxTreeDepth=max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Number of features used in Training:  22


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.7056945674420144
Recall:  0.7859697545918556
F-beta Score:  0.7684861650704822


Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7114411352896564
Recall:  0.7523540344410083
F-beta Score:  0.7437992900404384
Time to complete: 85.96324610710144 seconds.


distance 0.01
quarterIndex 0.01
crsDepTimeSine 0.07
monthSine 0.04
dayOfWeekSine 0.01
dayOfMonthIndex 0.14
originIndex 0.24
originTypeIndex 0.00
originPageRank 0.00
destPageRank 0.02
opCarrierFlNumBinIndex1 0.00
daysToNearestHoliday 0.01
timeBtwFlightsPlanned 0.10
depDel15PrevFIndex 0.12
depDelayGroupPrevFIndex 0.07
depTimeBlkPrevFIndex 0.03
arrDel15PrevFIndex 0.00
arrDelayGroupPrevFIndex 0.01
arrTimeBlkPrevFIndex 0.03
originTypePrevFIndex 0.00
distancePrevF 0.08
daysToNearestHoliday 0.00


### 3. No Weather + No features with 0.00 importance

In [0]:
label = 'depDel15'

features_columns = [
    'distance',
    #'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    #'dayOfWeekSine',
    'dayOfMonthIndex',
    'originIndex',
    #'originTypeIndex',
    #'originPageRank',
    'destPageRank',
    #'opCarrierFlNumBinIndex1',
    # 'hourlyAltimeterSetting',
    # 'hourlyDewPointTemperature',
    # 'hourlyDryBulbTemperature',
    # 'hourlyPrecipitation',
    # 'hourlyPressureChange',
    # 'hourlyPressureTendency',
    # 'hourlyRelativeHumidity',
    # 'hourlySeaLevelPressure',
    # 'hourlyStationPressure',
    # 'hourlyVisibility',
    # 'hourlyWetBulbTemperature',
    # 'hourlyWindDirection',
    # 'hourlyWindGustSpeed',
    # 'hourlyWindSpeed',
    #'daysToNearestHoliday',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    #'arrDel15PrevFIndex',
    #'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    #'originTypePrevFIndex',
    'distancePrevF',
    #'daysToNearestHoliday'
]

len(features_columns)


12

In [0]:
# GBT Hyper-parameters
gbt_max_iterations = 5
max_decision_tree_depth = 5

runGBT(maxIter=gbt_max_iterations, maxTreeDepth=max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Number of features used in Training:  12


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6792342198810487
Recall:  0.7551041101518591
F-beta Score:  0.7386038251176077


Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7075958046575412
Recall:  0.7520609851084669
F-beta Score:  0.7427264287242947
Time to complete: 51.4596107006073 seconds.


distance 0.01
crsDepTimeSine 0.19
monthSine 0.05
dayOfMonthIndex 0.04
originIndex 0.18
destPageRank 0.00
timeBtwFlightsPlanned 0.09
depDel15PrevFIndex 0.23
depDelayGroupPrevFIndex 0.07
depTimeBlkPrevFIndex 0.01
arrTimeBlkPrevFIndex 0.03
distancePrevF 0.10


### 4. Full (All include features and experimental features from data dict) 

In [0]:
features_columns = [
    'distance',
    'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    'dayOfWeekSine',
    'dayOfMonthIndex',
    'originIndex',
    'originTypeIndex',
    'originPageRank',
    'destPageRank',
    'opCarrierFlNumBinIndex1',
    'hourlyAltimeterSetting',
    'hourlyDewPointTemperature',
    'hourlyDryBulbTemperature',
    'hourlyPrecipitation',
    'hourlyPressureChange',
    'hourlyPressureTendency',
    'hourlyRelativeHumidity',
    'hourlySeaLevelPressure',
    'hourlyStationPressure',
    'hourlyVisibility',
    'hourlyWetBulbTemperature',
    'hourlyWindDirection',
    'hourlyWindGustSpeed',
    'hourlyWindSpeed',
    'daysToNearestHoliday',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    'arrDel15PrevFIndex',
    'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    'originTypePrevFIndex',
    'distancePrevF',
    'daysToNearestHoliday'
]

len(features_columns)

36

In [0]:
# GBT Hyper-parameters
gbt_max_iterations = 5
max_decision_tree_depth = 10

runGBT(maxIter=gbt_max_iterations, maxTreeDepth=max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Number of features used in Training:  36


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.702308349823226
Recall:  0.7865913077372831
F-beta Score:  0.7681543016296247


Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7159430051627835
Recall:  0.7705230930585867
F-beta Score:  0.7589513255413485
Time to complete: 71.14635181427002 seconds.


distance 0.02
quarterIndex 0.00
crsDepTimeSine 0.09
monthSine 0.04
dayOfWeekSine 0.00
dayOfMonthIndex 0.08
originIndex 0.19
originTypeIndex 0.00
originPageRank 0.00
destPageRank 0.01
opCarrierFlNumBinIndex1 0.00
hourlyAltimeterSetting 0.00
hourlyDewPointTemperature 0.01
hourlyDryBulbTemperature 0.02
hourlyPrecipitation 0.02
hourlyPressureChange 0.00
hourlyPressureTendency 0.00
hourlyRelativeHumidity 0.01
hourlySeaLevelPressure 0.01
hourlyStationPressure 0.00
hourlyVisibility 0.01
hourlyWetBulbTemperature 0.01
hourlyWindDirection 0.00
hourlyWindGustSpeed 0.00
hourlyWindSpeed 0.00
daysToNearestHoliday 0.00
t


The 0.0 importance features are:
- quarterIndex 0.00
- dayOfWeekSine 0.00
- originTypeIndex 0.00
- originPageRank 0.00
- opCarrierFlNumBinIndex1 0.00
- hourlyAltimeterSetting 0.00
- hourlyPressureChange 0.00
- hourlyPressureTendency 0.00
- hourlyStationPressure 0.00
- hourlyWindDirection 0.00
- hourlyWindGustSpeed 0.00
- hourlyWindSpeed 0.00
- daysToNearestHoliday 0.00
- arrDel15PrevFIndex 0.00
- arrDelayGroupPrevFIndex 0.00
- originTypePrevFIndex 0.00
- daysToNearestHoliday 0.00  

The top 3 important features are:
- originIndex 0.19
- depDel15PrevFIndex 0.16
- timeBtwFlightsPlanned 0.10



# Results and Discussions

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}/PRD/xval/"))

### Cross Validation - Find Best Hyperparameters

In [0]:
# 3 cross validation datasets
df_xval1_train = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval1_train_processed/")
df_xval1_test = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval1_test_processed/")

df_xval2_train = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval2_train_processed/")
df_xval2_test = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval2_test_processed/")

df_xval3_train = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval3_train_processed/")
df_xval3_test = spark.read.parquet(f"{team_blob_url}/PRD/xval/df_xval3_test_processed/")



In [0]:
label = 'depDel15'

features_columns = [
    'distance',
    'quarterIndex',
    'crsDepTimeSine',
    'monthSine',
    'dayOfWeekSine',
    'dayOfMonthIndex',
    'originIndex',
    'originTypeIndex',
    'originPageRank',
    'destPageRank',
    'opCarrierFlNumBinIndex1',
    'hourlyAltimeterSetting',
    'hourlyDewPointTemperature',
    'hourlyDryBulbTemperature',
    'hourlyPrecipitation',
    'hourlyPressureChange',
    'hourlyPressureTendency',
    'hourlyRelativeHumidity',
    'hourlySeaLevelPressure',
    'hourlyStationPressure',
    'hourlyVisibility',
    'hourlyWetBulbTemperature',
    'hourlyWindDirection',
    'hourlyWindGustSpeed',
    'hourlyWindSpeed',
    'daysToNearestHoliday',
    'timeBtwFlightsPlanned',
    'depDel15PrevFIndex',
    'depDelayGroupPrevFIndex',
    'depTimeBlkPrevFIndex',
    'arrDel15PrevFIndex',
    'arrDelayGroupPrevFIndex',
    'arrTimeBlkPrevFIndex',
    'originTypePrevFIndex',
    'distancePrevF',
    'daysToNearestHoliday'
]

len(features_columns)

36

In [0]:
def crossValidateGBT(maxIter, maxTreeDepth):

    # define pipeline - a sequence of transformations to be applied to the data

    # StringIndexer that will convert categorical string labels into numerical indices
    gb_label = StringIndexer(inputCol=label, outputCol='label')
    # VectorAssembler that will combine multiple feature columns into a single vector column
    gb_features = VectorAssembler(inputCols=features_columns, outputCol="features")

    # Chain indexers and GBT in a Pipeline

    # specifies a sequence of transformations to be applied to the data
    pipeline = Pipeline(stages=[gb_label, gb_features])

    # Train a GBT model.
    start = time.time()
    
    total_train_precision = 0
    total_train_recall = 0
    total_train_fbeta = 0

    total_test_precision = 0
    total_test_recall = 0
    total_test_fbeta = 0

    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=maxIter, maxBins=350, maxDepth=maxTreeDepth)

    for i in range(3):
        
        if i == 0:
            trainingData = df_xval1_train
            validationData = df_xval1_test
        elif i == 1:
            trainingData = df_xval2_train
            validationData = df_xval2_test
        elif i == 2:
            trainingData = df_xval3_train
            validationData = df_xval3_test

        # processes the data through each stage of the pipeline, fitting the indexer to learn how to index the categorical features and preparing the assembler to combine features
        model = pipeline.fit(trainingData)

        # applies the fitted pipeline model to the input data to transform it according to the defined stages.
        transformed_training_data = model.transform(trainingData)
        transformed_validation_data = model.transform(validationData)

        
        gbt_model = gbt.fit(transformed_training_data)

        # Training dataset prediction evaluation results
        predictions = gbt_model.transform(transformed_training_data)
        result = predictions.select("label","prediction")
        print("Evaluation Result - Training Data")
        precision, recall, fbeta = evaluation_metrics(result)

        weight = 0.0
        if i == 0:
            weight = 0.2
        elif i == 1:
            weight = 0.3
        elif i == 2:
            weight = 0.5

        total_train_precision += weight * precision
        total_train_recall += weight * recall
        total_train_fbeta += weight * fbeta

        # Test the GBT model
        predictions = gbt_model.transform(transformed_validation_data)
        result = predictions.select("label","prediction")
        print("Evaluation Result - Testing Data")
        precision, recall, fbeta =evaluation_metrics(result)
        total_test_precision += weight * precision
        total_test_recall += weight * recall
        total_test_fbeta += weight * fbeta
        
        print("\n")


    print("Overall Training result:")
    print("Weighted Train Precision: ", total_train_precision)
    print("Weighted Train Recall: ", total_train_recall)
    print("Weighted Train F-beta: ", total_train_fbeta)
    print("Overall Testing result:")
    print("Weighted Test Precision: ", total_test_precision)
    print("Weighted Test Recall: ", total_test_recall)
    print("Weighted Test F-beta: ", total_test_fbeta)


    print(f'Time to complete: {time.time() - start} seconds.')
    print("\n")

    #feature_importances = gbt_model.featureImportances

    #for feature, importance in zip(features_columns, feature_importances):
    #    print(feature, "{:.2f}".format(importance))

    return

In [0]:
# Hyper-parameters
gbt_max_iterations = 10
max_decision_tree_depth = 5

crossValidateGBT(gbt_max_iterations, max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6864776559961768
Recall:  0.7503637018076306
F-beta Score:  0.7366526009722948
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7072700572917744
Recall:  0.7572613005235046
F-beta Score:  0.7467055758243812




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6857050312149192
Recall:  0.7731833323160268
F-beta Score:  0.7539465013019215
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6352416285658496
Recall:  0.7511250125243408
F-beta Score:  0.7246850100156266




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6932422108171409
Recall:  0.7691222080577796
F-beta Score:  0.7526457848999716
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6877060418634411
Recall:  0.7730498397128531
F-beta Score:  0.7543275445587702


Overall Training result:
Weighted Train Precision:  0.6896281459722815
Weighted Train Recall:  0.766588844085224
Weighted Train F-beta:  0.7498373630350212
Overall Testing result:
Weighted Test Precision:  0.6758795209598303
Weighted Test Recall:  0.7633146837184297
Weighted Test F-beta:  0.7439103904489492
Time to complete: 351.68534231185913 seconds.




In [0]:
gbt_max_iterations = 10
max_decision_tree_depth = 10

crossValidateGBT(gbt_max_iterations, max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.7061556476880716
Recall:  0.782983066685516
F-beta Score:  0.7663086920880231
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.706994176557116
Recall:  0.776974657418972
F-beta Score:  0.7618917742788672




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.7079043646922368
Recall:  0.799428840740826
F-beta Score:  0.7792783662627634
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6298551243069218
Recall:  0.7670385488318602
F-beta Score:  0.7350208138177701




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.7122056060294744
Recall:  0.8060669686978926
F-beta Score:  0.7853663301963
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6895347615258741
Recall:  0.7865475244540647
F-beta Score:  0.7650208966400815


Overall Training result:
Weighted Train Precision:  0.7097052419600225
Weighted Train Recall:  0.7994587499082972
Weighted Train F-beta:  0.7797284133945837
Overall Testing result:
Weighted Test Precision:  0.6751227533664368
Weighted Test Recall:  0.7787802583603848
Weighted Test F-beta:  0.7553950473211453
Time to complete: 477.05323791503906 seconds.




In [0]:
gbt_max_iterations = 5
max_decision_tree_depth = 5

crossValidateGBT(gbt_max_iterations, max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.680097172692057
Recall:  0.7449475276799384
F-beta Score:  0.7310065682543556
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.7039796782387807
Recall:  0.754860797392221
F-beta Score:  0.7441045571749026




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6778271080431317
Recall:  0.7803458375318075
F-beta Score:  0.7574340433946658
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6261369127135265
Recall:  0.7677311993308735
F-beta Score:  0.7345108179208387




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6891186841862008
Recall:  0.7663240519255544
F-beta Score:  0.7495293571996728
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6854733119600597
Recall:  0.7674363646327096
F-beta Score:  0.7495123067289631


Overall Training result:
Weighted Train Precision:  0.6839269090444513
Weighted Train Recall:  0.7662552827583071
Weighted Train F-beta:  0.7481962052691074
Overall Testing result:
Weighted Test Precision:  0.6713736654418438
Weighted Test Recall:  0.7650097015940611
Weighted Test F-beta:  0.7439303101757136
Time to complete: 352.7487714290619 seconds.




In [0]:
gbt_max_iterations = 5
max_decision_tree_depth = 10

crossValidateGBT(gbt_max_iterations, max_decision_tree_depth)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6978459872264244
Recall:  0.7726610539782611
F-beta Score:  0.7564416492616676
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.706881154761035
Recall:  0.7712745697253152
F-beta Score:  0.757474132186398




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6978465986645193
Recall:  0.7960206970184246
F-beta Score:  0.7742365470171182
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6282552315903034
Recall:  0.7661411525878555
F-beta Score:  0.7339255816903784




Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.7031351651861227
Recall:  0.8013629320564551
F-beta Score:  0.7795814735180001
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6877960221808802
Recall:  0.7863214784776831
F-beta Score:  0.7644211080072317


Overall Training result:
Weighted Train Precision:  0.7004907596377021
Weighted Train Recall:  0.7940198859294072
Weighted Train F-beta:  0.7733500307164691
Overall Testing result:
Weighted Test Precision:  0.6737508115197381
Weighted Test Recall:  0.7772579989602613
Weighted Test F-beta:  0.753883054948009
Time to complete: 399.21109986305237 seconds.




Based on the cross validation results above, the best hyperparameters are: max iterations = 10 and max decision tree depth = 10

### 4 year Model Training

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}/PRD/xval_corrected/"))

In [0]:
# Best HyperParameters
gbt_max_iterations = 10
max_decision_tree_depth = 10

### Final 2019 Testing

In [0]:
# final testing data
trainingData = spark.read.parquet(f'{team_blob_url}/PRD/xval_corrected/df_xval1-3_train_processed/')
testingData = spark.read.parquet(f'{team_blob_url}/PRD/xval_corrected/df_final_val_processed/')

print((trainingData.count(), len(trainingData.columns)))
print((testingData.count(), len(testingData.columns)))


(8068150, 148)
(2487029, 148)


In [0]:
def testGBT(maxIter, maxTreeDepth):

    # define pipeline - a sequence of transformations to be applied to the data

    # StringIndexer that will convert categorical string labels into numerical indices
    gb_label_test = StringIndexer(inputCol=label, outputCol='label', handleInvalid="keep")
    # VectorAssembler that will combine multiple feature columns into a single vector column
    gb_features_test = VectorAssembler(inputCols=features_columns, outputCol="features")

    # Chain indexers and GBT in a Pipeline

    # specifies a sequence of transformations to be applied to the data
    pipeline_test = Pipeline(stages=[gb_label_test, gb_features_test])

    # Train a GBT model.
    start = time.time()
    


    gbt_test = GBTClassifier(labelCol="label", featuresCol="features", maxIter=maxIter, maxBins=360, maxDepth=maxTreeDepth)

    

    # processes the data through each stage of the pipeline, fitting the indexer to learn how to index the categorical features and preparing the assembler to combine features
    model_test = pipeline_test.fit(trainingData)

    # applies the fitted pipeline model to the input data to transform it according to the defined stages.
    transformed_training_data = model_test.transform(trainingData)
    transformed_testing_data = model_test.transform(testingData)

        
    gbt_model_test = gbt_test.fit(transformed_training_data)

    # Training dataset prediction evaluation results
    predictions_test = gbt_model_test.transform(transformed_training_data)
    result = predictions_test.select("label","prediction")
    print("Evaluation Result - Training Data")
    precision, recall, fbeta = evaluation_metrics(result)
    
    

    # Test the GBT model
    predictions = gbt_model_test.transform(transformed_testing_data)
    result = predictions.select("label","prediction")
    print("Evaluation Result - Testing Data")
    precision, recall, fbeta =evaluation_metrics(result)

    print(f'Time to complete: {time.time() - start} seconds.')

    feature_importances = gbt_model_test.featureImportances

    for feature, importance in zip(features_columns, feature_importances):
        print(feature, "{:.2f}".format(importance))

    print("\n")


In [0]:
testGBT(maxIter=gbt_max_iterations, maxTreeDepth=max_decision_tree_depth)


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Result - Training Data
Baseline performance on the validation set
Precision:  0.6981400469792437
Recall:  0.7919810747331582
F-beta Score:  0.7421057210241908
Evaluation Result - Testing Data
Baseline performance on the validation set
Precision:  0.6784064871192708
Recall:  0.8059523789333003
F-beta Score:  0.7366996414170306
Time to complete: 452.068806886673 seconds.
distance 0.02
quarterIndex 0.01
crsDepTimeSine 0.08
monthSine 0.02
dayOfWeekSine 0.01
dayOfMonthIndex 0.04
originIndex 0.17
originTypeIndex 0.00
originPageRank 0.00
destPageRank 0.05
opCarrierFlNumBinIndex1 0.00
hourlyAltimeterSetting 0.00
hourlyDewPointTemperature 0.01
hourlyDryBulbTemperature 0.02
hourlyPrecipitation 0.02
hourlyPressureChange 0.00
hourlyPressureTendency 0.00
hourlyRelativeHumidity 0.01
hourlySeaLevelPressure 0.01
hourlyStationPressure 0.00
hourlyVisibility 0.01
hourlyWetBulbTemperature 0.01
hourlyWindDirection 0.00
hourlyWindGustSpeed 0.01
hourlyWindSpeed 0.00
daysToNearestHoliday 0.00
timeB