# Objective

In [0]:
# Imported here as well as in the "Import Packages" cell: this summary cell sits
# above the imports, so on a fresh kernel (Restart & Run All) `pd` would
# otherwise be undefined when it runs.
import pandas as pd

# Summary of every model evaluated in this notebook, one list entry per model.
# The metric values are hard-coded literals; nothing in this cell recomputes
# them, so they must be updated by hand if the models below are re-run.
model_name_list = [
    'Baseline Logistic Regression',
    'Baseline Decision Tree',
    'Tuned Logistic Regression',
    'Tuned Decision Tree',
    'Tuned Random Forest',
]
balanced_imbalanced = ['Imbalanced', 'Imbalanced', 'Balanced', 'Balanced', 'Balanced']
parameters_list = [
    'maxIter=100, regParam=0.0, elasticNetParam=0.0, threshold=0.5',
    'maxDepth=5, maxBins=32,impurity=gini',
    'maxIter=100, regParam=0.01, elasticNetParam=0, threshold=0.6',
    'maxDepth=10, maxBins=6168,impurity=gini',
    'numTrees=10, maxDepth=5, impurity=gini, featureSubsetStrategy=all',
]
CV_f_5_score_list = [.19676, .13511, 0.326934, 0.324010, .304559]
test_2019_f_5_score_list = [.194581, .142804, .37662, .340830, .31712]
recall_2019_list = [.053607, .036411, .40484, .562892, .66261]
precision_2019_list = [.568012, .529867, .37016, .310233, .36269]

model_performance = {
    'Model': model_name_list,
    'Target Distribution': balanced_imbalanced,
    'Parameters': parameters_list,
    'Cross Validation F0.5 Score': CV_f_5_score_list,
    '2019 Test F0.5 Score': test_2019_f_5_score_list,
    '2019 Test Recall': recall_2019_list,
    '2019 Test Precision': precision_2019_list,
}
# The dict's insertion order already determines the column order.
all_models = pd.DataFrame(model_performance)
all_models

Unnamed: 0,Model,Target Distribution,Parameters,Cross Validation F0.5 Score,2019 Test F0.5 Score,2019 Test Recall,2019 Test Precision
0,Baseline Logistic Regression,Imbalanced,"maxIter=100, regParam=0.0, elasticNetParam=0.0, threshold=0.5",0.19676,0.194581,0.053607,0.568012
1,Baseline Decision Tree,Imbalanced,"maxDepth=5, maxBins=32,impurity=gini",0.13511,0.142804,0.036411,0.529867
2,Tuned Logistic Regression,Balanced,"maxIter=100, regParam=0.01, elasticNetParam=0, threshold=0.6",0.326934,0.37662,0.40484,0.37016
3,Tuned Decision Tree,Balanced,"maxDepth=10, maxBins=6168,impurity=gini",0.32401,0.34083,0.562892,0.310233
4,Tuned Random Forest,Balanced,"numTrees=10, maxDepth=5, impurity=gini, featureSubsetStrategy=all",0.304559,0.31712,0.66261,0.36269


# Notebook Initialization

## Import Packages

In [0]:
from pyspark.sql.functions import col,isnan,when,count,lit, to_date,lpad,date_format,rpad,regexp_replace,concat,to_utc_timestamp,to_timestamp, countDistinct,unix_timestamp, row_number, when
from pyspark.sql.types import IntegerType,BooleanType,DateType,StringType,TimestampType
from pyspark.sql import DataFrameNaFunctions
from pyspark import StorageLevel
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pytz import timezone
import datetime
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler, PCA, VectorSlicer, Imputer
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as f
# Do not truncate long cell contents when pandas frames are displayed
# (e.g. the long hyper-parameter strings in the model summary table).
pd.set_option('display.max_colwidth', None)

## Cloud Storage Parameters

In [0]:
# Azure Blob Storage location of the checkpointed dataset.
# secret_scope / secret_key hold the *names* of the Databricks secret scope and key,
# not the secret value itself.
blob_container = "tm30container" # The name of your container created in https://portal.azure.com
storage_account = "w261tm30" # The name of your Storage account created in https://portal.azure.com
secret_scope = "w261tm30" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "tm30key" # The name of the secret key created in your local computer using the Databricks CLI 
# wasbs:// URL of the container root; dataset paths are appended to it.
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

# Load the processed flight/weather checkpoint as a Spark DataFrame.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably it is the
# session provided by the Databricks runtime — confirm before running elsewhere.
test_pq = spark.read.parquet(f"{blob_url}/2022-03-24_data_chkpt_PQ_full")

## Define Functions

In [0]:
def data_pull(df, time_window = 'full', date_col='FLIGHT_UTC_DATE'):
    """Return the rows of ``df`` that fall inside ``time_window``.

    Parameters:
        df:          Spark DataFrame containing the date/timestamp column ``date_col``.
        time_window: 'full'  -> no filtering (default);
                     a four-digit year such as '2019' (or 2019) -> rows whose
                     year equals that value;
                     '6m' / '3m' -> rows dated before 2015-07-01 / 2015-04-01
                     (the first six / three months, assuming the data starts
                     in January 2015 as the original comment indicates).
        date_col:    name of the column the filter is applied to.

    Returns: the filtered DataFrame (``df`` itself for 'full').

    Raises:
        ValueError: if ``time_window`` is not one of the values above. The
            previous version silently returned the unfiltered frame in that
            case while printing it as the requested window.
    """
    window = str(time_window)
    # Upper (exclusive) date bounds for the partial-2015 windows
    cutoffs = {
        '6m': "2015-07-01T00:00:00.000",
        '3m': "2015-04-01T00:00:00.000",
    }
    is_year = len(window) == 4 and window.isdecimal()

    if not (window == 'full' or is_year or window in cutoffs):
        raise ValueError(
            f"Unknown time_window {time_window!r}: expected 'full', a four-digit year, "
            f"or one of {sorted(cutoffs)}"
        )

    if is_year:
        df = df.filter(f.year(col(date_col)) == int(window))
    elif window in cutoffs:
        df = df.filter(col(date_col) < cutoffs[window])

    # df.count() triggers a Spark job over the filtered data;
    # comment this print out if it takes too long.
    print(f'{df.count():,} total records imported for the {time_window} dataset')
    return df

In [0]:
def pre_pipeline(index_cols, cont_cols, cat_cols, pred_cols):
    ''' Build the (unfitted) pre-processing pipeline shared by cross-validation and model training.

        Only cat_cols is read; index_cols, cont_cols and pred_cols are accepted
        but not used by this function.

        Stages, in order:
            1. StringIndexer: each column c in cat_cols -> c + "_idx"
               (handleInvalid="keep")
            2. OneHotEncoder: c + "_idx" -> c + "_OHE"
            3. VectorAssembler: all "_OHE" columns -> "cat_features"
            4. StringIndexer: 'DEP_DEL15' -> "label"

        NOTE(review): the label indexer uses StringIndexer's default ordering, which
        Spark documents as descending frequency, so which class becomes label 0 / 1
        depends on the class frequencies of the data it is fitted on — confirm this
        is intended.

        Returns: a pyspark.ml Pipeline
    '''
    idx_names = [name + "_idx" for name in cat_cols]
    ohe_names = [name + "_OHE" for name in cat_cols]

    stages = [
        # category strings -> numeric indices
        StringIndexer(inputCols=cat_cols, outputCols=idx_names).setHandleInvalid("keep"),
        # numeric indices -> one-hot vectors
        OneHotEncoder(inputCols=idx_names, outputCols=ohe_names),
        # all one-hot vectors -> a single categorical feature vector
        VectorAssembler(inputCols=ohe_names, outputCol="cat_features"),
        # target column -> "label"
        StringIndexer(inputCol='DEP_DEL15', outputCol="label"),
    ]

    return Pipeline(stages=stages)

In [0]:
def scaled_pipeline(model, param_grid):
    ''' This function creates the impute -> scale -> assemble -> classifier pipeline used to train models.

        It expects its input DataFrame to already contain the "cat_features" and
        "label" columns, and it reads the notebook-level list `cont_cols`
        (continuous feature names), which must be defined before it is called.

        Parameters:
            model:      lr = Logistic Regression;
                        rf = Random Forest
                        dt = Decision Trees
            param_grid: dict of hyper-parameters. Only the ORDER of the values
                        matters (the keys are ignored):
                        lr -> maxIter, regParam, elasticNetParam, threshold
                        rf -> numTrees, maxDepth, impurity, featureSubsetStrategy
                        dt -> maxDepth, maxBins, impurity
        Returns: an unfitted pyspark.ml Pipeline
        Raises:  ValueError if `model` is not one of the codes above or
                 `param_grid` has the wrong number of entries.
    '''
    # Number of hyper-parameters each supported model code expects
    expected_params = {'lr': 4, 'rf': 4, 'dt': 3}

    # Validate first: previously an unknown code built all the stages and then
    # failed with an UnboundLocalError on `class_model`.
    if model not in expected_params:
        raise ValueError(f"Unknown model {model!r}: expected one of {sorted(expected_params)}")
    if len(param_grid) != expected_params[model]:
        raise ValueError(
            f"Model {model!r} expects {expected_params[model]} parameters, got {len(param_grid)}"
        )
    values = list(param_grid.values())

    #Ensure continuous variables have values (imputed in place, same column names)
    imputer = Imputer(inputCols=cont_cols, outputCols=cont_cols)

    #Assemble cont variables
    assembler_num = VectorAssembler(inputCols=cont_cols, outputCol="scale_nums")

    #Scale the values
    scaler = StandardScaler(inputCol="scale_nums", outputCol="scaledFeatures", withStd=True, withMean=True)

    #Vector assembler combined
    assembler = VectorAssembler(inputCols=["scaledFeatures", "cat_features"], outputCol="features")

    #Models for the pipeline
    if model == 'lr':
        max_iter, reg_param, ela, threshold = values
        class_model = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter = max_iter, regParam = reg_param, elasticNetParam = ela, threshold = threshold)

    elif model == 'rf':
        num_trees, depth, criterion, features = values
        class_model = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees = num_trees, maxDepth = depth, impurity = criterion, featureSubsetStrategy = features)

    else:  # 'dt' (guaranteed by the validation above)
        md, mb, info = values
        class_model = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = md, maxBins = mb, impurity = info)

    return Pipeline(stages=[imputer, assembler_num, scaler, assembler, class_model])

In [0]:
def custom_test(df_train, df_test, pre_pipeline, class_model, sample):
    ''' Fit a tuned model on df_train and evaluate it on df_test.

        Parameters:
            df_train:     DataFrame the pre-processing pipeline and the model are fitted on
            df_test:      DataFrame the fitted model is evaluated on
            pre_pipeline: unfitted pre-processing Pipeline (fitted on df_train only)
            class_model:  'lr', 'dt' or 'rf'; selects the hard-coded tuned parameters below
            sample:       'down' downsamples the training data via downsample();
                          any other value trains on the full transformed training data
        Returns: (predictions DataFrame, F0.5, recall, precision), the three metrics
                 being computed for label 1
        Raises:  ValueError if class_model is not one of the supported codes
    '''
    # Tuned hyper-parameters per model. scaled_pipeline() consumes the values
    # positionally, so the order of entries matters, not the key names.
    tuned_params = {
        'lr': {'maxIter':100, 'regParam':.01, 'elasticNetParam':0, 'threshold': .6},
        'dt': {'max_depth':10, 'max_bins': 6168, 'info_type':'gini'},
        'rf': {'num_trees':10, 'depth': 5, 'criterion':'gini', 'substrategy':'all'},
    }
    # Fail before any Spark work; previously an unknown code surfaced later as
    # an UnboundLocalError on `scaled_pipelines`.
    if class_model not in tuned_params:
        raise ValueError(f"Unknown class_model {class_model!r}: expected one of {sorted(tuned_params)}")

    scaled_pipelines = scaled_pipeline(class_model, tuned_params[class_model])

    #Preprocess pipeline: fitted on the training data only, then applied to both sets
    fit_train_df = pre_pipeline.fit(df_train)
    transform_train_df = fit_train_df.transform(df_train).persist(StorageLevel.MEMORY_AND_DISK)

    # try/finally so the persisted frame is released even if fitting/evaluation fails
    try:
        transform_test_df = fit_train_df.transform(df_test)

        # Previously `train_df` was only bound when sample == 'down', so any
        # other value crashed with a NameError at fit time.
        if sample == 'down':
            train_df = downsample(transform_train_df)
        else:
            train_df = transform_train_df

        #Generate model
        model = scaled_pipelines.fit(train_df)
        predict = model.transform(transform_test_df)

        #Calculate evaluation metrics (all for label 1)
        evaluatorf_5 = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', metricLabel=1, beta=0.5)
        f_5 = evaluatorf_5.evaluate(predict)
        evaluator_recall = MulticlassClassificationEvaluator(metricName='recallByLabel', metricLabel=1)
        recall = evaluator_recall.evaluate(predict)
        evaluator_precision = MulticlassClassificationEvaluator(metricName='precisionByLabel', metricLabel=1)
        precision = evaluator_precision.evaluate(predict)
    finally:
        transform_train_df.unpersist()

    return predict, f_5, recall, precision

In [0]:
def downsample(train_df):
    '''Balance the classes by randomly downsampling the on-time rows (label 0).

    All delayed rows (label 1) are kept; on-time rows are sampled without
    replacement with fraction delay_count / on_time_count (fixed seed 2022),
    so the two classes end up approximately, not exactly, the same size.

    If there are no on-time rows, or the delayed rows are not outnumbered,
    nothing is sampled and the label-0 and label-1 rows are returned as-is.
    (Previously these cases raised a ZeroDivisionError or produced a sampling
    fraction above 1, which Spark rejects for sampling without replacement.)

    Parameters:
        train_df: DataFrame with a numeric "label" column.
    Returns: DataFrame made of the label-1 rows followed by the (sampled) label-0 rows.
    '''
    train_delay = train_df.filter(col('label') == 1)
    train_on_time = train_df.filter(col('label') == 0)

    # Each count() is a separate Spark job
    delay_count = train_delay.count()
    on_time_count = train_on_time.count()

    # Nothing to downsample: the on-time class is empty or already not the majority
    if on_time_count == 0 or delay_count >= on_time_count:
        return train_delay.union(train_on_time)

    keep_fraction = delay_count / on_time_count
    train_on_time_sampled = train_on_time.sample(withReplacement=False, fraction = keep_fraction, seed= 2022)

    return train_delay.union(train_on_time_sampled)

# Pipeline Initialization

In [0]:
# Last-minute data curation:
#   - empty strings in wnd_type / ga1_cld / ga1_cov become nulls
#   - wnd_dir_angle is cast to integer
#   - ka1_temp is bucketed: null -> '0', negative -> -1, anything else -> '1'
#   - FLIGHT_ROUTE is built as "<ORIGIN>-<DEST>"
# NOTE(review): this cell rebinds `test_pq` to its own output, so it does not look safe
# to run twice on the same kernel: on a second pass the '0' written for missing
# ka1_temp is no longer null and would fall through to the otherwise('1') branch.
# Re-run the cell that loads `test_pq` before re-running this one.
test_pq = (
    test_pq
    .na.replace('', None, 'wnd_type')
    .na.replace('', None, 'ga1_cld')
    .na.replace('', None, 'ga1_cov')
    .withColumn('wnd_dir_angle', col('wnd_dir_angle').cast(IntegerType()))
    .withColumn(
        'ka1_temp',
        when(f.isnull('ka1_temp'), '0')
        .when(f.col('ka1_temp') < 0, -1)
        .otherwise('1'),
    )
    .withColumn('FLIGHT_ROUTE', concat(col('ORIGIN'), lit("-"), col('DEST')))
)

# Everything before 2019 (training years)
df_2015_2018 = test_pq.filter(col('FLIGHT_UTC_DATE') < "2019-01-01T00:00:00.000")

# First six months of 2015; data_pull also prints the row count
df_6m = data_pull(test_pq, '6m', 'FLIGHT_UTC_DATE')

# 2019 only (held-out test year); data_pull also prints the row count
df_2019 = data_pull(test_pq, '2019', 'FLIGHT_UTC_DATE')

# January 2015 only, for quick smoke tests
df_small_test = test_pq.filter(col('FLIGHT_UTC_DATE') < "2015-02-01T00:00:00.000")

In [0]:
# (row count, column count) of the pre-2019 frame; count() runs a Spark job
(df_2015_2018.count(), len(df_2015_2018.columns))

In [0]:
# (row count, column count) of the 2019 frame; count() runs a Spark job
(df_2019.count(), len(df_2019.columns))

# Create Baseline Models

In [0]:
#select columns
# Identifier / bookkeeping columns ('rank' is added to train_test_window below)
index_cols = ['UNIQUE_ID','FLIGHT_UTC_DATE', 'rank']
# Categorical features: string-indexed and one-hot encoded by pre_pipeline
cat_cols = ['TIME_OF_DAY', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'wnd_type', 'cig_ceil_is_qual', 'tmp_air_is_qual',  'slp_prs_is_qual', 'ga1_cov','ga1_cld', 'ga1_bs_ht_is_qual', 'wnd_spd_is_qual', 'ga1_cld_qual', 'dew_pnt_is_qual', 'ga1_cov_is_qual', 'aa1_is_qual', 'vis_dist_is_qual', 'ka1_temp', 'FLIGHT_ROUTE']
# Continuous features: imputed and standardised by scaled_pipeline, which reads this global
cont_cols = ['ELEVATION', 'wnd_dir_angle', 'wnd_spd_rate', 'cig_ceil_ht', 'vis_dist', 'tmp_air', 'dew_pnt_tmp','slp_prs', 'aa1_prd_quant_hr', 'aa1_dp', 'ga1_bs_ht']
# Target column
pred_cols = ['DEP_DEL15']

#Initialize Pre-processing Pipeline
pre_pipe = pre_pipeline(index_cols, cont_cols, cat_cols, pred_cols)

#Add rank to allow for custom cross-validation and windowing.
# This definition was commented out, but the threshold_CV call in the
# "Logistic Regression Test" section uses train_test_window, so on a fresh
# kernel that call failed with a NameError. withColumn is lazy: nothing is
# computed until an action runs on the frame.
train_test_window = df_2015_2018.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("FLIGHT_UTC_DATE")))

## Logistic Regression Test

In [0]:
def threshold_CV(df_rank, pre_pipeline, class_model, sample, kfolds):
    ''' Expanding-window cross-validation of the logistic-regression decision threshold.

        The "rank" column of df_rank is cut into kfolds + 1 equal-width slices.
        Fold i trains on every row with rank up to the end of slice i and tests
        on slice i + 1, so the training window grows with each fold. For every
        candidate threshold the F0.5 score for label 1 is averaged over the folds.

        Parameters:
            df_rank:      DataFrame with a "rank" column in [0, 1] plus the columns
                          required by pre_pipeline and scaled_pipeline
            pre_pipeline: unfitted pre-processing Pipeline (fitted on all of df_rank)
            class_model:  must be 'lr' (the only model this search supports)
            sample:       'down' downsamples each training fold via downsample();
                          any other value trains on the fold as-is
            kfolds:       number of folds (>= 1)
        Returns: pandas DataFrame with columns 'F_0.5 Score' and 'Threshhold',
                 one row per candidate threshold
        Raises:  ValueError if class_model is not 'lr' or kfolds < 1
    '''
    # Validate before any Spark work. Previously a non-'lr' model ran the whole
    # pre-processing step and then failed with an UnboundLocalError on `Eval_df`.
    if class_model != 'lr':
        raise ValueError(f"threshold_CV only supports class_model='lr', got {class_model!r}")
    if kfolds < 1:
        raise ValueError(f"kfolds must be at least 1, got {kfolds!r}")

    #Logistic Model Lists
    threshold = [.2, .3, .4, .6, .7, .8, .9]

    #Create evaluation metric lists
    f_5_score_list_CV_average = []
    threshold_list = []

    # Width of one time slice on the rank axis
    fold_width = 1.0/(kfolds + 1)

    evaluatorf_5 = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', metricLabel=1, beta=0.5)

    #Preprocess pipeline
    fit_df = pre_pipeline.fit(df_rank)
    transform_df = fit_df.transform(df_rank).persist(StorageLevel.MEMORY_AND_DISK)

    # try/finally so the persisted frame is released even if a fold fails
    try:
        for thresh in threshold:
            f_5_score_list = []
            param_grid = {'maxIter':10, 'regParam': 0, 'elasticNetParam':1, 'threshold':thresh}
            scaled_pipelines = scaled_pipeline('lr', param_grid)

            for fold in range(kfolds):
                # Boundaries come from multiplication rather than a running sum,
                # which drifted (six additions of 1/6 give 0.9999999999999999);
                # the last fold is closed at exactly 1.0 so rank == 1.0 is kept.
                train_upper = (fold + 1) * fold_width
                test_upper = 1.0 if fold == kfolds - 1 else (fold + 2) * fold_width

                fold_train = transform_df.where(f"rank <= {train_upper}").cache()
                fold_test = transform_df.where(f"rank > {train_upper} and rank <= {test_upper}").cache()

                try:
                    train_df = downsample(fold_train) if sample == 'down' else fold_train

                    #Generate model
                    model = scaled_pipelines.fit(train_df)
                    predict = model.transform(fold_test)

                    #Calculate evaluation metrics
                    f_5_score_list.append(evaluatorf_5.evaluate(predict))
                finally:
                    # Release the per-fold caches; they were previously never
                    # unpersisted and piled up across thresholds and folds.
                    fold_train.unpersist()
                    fold_test.unpersist()

            f_score_avg = np.mean(f_5_score_list)

            f_5_score_list_CV_average.append(f_score_avg)
            threshold_list.append(thresh)

            print('F Score: {:3f}\nParam Grid: {}'.format(f_score_avg, param_grid.items()))
    finally:
        transform_df.unpersist()

    Eval_df = pd.DataFrame({
        'F_0.5 Score': f_5_score_list_CV_average,
        'Threshhold': threshold_list,
    })

    return Eval_df

In [0]:
# 5-fold threshold search for logistic regression with downsampled training folds.
# Requires `train_test_window` (a frame with a "rank" column) to exist in the kernel.
threshold_df = threshold_CV(train_test_window, pre_pipe, 'lr', 'down', 5)

In [0]:
threshold_df

Unnamed: 0,F_0.5 Score,Threshhold
0,0.235965,0.2
1,0.256654,0.3
2,0.287723,0.4
3,0.353857,0.6
4,0.328073,0.7
5,0.163428,0.8
6,0.029157,0.9


In [0]:
# Tuned logistic regression: fit on 2015-2018 (downsampled), evaluate on 2019.
# NOTE: f_5 / recall / precision are reused by the decision-tree and random-forest
# cells below, so the display cells show whichever model was run most recently.
lr_df, f_5, recall, precision = custom_test(df_2015_2018, df_2019, pre_pipe, 'lr', 'down')

In [0]:
f_5

In [0]:
recall

In [0]:
precision

In [0]:
# Confusion-matrix counts: one row per (label, prediction) combination
lr_df.groupBy('label', 'prediction').count().show()
# Alternative, one count() per confusion-matrix cell (left disabled):
# TN = lr_df.filter('prediction = 0 AND label = prediction').count()
# TP = lr_df.filter('prediction = 1 AND label = prediction').count()
# FN = lr_df.filter('prediction = 0 AND label <> prediction').count()
# FP = lr_df.filter('prediction = 1 AND label <> prediction').count()

|  | Predicted Delay | Predicted On-Time |
| ----| ----| ----|
| Label Delay   |702,626 | **1,032,905** |
| Label On-Time | **1,195,502** | 5,608,809 |

## Decision Tree

In [0]:
# Tuned decision tree: fit on 2015-2018 (downsampled), evaluate on 2019.
# Overwrites the f_5 / recall / precision values from the previous model run.
dt_df, f_5, recall, precision = custom_test(df_2015_2018, df_2019, pre_pipe, 'dt', 'down')

In [0]:
f_5

In [0]:
recall

In [0]:
precision

In [0]:
# Confusion-matrix counts: one row per (label, prediction) combination
dt_df.groupBy('label', 'prediction').count().show()

## Random Forest

In [0]:
# Tuned random forest: fit on 2015-2018 (downsampled), evaluate on 2019.
# Overwrites the f_5 / recall / precision values from the previous model run.
rf_df, f_5, recall, precision = custom_test(df_2015_2018, df_2019, pre_pipe, 'rf', 'down')

In [0]:
f_5

In [0]:
recall

In [0]:
precision

In [0]:
# Confusion-matrix counts: one row per (label, prediction) combination
rf_df.groupBy('label', 'prediction').count().show()

# References

 - https://machinelearningmastery.com/k-fold-cross-validation/
 - https://www.analyticsvidhya.com/blog/2019/11/build-machine-learning-pipelines-pyspark/
 - https://medium.com/@junwan01/oversampling-and-undersampling-with-pyspark-5dbc25cdf253