#Objective

#Notebook Initialization

##Import Packages

In [0]:
from pyspark.sql.functions import col,isnan,when,count,lit, to_date,lpad,date_format,rpad,regexp_replace,concat,to_utc_timestamp,to_timestamp, countDistinct,unix_timestamp, row_number, when
from pyspark.sql.types import IntegerType,BooleanType,DateType,StringType,TimestampType
from pyspark.sql import DataFrameNaFunctions
from pyspark import StorageLevel
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pytz import timezone
import datetime
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler, PCA, VectorSlicer, Imputer
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as f

##Cloud Storage Parameters

In [0]:
# Azure Blob Storage coordinates and Databricks secret names for the project data.
blob_container = "tm30container" # The name of your container created in https://portal.azure.com
storage_account = "w261tm30" # The name of your Storage account created in https://portal.azure.com
secret_scope = "w261tm30" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "tm30key" # The name of the secret key created in your local computer using the Databricks CLI
# wasbs:// URL of the container root, used to build the dataset path below
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

# Load the checkpointed parquet dataset that the rest of the notebook filters and models.
# NOTE(review): `spark` is not defined in this file -- presumably the session injected by the notebook runtime.
test_pq = spark.read.parquet(f"{blob_url}/2022-03-24_data_chkpt_PQ_full")

##Define Functions

In [0]:
def data_pull(df, time_window = 'full', date_col='FLIGHT_UTC_DATE'):
    """Return ``df`` restricted to the requested time window.

    Parameters:
        df:          DataFrame holding the processed flight records.
        time_window: '2016', '2017', '2018' or '2019' keeps the rows whose
                     ``date_col`` falls in that year; '6m' / '3m' keep the
                     rows dated before 2015-07-01 / 2015-04-01; any other
                     value (including the default 'full') applies no filter.
        date_col:    name of the column holding the flight date.
    Returns: the (possibly filtered) DataFrame. Its row count is printed
             before returning.
    """
    calendar_years = ('2019', '2018', '2017', '2016')

    # Exclusive upper bounds of the partial-year windows (these target 2015 data)
    upper_bounds = {
        '6m': "2015-07-01T00:00:00.000",
        '3m': "2015-04-01T00:00:00.000",
    }

    if time_window in calendar_years:
        df = df.filter(f.year(col(date_col)) == int(time_window))
    elif time_window in upper_bounds:
        df = df.filter(col(date_col) < upper_bounds[time_window])

    # Counting runs a full pass over the data; drop the report if it takes too long
    row_total = df.count()
    print(f'{row_total:,} total records imported for the {time_window} dataset')
    return df

In [0]:
def pre_pipeline(index_cols, cont_cols, cat_cols, pred_cols):
    ''' Build the (unfitted) pre-processing pipeline used before crossfold validation and model training.

        Stages, in order:
            1. StringIndexer:   every column in cat_cols -> "<col>_idx"
            2. OneHotEncoder:   "<col>_idx" -> "<col>_OHE"
            3. VectorAssembler: all "<col>_OHE" vectors -> "cat_features"
            4. StringIndexer:   target column -> "label"

        Parameters:
            index_cols: accepted for interface compatibility; not used by this function
            cont_cols:  accepted for interface compatibility; not used by this function
            cat_cols:   list of categorical column names to index and one-hot encode
            pred_cols:  list whose first entry names the target column;
                        'DEP_DEL15' is used when the list is empty or None
        Returns: a pyspark.ml Pipeline (not yet fitted)
    '''
    # Use the caller-supplied target instead of a hard-coded name, keeping the old default
    label_col = pred_cols[0] if pred_cols else 'DEP_DEL15'

    idx_cols = [c + "_idx" for c in cat_cols]
    ohe_cols = [c + "_OHE" for c in cat_cols]

    #Convert categorical strings to numeric indices ("keep" puts invalid values in an extra bucket)
    indexer = StringIndexer(inputCols=cat_cols, outputCols=idx_cols).setHandleInvalid("keep")

    #One-hot encode the indexed categorical columns
    encoder = OneHotEncoder(inputCols=idx_cols, outputCols=ohe_cols)

    #Vector assembler for categorical
    assembler_cat = VectorAssembler(inputCols=ohe_cols, outputCol="cat_features")

    #Index the target. The default ordering is by descending frequency, which would
    #tie the 0/1 mapping to class balance; alphabetical ordering keeps the smaller
    #target value at label 0 regardless of how many rows each class has.
    assembler_lab = StringIndexer(inputCol=label_col, outputCol="label", stringOrderType="alphabetAsc")

    return Pipeline(stages=[indexer, encoder, assembler_cat, assembler_lab])

In [0]:
def scaled_pipeline(model, cont_cols, param_grid):
    ''' Build the imputing + scaling + classifier pipeline used to train models.

        Parameters:
            model:      lr = Logistic Regression;
                        rf = Random Forest
                        dt = Decision Trees
            cont_cols:  continuous feature columns; imputed in place, assembled and standardized
            param_grid: dict of hyper-parameters. Values are read POSITIONALLY
                        (insertion order), the key names are ignored:
                            lr: maxIter, regParam, elasticNetParam
                            rf: numTrees, maxDepth, impurity[, featureSubsetStrategy]
                            dt: [maxDepth[, maxBins[, impurity]]]
                        For dt an empty dict or None keeps the classifier defaults.
        Returns: an unfitted pyspark.ml Pipeline that expects "cat_features" and
                 "label" columns to already exist on the input DataFrame
        Raises:  ValueError if model is not 'lr', 'rf' or 'dt'
    '''
    # Fail fast with a clear message instead of an UnboundLocalError further down
    if model not in ('lr', 'rf', 'dt'):
        raise ValueError("Unsupported model {!r}; expected 'lr', 'rf' or 'dt'".format(model))

    params = list(param_grid.values()) if param_grid else []

    #Ensure continuous variables have values
    imputer = Imputer(inputCols=cont_cols, outputCols=cont_cols)

    #Assemble cont variables
    assembler_num = VectorAssembler(inputCols=cont_cols, outputCol="scale_nums")

    #Scale the values
    scaler = StandardScaler(inputCol="scale_nums", outputCol="scaledFeatures", withStd=True, withMean=True)

    #Vector assembler combined
    assembler = VectorAssembler(inputCols=["scaledFeatures", "cat_features"], outputCol="features")

    #Models for the pipeline
    if model == 'lr':
        max_iter, reg_param, ela = params[:3]

        class_model = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter = max_iter, regParam = reg_param, elasticNetParam = ela, threshold = 0.6)

    elif model == 'rf':
        num_trees, depth, criterion = params[:3]

        # The grid search supplies a fourth value (feature subset strategy); unpacking
        # exactly three values used to raise ValueError and the setting was never applied
        rf_extra = {'featureSubsetStrategy': params[3]} if len(params) > 3 else {}

        class_model = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees = num_trees, maxDepth = depth, impurity = criterion, **rf_extra)

    else:
        # Previously the grid was ignored for decision trees, so every grid point
        # trained the same default model; apply whatever values were supplied
        dt_extra = dict(zip(('maxDepth', 'maxBins', 'impurity'), params))

        class_model = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', **dt_extra)

    return Pipeline(stages=[imputer, assembler_num, scaler, assembler, class_model])

In [0]:
def Custom_CV(transform_df, model, cont_cols, param_grid, sample, kfolds):
    ''' Expanding-window cross-validation over the "rank" column.

        The rank range is cut into kfolds + 1 equal slices. Fold i trains on
        every row with rank <= (i + 1) / (kfolds + 1) and tests on the next slice.

        Parameters:
            transform_df: pre-processed DataFrame with a "rank" column
            model:        model key forwarded to scaled_pipeline
            cont_cols:    continuous feature columns forwarded to scaled_pipeline
            param_grid:   hyper-parameters forwarded to scaled_pipeline
            sample:       'down' downsamples each training slice via downsample();
                          any other value trains on the slice as-is
            kfolds:       number of train/test folds
        Returns: list with one F0.5 score (for label 1) per fold
    '''
    n_slices = kfolds + 1

    f_5_score_list = []

    scaled_pipelines = scaled_pipeline(model, cont_cols, param_grid)

    # The evaluator does not depend on the fold, so build it once
    evaluatorf_5 = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', metricLabel=1, beta=0.5)

    for fold in range(kfolds):

        # Derive each boundary by division rather than repeated addition: an
        # accumulated float can stop just short of 1.0 and silently drop the
        # rows with rank 1.0 from the last test slice
        train_end = (fold + 1) / n_slices
        test_end = (fold + 2) / n_slices

        train_cached = transform_df.where(f"rank <= {train_end}").cache()
        test_cached = transform_df.where(f"rank > {train_end} and rank <= {test_end}").cache()

        try:
            train_df = downsample(train_cached) if sample == 'down' else train_cached

            #Generate model
            fitted_model = scaled_pipelines.fit(train_df)
            predict = fitted_model.transform(test_cached)

            #Calculate evaluation metrics
            f_5_score_list.append(evaluatorf_5.evaluate(predict))
        finally:
            # Release the fold's cached slices; otherwise every fold of every
            # grid point stays cached until evicted
            train_cached.unpersist()
            test_cached.unpersist()

    return f_5_score_list

In [0]:
def Custom_GridSearch(df_rank, pre_pipeline, cont_cols, class_model, sample, kfolds):
    ''' Score every combination of a fixed hyper-parameter grid with Custom_CV.

        Parameters:
            df_rank:      DataFrame carrying the "rank" column that Custom_CV splits on
            pre_pipeline: unfitted pre-processing Pipeline; it is fitted on df_rank here
            cont_cols:    continuous feature column names, forwarded to Custom_CV
            class_model:  'lr', 'dt' or 'rf' - selects which grid is searched
            sample:       forwarded to Custom_CV ('down' enables downsampling)
            kfolds:       number of folds, forwarded to Custom_CV
        Returns: a pandas DataFrame with one row per parameter combination. The first
                 column is the mean 'F_0.5 Score'; the remaining columns hold the
                 parameter values in grid order.
        Raises:  ValueError if class_model is not 'lr', 'dt' or 'rf'
    '''
    from itertools import product

    # Per model: (param_grid key, candidate values, results-table column).
    # Order matters: scaled_pipeline reads param_grid values positionally.
    # (Logistic-regression thresholds were tested in a separate function.)
    search_spaces = {
        'lr': [('maxIter', [10, 100], 'Max Iterations'),
               ('regParam', [0.01, 0.5, 2], 'Regulation Parameter'),
               ('elasticNetParam', [0, 0.5, 1], 'Elastic Net')],
        'dt': [('max_depth', [5, 10, 20], 'Max Depth'),
               ('max_bins', [6168], 'Max Bins'),
               ('info_type', ['gini'], 'Info Type')],
        'rf': [('num_trees', [10, 20, 50], 'Tree Number'),
               ('depth', [3, 5, 10], 'Depth'),
               ('criterion', ['gini'], 'Info Type'),
               ('substrategy', ['sqrt', 'all'], 'Feature Substrategy')],
    }

    # Reject unknown models before the expensive fit/transform below
    if class_model not in search_spaces:
        raise ValueError("Unsupported class_model {!r}; expected one of {}".format(class_model, sorted(search_spaces)))

    space = search_spaces[class_model]
    grid_keys = [key for key, _, _ in space]
    grid_values = [values for _, values, _ in space]
    result_columns = ['F_0.5 Score'] + [column for _, _, column in space]

    #Preprocess Pipeline
    fit_df = pre_pipeline.fit(df_rank)
    transform_df = fit_df.transform(df_rank).persist(StorageLevel.MEMORY_AND_DISK)

    result_rows = []
    try:
        # product() varies the last parameter fastest, the same order as nested loops
        for combo in product(*grid_values):
            param_grid = dict(zip(grid_keys, combo))

            f_5_score_list = Custom_CV(transform_df, class_model, cont_cols, param_grid, sample, kfolds)
            f_score_avg = np.mean(f_5_score_list)

            result_rows.append((f_score_avg,) + combo)

            print('F Score: {:3f}\nParam Grid: {}'.format(f_score_avg, param_grid.items()))
    finally:
        # Always release the persisted frame, even when a fold fails
        transform_df.unpersist()

    Eval_df = pd.DataFrame(result_rows, columns=result_columns)

    return Eval_df

In [0]:
def downsample(train_df):
    '''Downsample the on-time rows (label 0) so they roughly match the number of delayed rows (label 1).

    Parameters:
        train_df: DataFrame with a "label" column holding 0 (on time) / 1 (delayed)
    Returns: the delayed rows unioned with a random sample (seed 2022, without
             replacement) of the on-time rows. When there are no on-time rows, or
             the delayed rows already match or outnumber them, all label-0 and
             label-1 rows are returned without sampling.
    '''
    train_delay = train_df.filter(col('label') == 1)
    train_on_time = train_df.filter(col('label') == 0)

    delay_count = train_delay.count()
    on_time_count = train_on_time.count()

    # Nothing to shrink: avoids a division by zero and a sampling fraction above 1,
    # which sampling without replacement does not accept
    if on_time_count == 0 or delay_count >= on_time_count:
        return train_delay.union(train_on_time)

    keep_fraction = delay_count / on_time_count

    train_downsampled = train_delay.union(
        train_on_time.sample(withReplacement=False, fraction=keep_fraction, seed=2022))

    return train_downsampled

In [0]:
def ml_model(df_train, df_test, model, sample):
    ''' Grid-search a model on df_train, then score the best parameters on df_test.

        Parameters:
            df_train: DataFrame used for the grid search
            df_test:  DataFrame used for the final score
            model:    model key forwarded to Custom_GridSearch / Custom_CV
            sample:   sampling option forwarded to Custom_GridSearch / Custom_CV
        Returns: (grid-search results table, mean F0.5 score from the final run)

        NOTE(review): the final score comes from Custom_CV run on the transformed
        df_test alone, so the pipelines are refit on slices of df_test rather than
        trained on df_train -- confirm this is the intended "unseen data" evaluation.
    '''
    n_folds = 5

    #Identifier columns, categorical features, continuous features and target
    index_cols = ['UNIQUE_ID','FLIGHT_UTC_DATE', 'rank']

    cat_cols = ['TIME_OF_DAY', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'wnd_type', 'cig_ceil_is_qual',
                'tmp_air_is_qual',  'slp_prs_is_qual', 'ga1_cov','ga1_cld', 'ga1_bs_ht_is_qual', 'wnd_spd_is_qual',
                'ga1_cld_qual', 'dew_pnt_is_qual', 'ga1_cov_is_qual', 'aa1_is_qual', 'vis_dist_is_qual', 'ka1_temp', 'FLIGHT_ROUTE']

    cont_cols = ['ELEVATION', 'wnd_dir_angle', 'wnd_spd_rate', 'cig_ceil_ht', 'vis_dist', 'tmp_air', 'dew_pnt_tmp',
                 'slp_prs', 'aa1_prd_quant_hr', 'aa1_dp', 'ga1_bs_ht']

    pred_cols = ['DEP_DEL15']

    #Percent rank by flight date drives the custom time-ordered folds
    by_flight_date = Window.partitionBy().orderBy("FLIGHT_UTC_DATE")
    date_rank = percent_rank().over(by_flight_date)
    ranked_train = df_train.withColumn("rank", date_rank)
    ranked_test = df_test.withColumn("rank", date_rank)

    #Unfitted pre-processing pipeline
    pre_pipe_ml = pre_pipeline(index_cols, cont_cols, cat_cols, pred_cols)

    #Grid search on the training frame
    Eval = Custom_GridSearch(ranked_train, pre_pipe_ml, cont_cols, model, sample, n_folds)

    #Best row first; everything after the score column is the parameter set
    best_first = Eval.sort_values('F_0.5 Score', axis = 0, ascending = False)
    param_grid = best_first.head(1).iloc[0, 1:].to_dict()

    #Fit the pre-processing on the training frame and apply it to the test frame
    fitted_pre = pre_pipe_ml.fit(ranked_train)
    transform_test = fitted_pre.transform(ranked_test)

    #Score the best parameters with the custom folds
    fold_scores = Custom_CV(transform_test, model, cont_cols, param_grid, sample, n_folds)

    f_score_avg = np.mean(fold_scores)
    print('F Score on Unseen Data: {:3f}'.format(f_score_avg))

    return Eval, f_score_avg
    

#Pipeline Initialization

In [0]:
#Last minute data curation...
# - empty strings in three categorical weather columns become nulls (one replace call over all three)
# - wnd_dir_angle is cast to integer
# - ka1_temp is bucketed into '0' (null), '-1' (negative) or '1' (otherwise); every branch
#   now yields a string so the result type does not depend on implicit int/string coercion
# - FLIGHT_ROUTE is built as "<ORIGIN>-<DEST>"
test_pq = test_pq.na.replace('', None, ['wnd_type', 'ga1_cld', 'ga1_cov'])\
            .withColumn('wnd_dir_angle',col('wnd_dir_angle').cast(IntegerType()))\
            .withColumn('ka1_temp', when(f.isnull('ka1_temp'), '0').when(f.col('ka1_temp') < 0, '-1').otherwise('1'))\
            .withColumn('FLIGHT_ROUTE', concat(col('ORIGIN'),lit("-"),col('DEST')))

# Everything dated before 2019
df_2015_2018 = test_pq.filter(col('FLIGHT_UTC_DATE') < "2019-01-01T00:00:00.000")

# Rows dated before 2015-07-01 (see data_pull's '6m' window)
df_6m = data_pull(test_pq, time_window='6m', date_col='FLIGHT_UTC_DATE')

# Calendar year 2019
df_2019 = data_pull(test_pq, time_window='2019', date_col='FLIGHT_UTC_DATE')

In [0]:
# Sanity check: (row count, column count) of the pre-2019 frame
(df_2015_2018.count(), len(df_2015_2018.columns))

##Test Pipeline Function

In [0]:
# Rows dated before 2015-02-01. Despite the name, this frame is passed to ml_model
# as its df_train argument in the call further down.
df_test = df_6m.filter(col('FLIGHT_UTC_DATE') < "2015-02-01T00:00:00.000")
df_test.count()

In [0]:
# Rows from 2015-02-01 (inclusive) up to 2015-04-01 (exclusive). The lower bound is
# inclusive so rows stamped exactly 2015-02-01, which df_test excludes with "<",
# are not dropped from both frames.
df_validate = df_6m.filter((col('FLIGHT_UTC_DATE') >= "2015-02-01T00:00:00.000") &\
                          (col('FLIGHT_UTC_DATE') < "2015-04-01T00:00:00.000"))

df_validate.count()

In [0]:
# Smoke-test the full workflow: logistic regression ('lr') with downsampling ('down'),
# grid-searching on df_test and scoring on df_validate.
Eval, f_score_df = ml_model(df_test, df_validate, 'lr', 'down')

In [0]:
# Display the grid-search results table returned by ml_model
Eval

Unnamed: 0,F_0.5 Score,Max Iterations,Regulation Parameter,Elastic Net
0,0.277473,1,0.0,0.0
1,0.277473,1,0.0,0.5
2,0.277473,1,0.0,1.0
3,0.277162,1,0.5,0.0
4,0.0,1,0.5,0.5
5,0.0,1,0.5,1.0
6,0.274141,1,10.0,0.0
7,0.0,1,10.0,0.5
8,0.0,1,10.0,1.0
9,0.30323,5,0.0,0.0


In [0]:
# Display the mean F0.5 score returned by ml_model
f_score_df

In [0]:
# Take the highest-scoring row of the results table and keep every column after the
# score as a {column name: value} dict (the same expression ml_model uses internally).
param_grid = Eval.sort_values('F_0.5 Score', axis = 0, ascending = False).head(1).iloc[0,1:].to_dict()

param_grid

#Create Baseline Models

In [0]:
#Modify to switch from test to scaled up model
# (currently the full pre-2019 frame)
df_model = df_2015_2018

In [0]:
#Add rank to allow for custom crossvalidation and windowing
# rank = percent rank of each row when ordered by FLIGHT_UTC_DATE (single, unpartitioned window)
train_test_window = df_model.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("FLIGHT_UTC_DATE")))

In [0]:
#Initialize Pre-processing Pipeline
# The column lists are defined here because pre_pipeline needs them at notebook scope
# (they were otherwise only declared inside ml_model); they mirror ml_model's lists.
index_cols = ['UNIQUE_ID','FLIGHT_UTC_DATE', 'rank']

cat_cols = ['TIME_OF_DAY', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'wnd_type', 'cig_ceil_is_qual',
            'tmp_air_is_qual',  'slp_prs_is_qual', 'ga1_cov','ga1_cld', 'ga1_bs_ht_is_qual', 'wnd_spd_is_qual',
            'ga1_cld_qual', 'dew_pnt_is_qual', 'ga1_cov_is_qual', 'aa1_is_qual', 'vis_dist_is_qual', 'ka1_temp', 'FLIGHT_ROUTE']

cont_cols = ['ELEVATION', 'wnd_dir_angle', 'wnd_spd_rate', 'cig_ceil_ht', 'vis_dist', 'tmp_air', 'dew_pnt_tmp',
             'slp_prs', 'aa1_prd_quant_hr', 'aa1_dp', 'ga1_bs_ht']

pred_cols = ['DEP_DEL15']

pre_pipe = pre_pipeline(index_cols, cont_cols, cat_cols, pred_cols)

##Logistic Regression Pipeline

In [0]:
# NOTE(review): `custom_CV` (lower-case "c") is not defined in this file. The similarly
# named Custom_CV takes six arguments and returns a single list of fold scores, so it
# does not match this call or the three-value unpacking -- presumably this cell is left
# over from an earlier version of the notebook; confirm before running.
Eval_lr, metric_lr, best_model_lr = custom_CV(train_test_window, pre_pipe, 'lr', 'down', 5)

In [0]:
# Display the logistic-regression results table
Eval_lr

Unnamed: 0,F_0.5 Score,Max Iterations,Regulation Parameter,Elastic Net
0,0.002436,1,0.0,0.0
1,0.002436,1,0.0,0.5
2,0.002436,1,0.0,1.0
3,0.002437,1,0.5,0.0
4,0.094082,1,0.5,0.5
5,0.094082,1,0.5,1.0
6,0.002436,1,10.0,0.0
7,0.094082,1,10.0,0.5
8,0.094082,1,10.0,1.0
9,0.31352,5,0.0,0.0


In [0]:
# Display the selected logistic-regression model
best_model_lr

##Decision Tree Pipeline

In [0]:
# NOTE(review): neither `custom_CV` nor `dt_model` is defined in this file -- presumably
# left over from an earlier version of the notebook; confirm before running.
Eval_dt, metric_dt, best_model_dt = custom_CV(train_test_window, pre_pipe, dt_model, 'down', 5)

In [0]:
# Display the decision-tree results table
Eval_dt

Unnamed: 0,K-Fold,F_0.5 Score,Recall,Precision
0,1,0.266504,0.657183,0.232021
1,2,0.311281,0.653145,0.275262
2,3,0.317414,0.67554,0.280269
3,4,0.301232,0.659018,0.265233
4,5,0.321497,0.689491,0.28365


##Random Forest Pipeline

In [0]:
# NOTE(review): neither `custom_CV` nor `rf_model` is defined in this file -- presumably
# left over from an earlier version of the notebook; confirm before running.
Eval_rf, metric_rf, best_model_rf = custom_CV(train_test_window, pre_pipe, rf_model, 'down', 5)

In [0]:
# Display the random-forest results table
Eval_rf

Unnamed: 0,K-Fold,F_0.5 Score,Recall,Precision
0,1,0.262586,0.617124,0.229608
1,2,0.304598,0.652695,0.268764
2,3,0.313122,0.642113,0.277569
3,4,0.302325,0.594569,0.26924
4,5,0.319489,0.69461,0.281485


##Test on unseen data

In [0]:
# Fit the pre-processing stages on the modelling frame and apply them to the 2019 data.
# The fitted model gets its own name: binding it to `pre_pipeline` would overwrite the
# pre_pipeline() function, breaking any later call to it (including via ml_model).
pre_pipe_model = pre_pipe.fit(df_model)
transform_2019 = pre_pipe_model.transform(df_2019)

In [0]:
# Predict on the pre-processed 2019 data with the selected logistic-regression model
lr_pred_2019 = best_model_lr.transform(transform_2019)

In [0]:
#Calculate F0.5, recall and precision for label 1 on the logistic-regression 2019 predictions
lr_evaluatorf_5 = MulticlassClassificationEvaluator(
    metricName='fMeasureByLabel', metricLabel=1, beta=0.5)
lr_evaluator_recall = MulticlassClassificationEvaluator(
    metricName='recallByLabel', metricLabel=1)
lr_evaluator_precision = MulticlassClassificationEvaluator(
    metricName='precisionByLabel', metricLabel=1)

lr_f_5 = lr_evaluatorf_5.evaluate(lr_pred_2019)
lr_recall = lr_evaluator_recall.evaluate(lr_pred_2019)
lr_precision = lr_evaluator_precision.evaluate(lr_pred_2019)

print('F Score: {:3f}\nRecall: {:3f}\nPrecision  {:3f}'.format(lr_f_5, lr_recall, lr_precision))

In [0]:
# Predict on the pre-processed 2019 data with the selected decision-tree model
dt_pred_2019 = best_model_dt.transform(transform_2019)

In [0]:
#Calculate F0.5, recall and precision for label 1 on the decision-tree 2019 predictions
dt_evaluatorf_5 = MulticlassClassificationEvaluator(
    metricName='fMeasureByLabel', metricLabel=1, beta=0.5)
dt_evaluator_recall = MulticlassClassificationEvaluator(
    metricName='recallByLabel', metricLabel=1)
dt_evaluator_precision = MulticlassClassificationEvaluator(
    metricName='precisionByLabel', metricLabel=1)

dt_f_5 = dt_evaluatorf_5.evaluate(dt_pred_2019)
dt_recall = dt_evaluator_recall.evaluate(dt_pred_2019)
dt_precision = dt_evaluator_precision.evaluate(dt_pred_2019)

print('F Score: {:3f}\nRecall: {:3f}\nPrecision  {:3f}'.format(dt_f_5, dt_recall, dt_precision))

In [0]:
# Predict on the pre-processed 2019 data with the selected random-forest model
rf_pred_2019 = best_model_rf.transform(transform_2019)

In [0]:
#Calculate F0.5, recall and precision for label 1 on the random-forest 2019 predictions
rf_evaluatorf_5 = MulticlassClassificationEvaluator(
    metricName='fMeasureByLabel', metricLabel=1, beta=0.5)
rf_evaluator_recall = MulticlassClassificationEvaluator(
    metricName='recallByLabel', metricLabel=1)
rf_evaluator_precision = MulticlassClassificationEvaluator(
    metricName='precisionByLabel', metricLabel=1)

rf_f_5 = rf_evaluatorf_5.evaluate(rf_pred_2019)
rf_recall = rf_evaluator_recall.evaluate(rf_pred_2019)
rf_precision = rf_evaluator_precision.evaluate(rf_pred_2019)

print('F Score: {:3f}\nRecall: {:3f}\nPrecision  {:3f}'.format(rf_f_5, rf_recall, rf_precision))

#References

 - https://machinelearningmastery.com/k-fold-cross-validation/
 - https://www.analyticsvidhya.com/blog/2019/11/build-machine-learning-pipelines-pyspark/
 - https://medium.com/@junwan01/oversampling-and-undersampling-with-pyspark-5dbc25cdf253