In [0]:
# Release notes, first attempt at Lasso Regularization with Cross Validation

In [0]:
from itertools import chain
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, count, isnan, when, substring, min, log, lit, row_number, concat, to_timestamp, format_string
from pyspark.sql.types import FloatType
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer, OneHotEncoder, StringIndexer, VectorAssembler,\
    StandardScaler, Imputer
#from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.window import Window

In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.


# The following blob storage is accessible to team members only (read and write).
# The access key is valid only until its TTL expires;
# after that you will need to create a new SAS key and authenticate access again via the Databricks command line.
blob_container  = "final-project-summer24-team3"       # The name of your container created in https://portal.azure.com
storage_account = "summer2024team3"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "summer24_team_3_2_scope"           # The name of the scope created on your local computer using the Databricks CLI
secret_key      = "final-project-summer24-team3"             # The name of the secret key created on your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  # Points to the root of your team storage bucket

# SAS Token: grant the team limited access to Azure Storage resources.
# The token itself is read from the Databricks secret scope, so it never appears in the notebook.
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)
# Base directory under the team container that the cells below read from.
data_BASE_DIR = f"{team_blob_url}/TP"

In [0]:
# List the contents of the base data directory to see which datasets are available.
display(dbutils.fs.ls(f"{data_BASE_DIR}"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000


In [0]:
# Load the cleaned dataset (Parquet) from the base data directory.
# NOTE(review): the directory listing captured earlier in this notebook does not show
# "flight_stations_weather_all_cleaned3/" under TP — confirm the path still exists.
df = spark.read.parquet(f"{data_BASE_DIR}/flight_stations_weather_all_cleaned3/")

# Leftover from an earlier 1-year experiment (kept for reference, not executed):
# Drop the HourlyPrecipitation column (was causing issues with the saving as Parquet file)
# df_1y_cleaned = df_1y_cleaned.drop("HourlyPrecipitation")
# df_1y_cleaned = df_1y_cleaned.orderBy(col("FL_DATE").asc(), col("CRS_DEP_TIME").asc())
# display(df_1y_cleaned)

In [0]:
# Row count per YEAR; the train/validation/test split below is defined on this column.
df.groupBy("YEAR").count().show()

+----+-------+
|YEAR|  count|
+----+-------+
|2016|2741210|
|2015|2895703|
|2017|2808936|
|2018|3607369|
|2019|7270542|
+----+-------+



In [0]:
# Earlier month/quarter-based splits on the 3-month and 1-year datasets (kept for reference, not executed):
#df_train_3m = merged_3m.filter(col("MONTH").isin(1,2))
#df_test_3m = merged_3m.filter(col("MONTH") == 3)
#df_train_1y = merged_1y.filter(col('QUARTER').isin(1, 2, 3))
#df_test_1y = merged_1y.filter(col('QUARTER').isin(4))

# Split by calendar year so that validation and test data are strictly later than training data.
# Train on 2015-2017.
df_train = df.filter(col("YEAR").isin("2015", "2016", "2017"))
# Hold out 2018 for model blending
df_val = df.filter(col("YEAR").isin("2018"))
# 2019 is the final test set.
df_test = df.filter(col("YEAR").isin("2019"))

In [0]:
# Preview the first 10 rows of the loaded frame.
display(df.limit(10))

index,MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN_AIRPORT_ID,CRS_DEP_TIME,DISTANCE,DEP_DEL15,QUARTER,DEP_DELAY,DEST_AIRPORT_ID,FL_DATE,TAIL_NUM,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,type,ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed,HourlyPresentWeatherType,HOUR,MONTH_indexed,MONTH_vec,DAY_OF_WEEK_indexed,DAY_OF_WEEK_vec,OP_CARRIER_indexed,OP_CARRIER_vec,ORIGIN_AIRPORT_ID_indexed,ORIGIN_AIRPORT_ID_vec,HOUR_indexed,HOUR_vec,type_indexed,type_vec,HourlySkyConditions_vec,HourlyPresentWeatherType_vec,Rain,Snow,Thunder,Tornado,View Obstruction,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,HOUR_00,HOUR_01,HOUR_02,HOUR_03,HOUR_04,HOUR_05,HOUR_06,HOUR_07,HOUR_08,HOUR_09,HOUR_10,HOUR_11,HOUR_12,HOUR_13,HOUR_14,HOUR_15,HOUR_16,HOUR_17,HOUR_18,HOUR_19,HOUR_20,HOUR_21,HOUR_22,HOUR_23,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,OP_CARRIER_DL,OP_CARRIER_EV,OP_CARRIER_F9,OP_CARRIER_G4,OP_CARRIER_HA,OP_CARRIER_MQ,OP_CARRIER_NK,OP_CARRIER_OH,OP_CARRIER_OO,OP_CARRIER_QX,OP_CARRIER_UA,OP_CARRIER_US,OP_CARRIER_VX,OP_CARRIER_WN,OP_CARRIER_YV,OP_CARRIER_YX,type_large_airport,type_medium_airport,type_seaplane_base,type_small_airport,DEP_12hr,DEP_6hr,DEP_4hr,DISTANCE_LAG,REALIZED_DELAY_MIN,REALIZED_DEL15,HOLIDAY,OUTDEG_AIRPORT_6hr,YEAR
0,1,4,AS,14747,25,1448.0,0.0,1,-1.0,10299,2015-01-01,N527AS,259,-21.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 19, indices -> List(8), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 372, indices -> List(10), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
1,1,4,DL,14771,25,1589.0,0.0,1,-5.0,13487,2015-01-01,N3730B,610,8.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 372, indices -> List(7), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
2,1,4,US,12892,30,2125.0,0.0,1,14.0,11057,2015-01-01,N584UW,753,-10.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",17.0,"Map(vectorType -> sparse, length -> 19, indices -> List(17), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 372, indices -> List(4), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
3,1,4,DL,11292,30,1199.0,0.0,1,-6.0,10397,2015-01-01,N958DN,453,-30.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 372, indices -> List(3), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
4,1,4,AA,12889,35,2174.0,0.0,1,-8.0,13303,2015-01-01,N853AA,753,-10.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 19, indices -> List(2), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 372, indices -> List(8), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
5,1,4,DL,14869,40,1590.0,0.0,1,-6.0,10397,2015-01-01,N3751B,553,-22.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",17.0,"Map(vectorType -> sparse, length -> 372, indices -> List(17), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
6,1,4,DL,14747,40,1399.0,0.0,1,-1.0,13487,2015-01-01,N651DL,557,8.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 372, indices -> List(10), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
7,1,4,DL,10299,45,1448.0,0.0,1,-14.0,14747,2015-01-01,N3743H,451,-24.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",62.0,"Map(vectorType -> sparse, length -> 372, indices -> List(62), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
8,1,4,NK,12889,55,1139.0,1.0,1,25.0,13198,2015-01-01,N525NK,543,6.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 19, indices -> List(10), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 372, indices -> List(8), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))",,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015
9,1,4,NK,12889,103,1055.0,0.0,1,-1.0,11298,2015-01-01,N632NK,529,-1.0,0.0,0.0,large_airport,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,1,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 19, indices -> List(10), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 372, indices -> List(8), values -> List(1.0))",20.0,"Map(vectorType -> sparse, length -> 23, indices -> List(20), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(221), values -> List(1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,2,0,0,0,0.0,0.0,0.0,,,0,1,0.0,2015


In [0]:
# List every column name available in the loaded frame.
df.columns

['index',
 'MONTH',
 'DAY_OF_WEEK',
 'OP_CARRIER',
 'ORIGIN_AIRPORT_ID',
 'CRS_DEP_TIME',
 'DISTANCE',
 'DEP_DEL15',
 'QUARTER',
 'DEP_DELAY',
 'DEST_AIRPORT_ID',
 'FL_DATE',
 'TAIL_NUM',
 'ARR_TIME',
 'ARR_DELAY',
 'ARR_DEL15',
 'CANCELLED',
 'type',
 'ELEVATION',
 'HourlyAltimeterSetting',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyRelativeHumidity',
 'HourlySkyConditions',
 'HourlySeaLevelPressure',
 'HourlyVisibility',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'HourlyPresentWeatherType',
 'HOUR',
 'MONTH_indexed',
 'MONTH_vec',
 'DAY_OF_WEEK_indexed',
 'DAY_OF_WEEK_vec',
 'OP_CARRIER_indexed',
 'OP_CARRIER_vec',
 'ORIGIN_AIRPORT_ID_indexed',
 'ORIGIN_AIRPORT_ID_vec',
 'HOUR_indexed',
 'HOUR_vec',
 'type_indexed',
 'type_vec',
 'HourlySkyConditions_vec',
 'HourlyPresentWeatherType_vec',
 'Rain',
 'Snow',
 'Thunder',
 'Tornado',
 'View Obstruction',
 'MONTH_1',
 'MONTH_2',
 'MONTH_3',
 'MONTH_4',
 'MONTH_5',
 'MONTH_6',
 'MONTH_7',
 'MONTH_8',
 'MONTH_9',
 'MONTH

In [0]:
# Number of rows in the training split.
df_train.count()

8445849

In [0]:
# Number of rows in the validation split.
df_val.count()

3607369

In [0]:
# Number of rows in the test split.
df_test.count()

7270542

## Transform

In [0]:
# def create_row_id(data):
#     formatted_crs_dep_time = format_string("%04d", col("CRS_DEP_TIME"))
#     datetime_str = concat(col("FL_DATE"), lit(" "), formatted_crs_dep_time\
#         .substr(1, 2), lit(":"), formatted_crs_dep_time.substr(3, 2))
#     datetime_col = to_timestamp(datetime_str, 'yyyy-MM-dd HH:mm')
#     windowSpec = Window.orderBy("DATE_TIME")
#     return data.withColumn("DATE_TIME", datetime_col).withColumn("index", row_number().over(windowSpec) - 1)
# df_train = create_row_id(df_train)
# df_test = create_row_id(df_test)


#### DISTANCE_LAG

In [0]:
# Take a reproducible ~5% sample of the training split for quick experiments.
# Despite the name, this is the result of DataFrame.sample (a DataFrame), not an RDD.
sample_rdd = df_train.sample(fraction=0.05, seed=42)

In [0]:
# print(f"Original DataFrame row count: {df_train.count()}")
# print(f"Test DataFrame row count: {sample_rdd.count()}")

In [0]:
# Dry run on the sample: fit a mean Imputer for DISTANCE_LAG and fill the column in place
# (input and output column names are the same).
# NOTE(review): `imputer` and `imputer_model` are re-created on the full training split in
# the next cell, and the imputed sample is not used by the cells visible here.
imputer = Imputer(
    inputCols=['DISTANCE_LAG'],
    outputCols=['DISTANCE_LAG']
).setStrategy("mean")
imputer_model = imputer.fit(sample_rdd)
sample_rdd = imputer_model.transform(sample_rdd)

In [0]:
# Run on main df: fit the DISTANCE_LAG mean Imputer on the training split only, so the
# fill value is derived from training data and can then be applied to the other splits.
imputer = Imputer(
    inputCols=['DISTANCE_LAG'],
    outputCols=['DISTANCE_LAG']
).setStrategy("mean")

imputer_model = imputer.fit(df_train)


In [0]:

# Fill missing DISTANCE_LAG values in the training split with the fitted model.
df_train = imputer_model.transform(df_train)

# Show the resulting schema.
df_train

DataFrame[index: int, MONTH: int, DAY_OF_WEEK: int, OP_CARRIER: string, ORIGIN_AIRPORT_ID: int, CRS_DEP_TIME: int, DISTANCE: double, DEP_DEL15: double, QUARTER: int, DEP_DELAY: double, DEST_AIRPORT_ID: int, FL_DATE: string, TAIL_NUM: string, ARR_TIME: int, ARR_DELAY: double, ARR_DEL15: double, CANCELLED: double, type: string, ELEVATION: float, HourlyAltimeterSetting: float, HourlyDryBulbTemperature: float, HourlyPrecipitation: float, HourlyRelativeHumidity: float, HourlySkyConditions: string, HourlySeaLevelPressure: float, HourlyVisibility: float, HourlyWindDirection: float, HourlyWindSpeed: float, HourlyPresentWeatherType: string, HOUR: string, MONTH_indexed: double, MONTH_vec: vector, DAY_OF_WEEK_indexed: double, DAY_OF_WEEK_vec: vector, OP_CARRIER_indexed: double, OP_CARRIER_vec: vector, ORIGIN_AIRPORT_ID_indexed: double, ORIGIN_AIRPORT_ID_vec: vector, HOUR_indexed: double, HOUR_vec: vector, type_indexed: double, type_vec: vector, HourlySkyConditions_vec: vector, HourlyPresentWeathe

In [0]:
# Apply the imputer fitted on the training split to the validation split.
df_val = imputer_model.transform(df_val)

In [0]:
# Apply the imputer fitted on the training split to the test split.
df_test = imputer_model.transform(df_test)

#### REALIZED_DELAY_MIN

In [0]:
from pyspark.sql.functions import col, log, min, lit, when

def log_transform_delay(train_df, column_name="REALIZED_DELAY_MIN", floor_value=None):
    """Add a shifted natural-log version of a delay column.

    The new column is named ``LOG_<column_name>`` and is computed as
    ``log(value - floor_value + 1)`` for rows whose value is strictly greater
    than ``floor_value``. Every other row -- including rows where the value is
    NULL, because the ``when`` condition is not true for them -- receives ``0``.

    Args:
        train_df: Spark DataFrame containing ``column_name``.
        column_name: Name of the numeric column to transform.
        floor_value: Optional shift to use. When ``None`` (the default) it is
            derived from the minimum of ``column_name`` in ``train_df``:
            ``min - 1`` for a positive minimum, ``min * 1.1`` for a negative
            minimum and ``-1`` when the minimum is exactly zero. Pass the value
            returned from the training call to apply the identical transform to
            another split.

    Returns:
        Tuple ``(transformed_df, floor_value)``.

    Raises:
        ValueError: If ``floor_value`` is not supplied and ``column_name`` has
            no non-NULL values, so no minimum can be computed.
    """
    if floor_value is None:
        # `min` here is pyspark.sql.functions.min (imported at notebook level),
        # not the Python builtin.
        min_delay = train_df.agg(min(column_name)).collect()[0][0]
        if min_delay is None:
            # Previously this surfaced as an opaque TypeError from `None > 0`.
            raise ValueError(
                f"Cannot derive a floor value: column '{column_name}' has no non-null values"
            )

        # Choose a floor strictly below the smallest observed value so the
        # argument of log() stays positive for every observed row.
        if min_delay > 0:
            floor_value = min_delay - 1
        elif min_delay < 0:
            floor_value = min_delay * 1.1
        else:
            floor_value = -1

    shifted_value = col(column_name) - floor_value + 1
    transformed_df = train_df.withColumn(
        f"LOG_{column_name}",
        when(col(column_name) > floor_value, log(shifted_value)).otherwise(0),
    )

    return transformed_df, floor_value

# Apply the transformation to the training split and keep the floor value so the
# same shift can be applied to the validation and test splits.
df_train, floor_value = log_transform_delay(df_train)

# Display a sample of the transformed data
# (rows whose REALIZED_DELAY_MIN is NULL come out as 0.0 in the log column).
print("Sample of transformed training data:")
df_train.select("REALIZED_DELAY_MIN", "LOG_REALIZED_DELAY_MIN").show(5)

# print("\nSample of transformed test data:")
# df_test.select("REALIZED_DELAY_MIN", "LOG_REALIZED_DELAY_MIN").show(5)

print(f"\nFloor value used: {floor_value}")

Sample of transformed training data:
+------------------+----------------------+
|REALIZED_DELAY_MIN|LOG_REALIZED_DELAY_MIN|
+------------------+----------------------+
|              NULL|                   0.0|
|              NULL|                   0.0|
|              NULL|                   0.0|
|              NULL|                   0.0|
|              NULL|                   0.0|
+------------------+----------------------+
only showing top 5 rows


Floor value used: -251.90000000000003


In [0]:
# Apply the same shifted-log transform to the test split, reusing the floor
# value derived from the training split. Rows that are NULL or not above the
# floor get 0.
log_delay_test = when(
    col("REALIZED_DELAY_MIN") > floor_value,
    log(col("REALIZED_DELAY_MIN") - floor_value + 1),
).otherwise(0)
df_test = df_test.withColumn("LOG_REALIZED_DELAY_MIN", log_delay_test)

In [0]:
# Apply the same shifted-log transform to the validation split, reusing the
# floor value derived from the training split. Rows that are NULL or not above
# the floor get 0.
log_delay_val = when(
    col("REALIZED_DELAY_MIN") > floor_value,
    log(col("REALIZED_DELAY_MIN") - floor_value + 1),
).otherwise(0)
df_val = df_val.withColumn("LOG_REALIZED_DELAY_MIN", log_delay_val)

In [0]:
# Continuous inputs (16 columns). These are assembled and standardized before modelling.
numeric_cols = [
    'DISTANCE',
    'ELEVATION',
    'HourlyAltimeterSetting',
    'HourlyDryBulbTemperature',
    'HourlyPrecipitation',
    'HourlyRelativeHumidity',
    'HourlySeaLevelPressure',
    'HourlyVisibility',
    'HourlyWindDirection',
    'HourlyWindSpeed',
    'DISTANCE_LAG',
    # 'REALIZED_DELAY_MIN',    
    # The log-transformed column is used in place of the raw delay above.
    'LOG_REALIZED_DELAY_MIN',
    'OUTDEG_AIRPORT_6hr',
    'DEP_12hr', 'DEP_6hr', 'DEP_4hr'
]

# Pre-expanded indicator/flag columns (71 columns); these are passed to the model unscaled.
# Together with the 16 numeric columns this gives the 87-wide feature vector.
# NOTE(review): OP_CARRIER_QX, OP_CARRIER_US and OP_CARRIER_VX exist in the loaded frame
# but are not listed here — confirm the omission is intentional.
categorical_cols = [
    'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
    'HOUR_00', 'HOUR_01', 'HOUR_02', 'HOUR_03', 'HOUR_04', 'HOUR_05', 'HOUR_06', 'HOUR_07', 'HOUR_08', 'HOUR_09', 'HOUR_10', 'HOUR_11',
    'HOUR_12', 'HOUR_13', 'HOUR_14', 'HOUR_15', 'HOUR_16', 'HOUR_17', 'HOUR_18', 'HOUR_19', 'HOUR_20', 'HOUR_21', 'HOUR_22', 'HOUR_23',
    'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7',
    'OP_CARRIER_9E', 'OP_CARRIER_AA', 'OP_CARRIER_AS', 'OP_CARRIER_B6', 'OP_CARRIER_DL', 'OP_CARRIER_EV', 'OP_CARRIER_F9',
    'OP_CARRIER_G4', 'OP_CARRIER_HA', 'OP_CARRIER_MQ', 'OP_CARRIER_NK', 'OP_CARRIER_OH', 'OP_CARRIER_OO', 'OP_CARRIER_UA',
    'OP_CARRIER_WN', 'OP_CARRIER_YV', 'OP_CARRIER_YX',
    'Rain',
    'Snow',
    'Thunder',
    'Tornado',
    'View Obstruction',
    'REALIZED_DEL15',
    'HOLIDAY',
    'type_large_airport',
    'type_medium_airport',
    'type_seaplane_base',
    'type_small_airport'
 ]

# Columns that are not model inputs but must be carried along: the time keys and the
# label column DEP_DEL15. (The variable name contains a typo, "reqiured"; it is left
# unchanged because the name may be referenced elsewhere.)
other_reqiured_cols = ["YEAR", "FL_DATE", "CRS_DEP_TIME", "DEP_DEL15"]
# Full list of columns selected from each split.
feats = numeric_cols + categorical_cols + other_reqiured_cols

In [0]:
# Keep only the modelling columns plus the row `index`, then discard every row
# that still has a NULL in any of them.
# NOTE(review): dropping rows leaves gaps in `index`; any later logic that treats
# `index` as a contiguous 0..n-1 row position should be checked against this.
df_train = (
    df_train
    .select(feats + ["index"])
    .dropna()
)


In [0]:
# Build the feature vector in three stages:
#   1. pack the numeric columns into a single vector,
#   2. standardize that vector (centered via withMean=True; scaling to unit std is
#      the StandardScaler default),
#   3. append the indicator columns, unscaled, to produce the final "features" vector.
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
scaler = StandardScaler(inputCol='numeric_features', outputCol="num_feats_scaled", withMean=True)
final_assembler = VectorAssembler(
    inputCols=["num_feats_scaled"] + categorical_cols,
    outputCol="features"
)
pipeline = Pipeline(stages=[assembler, scaler, final_assembler])

# Fit once on the training split and keep the fitted model, so the validation and
# test splits can later be transformed with the *training* mean/std instead of
# re-fitting the scaler on them.
pipeline_model = pipeline.fit(df_train)

# Keep only the time keys, the label, the assembled features and the row index.
df_train_transform = pipeline_model \
    .transform(df_train) \
    .select("YEAR", "FL_DATE", "CRS_DEP_TIME", "DEP_DEL15", "features", "index")


Downloading artifacts:   0%|          | 0/25 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

## Helper Functions

In [0]:
def evaluate_multiclass_metrics(predictions, target_feature):
    """Compute accuracy and positive-class precision/recall/F-measure.

    Args:
        predictions: DataFrame containing a ``prediction`` column and the label
            column named by ``target_feature``.
        target_feature: Name of the label column.

    Returns:
        Tuple ``(accuracy, precision, recall, f1_measure)`` where precision,
        recall and F-measure are evaluated for label ``1.0``.
    """
    # Build (prediction, label) pairs; the label is cast to float so both
    # elements are floating point. No ordering is applied: the metrics are
    # aggregate counts, so a global sort would only add a needless shuffle.
    prediction_and_label = predictions\
                            .select(["prediction", target_feature])\
                            .withColumn(target_feature, col(target_feature).cast(FloatType()))

    # MulticlassMetrics works on an RDD of (prediction, label) tuples.
    metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

    # Collect metrics for the positive class (label 1.0).
    accuracy = metrics.accuracy
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1_measure = metrics.fMeasure(1.0)

    return accuracy, precision, recall, f1_measure

def evaluate_roc_curve(predictions, target_feature):
    """Score ``predictions`` with a BinaryClassificationEvaluator.

    The evaluator is built with ``target_feature`` as the label column and no
    explicit metric name, so it reports its default metric.

    Returns:
        The value returned by ``evaluator.evaluate(predictions)``.
    """
    return BinaryClassificationEvaluator(labelCol=target_feature).evaluate(predictions)

In [0]:
def plot_precision_recall_curve(lr_model):
    """Plot precision against recall using the model's training summary.

    Args:
        lr_model: Fitted model exposing ``summary.pr`` (a DataFrame with
            ``recall`` and ``precision`` columns).
    """
    pr_points = lr_model.summary.pr.toPandas()

    plt.plot(pr_points["recall"], pr_points["precision"])
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision vs Recall")
    plt.show()

def plot_roc_curve(lr_model):
    """Plot the ROC curve from the model's training summary and print its AUC.

    Args:
        lr_model: Fitted model exposing ``summary.roc`` (a DataFrame with
            ``FPR`` and ``TPR`` columns) and ``summary.areaUnderROC``.
    """
    training_summary = lr_model.summary
    lr_roc = training_summary.roc.toPandas()

    # FPR is plotted on the x-axis and TPR on the y-axis, so the axis labels
    # must follow the same order (they were previously swapped).
    plt.plot(lr_roc["FPR"], lr_roc["TPR"])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.show()

    print("Training set areaUnderROC: " + str(training_summary.areaUnderROC))

In [0]:
def create_dataset(lr_predictions, lr_model):
    """Tabulate model coefficients by feature name.

    Feature names are read from the ``ml_attr`` metadata attached to the
    ``features`` column of ``lr_predictions`` and matched to
    ``lr_model.coefficients`` by their ``idx``.

    Returns:
        pandas DataFrame with columns ``predictors``, ``coefficients`` and
        ``exp_coef`` (``exp`` of the coefficient), sorted by ``exp_coef`` in
        descending order.
    """
    attr_groups = lr_predictions.schema["features"].metadata["ml_attr"]["attrs"]

    # Flatten every attribute group into (idx, name) pairs ordered by idx.
    indexed_names = [
        (entry["idx"], entry["name"])
        for group in attr_groups.values()
        for entry in group
    ]
    indexed_names.sort()

    # Pair each name with its coefficient, largest magnitude first.
    named_coefficients = [(name, lr_model.coefficients[idx]) for idx, name in indexed_names]
    named_coefficients.sort(key=lambda pair: abs(pair[1]), reverse=True)

    dataset = pd.DataFrame(named_coefficients, columns=["predictors", "coefficients"])
    dataset["exp_coef"] = np.exp(dataset.coefficients)

    return dataset.sort_values(by="exp_coef", ascending=False)

In [0]:
def block_splits(n_splits, df):
    """Compute contiguous train/validation boundaries for blocked cross-validation.

    The row count of ``df`` is divided into ``n_splits`` equally sized blocks.
    Within each block the first 80% of positions form the training range and
    the remaining 20% the validation range.

    Args:
        n_splits: Number of blocks to create.
        df: Object exposing ``count()`` (e.g. a Spark DataFrame).

    Returns:
        List of ``[start, mid, stop]`` triples: ``[start, mid)`` is the training
        range and ``[mid, stop)`` the validation range of a block. Any remainder
        rows (``count % n_splits``) fall after the last block and are not
        covered.

    NOTE(review): the boundaries are row *positions* in ``0..count-1``. If the
    caller filters on an index column whose values are not contiguous (for
    example after rows were dropped), the ranges will not line up with the
    intended rows — verify against the caller.
    """
    n_samples = df.count()
    k_fold_size = n_samples // n_splits

    blocks = []
    for i in range(n_splits):
        start = i * k_fold_size
        stop = start + k_fold_size
        # 80/20 train/validation split inside the block.
        mid = int(0.8 * (stop - start)) + start
        blocks.append([start, mid, stop])

    return blocks

In [0]:
# Split the transformed training frame into 6 blocks of [start, mid, stop] boundaries
# for blocked cross-validation.
# NOTE(review): the boundaries are derived from the row count, while the frame's `index`
# column has gaps after dropna (the preview below starts at index 28) — confirm that
# filtering on `index` with these boundaries selects the intended rows.
blocks = block_splits(n_splits=6, df=df_train_transform)
print(blocks)

[[0, 165264, 206580], [206580, 371844, 413160], [413160, 578424, 619740], [619740, 785004, 826320], [826320, 991584, 1032900], [1032900, 1198164, 1239480]]


## LogReg With Lasso With Block Time Series Splits

In [0]:
# Preview the first 10 rows of the assembled training frame (keys, label, features, index).
display(df_train_transform.limit(10))

YEAR,FL_DATE,CRS_DEP_TIME,DEP_DEL15,features,index
2015,2015-01-01,515,1.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 33, 55, 60, 76, 82, 83), values -> List(0.4931302957384638, -0.14711279512505573, 0.43012418358580196, -1.3507800584486882, -0.29681411056900553, -0.6413437089299155, 0.5477557688206123, 1.1369205965954012, -1.2885845784957484, -0.6522296362892461, 0.5069748911064916, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",28
2015,2015-01-01,600,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 34, 40, 53, 55, 62, 73, 76, 80, 82, 83), values -> List(0.4054122149618101, -0.6014243824661881, 0.19482285022736376, 0.8337922242069862, -0.29681411056900553, 0.8557696709017842, 0.5281015360715038, -0.3079805502185981, 1.8318741208185734, -0.6522296362892461, 0.41896270848393025, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))",105
2015,2015-01-01,600,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 34, 55, 69, 76, 82, 83), values -> List(0.4897565234009002, -0.14711279512505573, 0.39943219628704774, -1.2947653845344402, -0.29681411056900553, -0.8552170489058727, 0.5461178770420306, 1.1544701274613054, 1.7427181579810214, -0.6522296362892461, 0.0787617718082605, -0.03176444365678946, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",142
2015,2015-01-01,625,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 34, 46, 54, 55, 60, 73, 76, 82, 83), values -> List(0.18105635451383054, -0.14711279512505573, 0.39943219628704774, -1.2947653845344402, -0.29681411056900553, -0.8552170489058727, 0.5461178770420306, 1.1544701274613054, 1.7427181579810214, -0.6522296362892461, 0.19385462600699455, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))",289
2015,2015-01-01,630,1.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 34, 55, 65, 80, 82, 83), values -> List(1.3129569737664193, -0.5851310808601439, 0.22551483752611798, 0.6097335285499939, -0.29681411056900553, 0.9270607842271033, -1.9385070950099375, -0.8929608212905594, -1.2885845784957484, -0.6522296362892461, 1.3295502902327379, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",322
2015,2015-01-01,655,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 34, 55, 60, 76, 82, 83), values -> List(0.9452157889719865, -0.14711279512505573, 0.39943219628704774, -1.2947653845344402, -0.29681411056900553, -0.8552170489058727, 0.5461178770420306, 1.1544701274613054, 1.7427181579810214, -0.6522296362892461, 1.0875167880206942, -0.23341094594021725, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",421
2015,2015-01-01,655,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 22, 34, 52, 55, 60, 63, 76, 82, 83, 84), values -> List(0.9452157889719865, -0.14711279512505573, 0.39943219628704774, -1.2947653845344402, -0.29681411056900553, -0.8552170489058727, 0.5461178770420306, 1.1544701274613054, 1.7427181579810214, -0.6522296362892461, 0.9605761400073846, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",423
2015,2015-01-01,713,1.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 35, 55, 60, 80, 82, 83), values -> List(0.3649269469110469, -0.5867604095963854, 0.2152841750931999, 0.8337922242069862, -0.29681411056900553, 0.42802299094987, 0.5305582175414192, -0.015490414682617408, 1.6535621951434694, -0.6522296362892461, 0.3783417011196712, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",544
2015,2015-01-01,715,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 35, 55, 60, 80, 82, 83), values -> List(0.4509581415189188, -0.5867604095963854, 0.2152841750931999, 0.8337922242069862, -0.29681411056900553, 0.42802299094987, 0.5305582175414192, -0.015490414682617408, 1.6535621951434694, -0.6522296362892461, 0.4646613417687217, -3.2148547156111107, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",562
2015,2015-01-01,720,0.0,"Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 35, 55, 60, 80, 82, 83), values -> List(0.4324023936623189, -0.5867604095963854, 0.2152841750931999, 0.8337922242069862, -0.29681411056900553, 0.42802299094987, 0.5305582175414192, -0.015490414682617408, 1.6535621951434694, -0.6522296362892461, -1.0653966022850367, -0.07144557424155, -0.9751745435483247, -1.118160971981017, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",591


In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import numpy as np

def lasso_block_cross_val_scores_with_coef(blocks, train, target_feature, final_assembler, reg_param=0.01, elastic_net_param=1.0):
    """Blocked cross-validation of a regularised logistic regression with coefficient reporting.

    For each block a ``LogisticRegression`` is fitted on the rows of ``train``
    whose ``index`` lies in ``[block[0], block[1])`` and scored on the rows whose
    ``index`` lies in ``[block[1], block[2])``. Per-block metrics and non-zero
    coefficients are printed, followed by the averages across all blocks.

    Parameters
    ----------
    blocks : iterable
        Each element is indexable and supplies three ``index`` bounds:
        ``(train_start, train_end_and_val_start, val_end)``.
    train : DataFrame
        Spark DataFrame filtered on its ``index`` column; the model reads the
        ``features`` column and uses ``target_feature`` as the label.
    target_feature : str
        Name of the label column.
    final_assembler : object
        Only ``getInputCols()`` is called on it, to recover feature names.
    reg_param : float, default 0.01
        Passed to ``LogisticRegression`` as ``regParam``.
    elastic_net_param : float, default 1.0
        Passed to ``LogisticRegression`` as ``elasticNetParam``.

    Returns
    -------
    dict
        ``avg_accuracy``, ``avg_precision``, ``avg_recall``, ``avg_f0.5``,
        ``avg_auc`` (means over blocks); ``accuracies``, ``precisions``,
        ``recalls``, ``f0.5_scores``, ``auc_scores`` (per-block lists);
        ``avg_coefficients`` (name -> mean coefficient), ``non_zero_features``
        (name -> mean coefficient, non-zero only) and ``feature_names``.

    Raises
    ------
    ValueError
        If ``blocks`` yields no block, since there is nothing to average.

    Notes
    -----
    * Depends on two notebook-level names that are not parameters:
      ``numeric_cols`` and ``evaluate_multiclass_metrics``.
    * The F score uses ``beta = 1/sqrt(2)`` (so ``beta**2`` is about 0.5). It is
      printed as "F1" per block and as "F-0.5" in the summary and result keys;
      those labels are kept as they are so existing outputs and callers match.
    * Names and coefficients are paired with ``zip``, which silently truncates
      if the two sequences differ in length.
    """
    # Swap the assembler's first input column for the individual numeric column
    # names so that each coefficient can be paired with a readable name.
    # NOTE(review): assumes that first input is the assembled numeric vector
    # built from `numeric_cols`, in the same order -- confirm against the caller.
    feature_names = final_assembler.getInputCols()
    feature_names = numeric_cols + feature_names[1:]
    acc, prec, rec, f1, auc = [], [], [], [], []
    all_coefficients = []

    # Loop-invariant pieces are built once instead of once per block.
    beta = 1/(2**0.5)
    beta_sq = beta**2
    evaluator = BinaryClassificationEvaluator(labelCol=target_feature, metricName="areaUnderROC")

    for block in blocks:
        # Train window is [block[0], block[1]); validation window is [block[1], block[2]).
        train_block = train.filter((col("index") >= block[0]) & (col("index") < block[1]))
        val_block = train.filter((col("index") >= block[1]) & (col("index") < block[2]))

        lasso_model = LogisticRegression(
            featuresCol="features",
            labelCol=target_feature,
            maxIter=10,
            regParam=reg_param,
            elasticNetParam=elastic_net_param
        ).fit(train_block)

        val_preds = lasso_model.transform(val_block)

        accuracy, precision, recall, f1_measure = evaluate_multiclass_metrics(val_preds, target_feature)
        auc_score = evaluator.evaluate(val_preds)

        # F-beta; when precision and recall are both zero the denominator is
        # zero, so report 0.0 instead of raising ZeroDivisionError.
        fbeta_denominator = (beta_sq * precision) + recall
        if fbeta_denominator:
            fbeta = (1 + beta_sq) * (precision * recall) / fbeta_denominator
        else:
            fbeta = 0.0

        acc.append(accuracy)
        prec.append(precision)
        rec.append(recall)
        f1.append(fbeta)
        auc.append(auc_score)

        coefficients = lasso_model.coefficients.toArray()
        all_coefficients.append(coefficients)

        print(f"Block {block}:")
        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {fbeta:.4f}, AUC: {auc_score:.4f}")
        print("Non-zero coefficients (sorted by absolute value):")
        non_zero_features = sorted(
            ((feature, coef) for feature, coef in zip(feature_names, coefficients) if coef != 0),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        for feature, coef in non_zero_features:
            print(f"{feature}: {coef:.6f}")
        print(f"Number of non-zero coefficients: {len(non_zero_features)}")
        print("---")

    # Without at least one block the means below are undefined; fail with a
    # clear message rather than an obscure error from zipping against NaN.
    if not all_coefficients:
        raise ValueError("blocks is empty: at least one cross-validation block is required")

    # Calculate average metrics
    avg_acc = np.mean(acc)
    avg_prec = np.mean(prec)
    avg_rec = np.mean(rec)
    avg_f1 = np.mean(f1)
    avg_auc = np.mean(auc)

    print("\nAverage Metrics Across All Blocks:")
    print(f"Accuracy: {avg_acc:.4f}")
    print(f"Precision: {avg_prec:.4f}")
    print(f"Recall: {avg_rec:.4f}")
    print(f"F-0.5 Score: {avg_f1:.4f}")
    print(f"AUC: {avg_auc:.4f}")

    # Element-wise mean of the per-block coefficient vectors. A feature that is
    # zeroed in only some blocks still has a non-zero average.
    avg_coefficients = np.mean(all_coefficients, axis=0)

    print("\nOverall Non-zero coefficients (sorted by absolute average value) for reg_param: ", reg_param)
    overall_non_zero_features = sorted(
        ((feature, coef) for feature, coef in zip(feature_names, avg_coefficients) if coef != 0),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for feature, coef in overall_non_zero_features:
        print(f"{feature}: {coef:.6f}")
    print(f"Number of overall non-zero coefficients: {len(overall_non_zero_features)}")

    return {
        'avg_accuracy': avg_acc,
        'avg_precision': avg_prec,
        'avg_recall': avg_rec,
        'avg_f0.5': avg_f1,
        'avg_auc': avg_auc,
        'accuracies': acc,
        'precisions': prec,
        'recalls': rec,
        'f0.5_scores': f1,
        'auc_scores': auc,
        'avg_coefficients': dict(zip(feature_names, avg_coefficients)),
        'non_zero_features': dict(overall_non_zero_features),
        'feature_names': feature_names
    }


In [0]:
# Usage: blocked cross-validation on the transformed training set with
# reg_param=0.01 and a pure L1 penalty (elastic_net_param=1.0).
results1 = lasso_block_cross_val_scores_with_coef(
    blocks=blocks,
    train=df_train_transform,
    target_feature='DEP_DEL15',
    final_assembler=final_assembler,
    reg_param=0.01,
    elastic_net_param=1.0,
)

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]



Block [0, 165264, 206580]:
Accuracy: 0.6808, Precision: 0.6061, Recall: 0.5752, F1: 0.5955, AUC: 0.7226
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.982882
HOUR_06: -0.708882
HOUR_05: -0.590236
HOUR_07: -0.496137
OP_CARRIER_MQ: 0.382845
Snow: 0.325588
DEP_4hr: 0.322137
OP_CARRIER_AS: -0.260305
DEP_6hr: 0.257766
HOUR_08: -0.223687
OP_CARRIER_DL: -0.172641
OP_CARRIER_OO: 0.133370
HourlyVisibility: -0.092126
DAY_OF_WEEK_2: -0.058313
HOUR_23: -0.051889
DAY_OF_WEEK_6: 0.040105
DAY_OF_WEEK_3: -0.037886
DAY_OF_WEEK_7: 0.037845
Rain: 0.025785
LOG_REALIZED_DELAY_MIN: -0.019866
type_medium_airport: 0.013429
HOUR_16: 0.012860
DISTANCE_LAG: -0.011511
HourlyWindSpeed: 0.011159
HourlyRelativeHumidity: 0.009344
OUTDEG_AIRPORT_6hr: 0.005859
Number of non-zero coefficients: 26
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]



Block [206580, 371844, 413160]:
Accuracy: 0.6527, Precision: 0.6904, Recall: 0.6014, F1: 0.6579, AUC: 0.7101
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 1.057763
HOUR_06: -0.591743
OP_CARRIER_MQ: 0.423241
HOUR_05: -0.403136
Snow: 0.396608
DEP_4hr: 0.315353
HOUR_07: -0.309789
HOLIDAY: 0.128328
DEP_6hr: 0.127490
OP_CARRIER_OO: 0.126455
DAY_OF_WEEK_6: -0.125544
HourlyVisibility: -0.123328
HOUR_08: -0.121091
DEP_12hr: 0.113321
HOUR_18: 0.108071
OP_CARRIER_AA: -0.106238
OP_CARRIER_B6: 0.069991
DAY_OF_WEEK_5: 0.060201
HourlyWindSpeed: 0.055198
Rain: 0.050842
HourlyPrecipitation: 0.045271
OUTDEG_AIRPORT_6hr: 0.042417
HourlySeaLevelPressure: -0.033829
HOUR_17: 0.032978
OP_CARRIER_AS: -0.025866
MONTH_2: 0.024848
LOG_REALIZED_DELAY_MIN: -0.023311
HOUR_09: -0.017758
ELEVATION: 0.012576
HOUR_19: 0.007029
HourlyWindDirection: 0.000153
Number of non-zero coefficients: 31
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [413160, 578424, 619740]:
Accuracy: 0.6871, Precision: 0.6144, Recall: 0.4582, F1: 0.5517, AUC: 0.7021
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.790018
HOUR_06: -0.612120
HOUR_05: -0.558470
HOUR_07: -0.552376
HOUR_08: -0.446537
OP_CARRIER_MQ: 0.438017
DEP_4hr: 0.293626
Snow: 0.283940
OP_CARRIER_AS: -0.170021
DEP_6hr: 0.166992
OP_CARRIER_F9: 0.143315
DAY_OF_WEEK_7: 0.131314
HourlyDryBulbTemperature: -0.086956
HOUR_19: 0.077064
HourlyVisibility: -0.053553
HourlyWindSpeed: 0.036309
HourlyPrecipitation: 0.029100
ELEVATION: 0.026663
DISTANCE_LAG: -0.023784
LOG_REALIZED_DELAY_MIN: -0.018749
HourlySeaLevelPressure: -0.006478
Number of non-zero coefficients: 21
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [619740, 785004, 826320]:
Accuracy: 0.6894, Precision: 0.7880, Recall: 0.5995, F1: 0.7133, AUC: 0.7611
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 1.016136
HOUR_05: -0.750347
HOUR_06: -0.734664
HOUR_07: -0.412240
Thunder: 0.410336
Snow: 0.357012
DEP_6hr: 0.311629
HOUR_08: -0.242612
DEP_4hr: 0.240913
HOUR_21: 0.191387
OP_CARRIER_UA: 0.146659
OP_CARRIER_DL: -0.116566
type_large_airport: 0.081215
HourlyVisibility: -0.079342
OP_CARRIER_MQ: 0.073380
HOUR_20: 0.068714
HOUR_19: 0.065050
OP_CARRIER_AS: -0.040562
OP_CARRIER_WN: 0.037674
HourlyWindSpeed: 0.025453
LOG_REALIZED_DELAY_MIN: -0.022281
OUTDEG_AIRPORT_6hr: 0.016908
HOUR_16: 0.009006
HourlyPrecipitation: 0.001844
HOUR_18: 0.000904
Number of non-zero coefficients: 25
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [826320, 991584, 1032900]:
Accuracy: 0.6969, Precision: 0.7456, Recall: 0.4552, F1: 0.6148, AUC: 0.7365
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.997491
Thunder: 0.795969
HOUR_06: -0.736563
HOUR_05: -0.663397
HOUR_07: -0.600705
DEP_4hr: 0.340850
HOUR_08: -0.321624
OP_CARRIER_UA: 0.212270
DEP_6hr: 0.200363
OP_CARRIER_DL: -0.160683
DAY_OF_WEEK_2: -0.123818
HourlyVisibility: -0.096782
HourlyWindSpeed: 0.080944
HOUR_00: -0.068735
HOUR_18: 0.062929
MONTH_5: -0.050541
HourlyPrecipitation: 0.049356
DAY_OF_WEEK_1: 0.030730
MONTH_4: 0.030337
DAY_OF_WEEK_5: 0.025203
HourlyDryBulbTemperature: 0.017673
OP_CARRIER_AS: -0.015693
OUTDEG_AIRPORT_6hr: 0.010085
HOUR_19: 0.006708
HourlySeaLevelPressure: -0.003031
Number of non-zero coefficients: 25
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [1032900, 1198164, 1239480]:
Accuracy: 0.7112, Precision: 0.7849, Recall: 0.6682, F1: 0.7417, AUC: 0.7831
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.902077
Thunder: 0.892707
OP_CARRIER_NK: 0.539764
HOUR_06: -0.449872
HOUR_07: -0.382242
DEP_4hr: 0.316897
HOUR_05: -0.269759
OP_CARRIER_UA: 0.242637
HOUR_08: -0.235394
DEP_6hr: 0.206493
HourlyDryBulbTemperature: 0.201330
Rain: 0.135492
OUTDEG_AIRPORT_6hr: 0.085445
OP_CARRIER_WN: 0.076518
HOUR_09: -0.076310
DAY_OF_WEEK_7: -0.058547
DAY_OF_WEEK_3: 0.057264
OP_CARRIER_EV: 0.054155
HOUR_19: 0.051061
HourlySeaLevelPressure: -0.050139
View Obstruction: -0.044887
HOUR_15: 0.042675
HourlyPrecipitation: 0.022778
HOUR_16: 0.019962
HourlyWindSpeed: 0.013097
OP_CARRIER_OO: -0.011845
HourlyWindDirection: -0.010359
HourlyVisibility: -0.010354
DAY_OF_WEEK_5: -0.010093
DISTANCE_LAG: -0.009434
Number of non-zero coefficients: 30
---

Average Metrics Across All Blocks:
Accuracy: 0.6864
Precision: 0.7049
Recall: 0.5596
F-0.5 Sco

In [0]:
# Same blocked cross-validation with a stronger penalty: reg_param=0.05.
results2 = lasso_block_cross_val_scores_with_coef(
    blocks=blocks,
    train=df_train_transform,
    target_feature='DEP_DEL15',
    final_assembler=final_assembler,
    reg_param=0.05,
    elastic_net_param=1.0,
)

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]



Block [0, 165264, 206580]:
Accuracy: 0.6794, Precision: 0.6158, Recall: 0.5261, F1: 0.5827, AUC: 0.6976
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.491783
DEP_4hr: 0.277414
DEP_6hr: 0.225438
Number of non-zero coefficients: 3
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [206580, 371844, 413160]:
Accuracy: 0.6489, Precision: 0.7181, Recall: 0.5340, F1: 0.6441, AUC: 0.6965
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.577376
DEP_4hr: 0.295223
DEP_6hr: 0.154681
Snow: 0.147422
Number of non-zero coefficients: 4
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [413160, 578424, 619740]:
Accuracy: 0.6554, Precision: 0.5474, Recall: 0.4992, F1: 0.5304, AUC: 0.6806
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.307137
DEP_4hr: 0.285653
DEP_6hr: 0.145911
Snow: 0.021572
Number of non-zero coefficients: 4
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [619740, 785004, 826320]:
Accuracy: 0.6534, Precision: 0.8081, Recall: 0.4893, F1: 0.6640, AUC: 0.7393
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.508969
DEP_4hr: 0.239336
DEP_6hr: 0.231332
Number of non-zero coefficients: 3
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [826320, 991584, 1032900]:
Accuracy: 0.6728, Precision: 0.7704, Recall: 0.3477, F1: 0.5483, AUC: 0.7232
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.476759
Thunder: 0.363420
DEP_4hr: 0.320925
DEP_6hr: 0.167517
Number of non-zero coefficients: 4
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [1032900, 1198164, 1239480]:
Accuracy: 0.6969, Precision: 0.7812, Recall: 0.6384, F1: 0.7270, AUC: 0.7664
Non-zero coefficients (sorted by absolute value):
Thunder: 0.577725
REALIZED_DEL15: 0.401145
DEP_4hr: 0.318763
DEP_6hr: 0.118426
HourlyDryBulbTemperature: 0.014611
Number of non-zero coefficients: 5
---

Average Metrics Across All Blocks:
Accuracy: 0.6678
Precision: 0.7069
Recall: 0.5058
F-0.5 Score: 0.6161
AUC: 0.7173

Overall Non-zero coefficients (sorted by absolute average value) for reg_param:  0.05
REALIZED_DEL15: 0.460528
DEP_4hr: 0.289552
DEP_6hr: 0.173884
Thunder: 0.156858
Snow: 0.028166
HourlyDryBulbTemperature: 0.002435
Number of overall non-zero coefficients: 6


In [0]:
# Same blocked cross-validation at an intermediate penalty: reg_param=0.03.
results3 = lasso_block_cross_val_scores_with_coef(
    blocks=blocks,
    train=df_train_transform,
    target_feature='DEP_DEL15',
    final_assembler=final_assembler,
    reg_param=0.03,
    elastic_net_param=1.0,
)

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]



Block [0, 165264, 206580]:
Accuracy: 0.6830, Precision: 0.6200, Recall: 0.5341, F1: 0.5884, AUC: 0.7046
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.692590
DEP_4hr: 0.323878
DEP_6hr: 0.255318
HOUR_06: -0.215339
Snow: 0.051051
HOUR_07: -0.049374
OP_CARRIER_MQ: 0.041390
Number of non-zero coefficients: 7
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [206580, 371844, 413160]:
Accuracy: 0.6542, Precision: 0.7041, Recall: 0.5773, F1: 0.6560, AUC: 0.6998
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.783359
DEP_4hr: 0.326377
Snow: 0.275044
DEP_6hr: 0.191882
HOUR_06: -0.076099
OP_CARRIER_MQ: 0.047220
HourlyVisibility: -0.023201
DEP_12hr: 0.001550
Number of non-zero coefficients: 8
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]



Block [413160, 578424, 619740]:
Accuracy: 0.6869, Precision: 0.6209, Recall: 0.4367, F1: 0.5444, AUC: 0.6908
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.502890
DEP_4hr: 0.296960
DEP_6hr: 0.189328
Snow: 0.150264
HOUR_06: -0.131161
HOUR_07: -0.109389
OP_CARRIER_MQ: 0.100209
HOUR_08: -0.059936
HourlyDryBulbTemperature: -0.042197
Number of non-zero coefficients: 9
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [619740, 785004, 826320]:
Accuracy: 0.6631, Precision: 0.7917, Recall: 0.5302, F1: 0.6799, AUC: 0.7411
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.727139
DEP_4hr: 0.279446
DEP_6hr: 0.278916
HOUR_06: -0.249114
Snow: 0.124346
HOUR_07: -0.019632
Number of non-zero coefficients: 6
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [826320, 991584, 1032900]:
Accuracy: 0.6908, Precision: 0.7522, Recall: 0.4263, F1: 0.5994, AUC: 0.7284
Non-zero coefficients (sorted by absolute value):
REALIZED_DEL15: 0.711533
Thunder: 0.574932
DEP_4hr: 0.361543
HOUR_06: -0.239241
DEP_6hr: 0.192208
HOUR_07: -0.172963
HourlyWindSpeed: 0.014764
Number of non-zero coefficients: 7
---


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Block [1032900, 1198164, 1239480]:
Accuracy: 0.7039, Precision: 0.7874, Recall: 0.6468, F1: 0.7342, AUC: 0.7716
Non-zero coefficients (sorted by absolute value):
Thunder: 0.724992
REALIZED_DEL15: 0.620928
DEP_4hr: 0.352857
DEP_6hr: 0.165543
HourlyDryBulbTemperature: 0.107742
HOUR_06: -0.062721
HOUR_07: -0.056047
View Obstruction: -0.054565
OUTDEG_AIRPORT_6hr: 0.038618
Number of non-zero coefficients: 9
---

Average Metrics Across All Blocks:
Accuracy: 0.6803
Precision: 0.7127
Recall: 0.5252
F-0.5 Score: 0.6337
AUC: 0.7227

Overall Non-zero coefficients (sorted by absolute average value) for reg_param:  0.03
REALIZED_DEL15: 0.673073
DEP_4hr: 0.323510
Thunder: 0.216654
DEP_6hr: 0.212199
HOUR_06: -0.162279
Snow: 0.100118
HOUR_07: -0.067901
OP_CARRIER_MQ: 0.031470
HourlyDryBulbTemperature: 0.010924
HOUR_08: -0.009989
View Obstruction: -0.009094
OUTDEG_AIRPORT_6hr: 0.006436
HourlyVisibility: -0.003867
HourlyWindSpeed: 0.002461
DEP_12hr: 0.000258
Number of overall non-zero coefficients: 15


## Run baseline: Lasso (L1) logistic regression with regParam = 0.01 on the full 2015-2017 training set

In [0]:
# Baseline pipeline: assemble the numeric columns, standardise them, append the
# categorical columns to form "features", then fit a logistic regression.
target_feature = 'DEP_DEL15'

assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_features",
)

# withMean=True centres each numeric feature in addition to scaling it.
scaler = StandardScaler(
    inputCol='numeric_features',
    outputCol="num_feats_scaled",
    withMean=True,
)

# The scaled numeric vector comes first, followed by the categorical columns.
final_assembler = VectorAssembler(
    inputCols=["num_feats_scaled"] + categorical_cols,
    outputCol="features",
)

# elasticNetParam=1.0 selects a pure L1 (lasso) penalty of strength regParam.
lr = LogisticRegression(
    featuresCol="features",
    labelCol=target_feature,
    maxIter=10,
    regParam=0.01,
    elasticNetParam=1.0,
)

pipeline = Pipeline(stages=[assembler, scaler, final_assembler, lr])

# Despite its name, `baseline_pipeline` holds the *fitted* pipeline model.
baseline_pipeline = pipeline.fit(df_train)


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# The logistic regression is the last stage of the pipeline, so the last fitted
# stage is the trained model whose summary and coefficients are inspected below.
baseline_model = baseline_pipeline.stages[-1]

In [0]:
# Training-set summary and metrics for the fitted baseline model.
training_summary = baseline_model.summary

# Index 1 of the per-label lists is used as the class of interest.
train_precision, train_recall = (
    training_summary.precisionByLabel[1],
    training_summary.recallByLabel[1],
)

# F-beta with beta = 1/sqrt(2): the denominator weights precision by beta**2.
beta = 1/(2**0.5)
fbeta_numerator = (1 + beta**2) * (train_precision * train_recall)
fbeta_denominator = (beta**2 * train_precision) + train_recall
fbeta = fbeta_numerator / fbeta_denominator

print(f"Precision by Label: \n{training_summary.precisionByLabel}")
print(f"Recall by Label: \n{training_summary.recallByLabel}")
print(f"F-Beta Score: \n{fbeta}")


Precision by Label: 
[0.6913825906117017, 0.7281731801517264]
Recall by Label: 
[0.8174651131485664, 0.5726564744294893]
F-Beta Score: 
0.6677280599990196


In [0]:
# Model coefficients: one weight per slot of the assembled "features" vector.
coefficients = baseline_model.coefficients.toArray()
print(coefficients.size)

# Readable names for those slots: numeric columns first, then the categorical
# columns, mirroring the order in which the pipeline assembled them.
features = [*numeric_cols, *categorical_cols]

87


In [0]:
# Show the raw coefficient array; entries that are exactly 0 carry no weight in
# the fitted model.
coefficients

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.03888968,
        0.        , -0.01367048, -0.03561889,  0.        ,  0.05266061,
        0.        ,  0.        ,  0.01724833,  0.        ,  0.23042965,
        0.38983607,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.01843534,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.63853353, -0.62712592,
       -0.50205953, -0.28064397,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.01550906,  0.01421074,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.14861588,  0.        , -0.07599524,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [0]:
# Coefficient table indexed by feature name. Exp_Coefficients is exp(coef),
# i.e. the multiplicative change in the odds per unit of the feature.
baseline_coeff_df = (
    pd.DataFrame({'Features': features, 'Coefficients': coefficients})
    .set_index('Features')
    .assign(Exp_Coefficients=lambda frame: np.exp(frame['Coefficients']))
)

In [0]:
# Keep only features with a non-trivial effect (exp(0) == 1 marks a zero
# coefficient) and list them from the largest odds multiplier down.
(
    baseline_coeff_df
    .loc[lambda frame: frame['Exp_Coefficients'] != 1]
    .sort_values(by='Exp_Coefficients', ascending=False)
)

Unnamed: 0_level_0,Coefficients,Exp_Coefficients
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
REALIZED_DEL15,1.02838,2.796533
Thunder,0.73445,2.084335
DEP_4hr,0.389836,1.476739
DEP_6hr,0.23043,1.259141
Snow,0.143994,1.154877
HourlyWindSpeed,0.052661,1.054072
HourlyPrecipitation,0.03889,1.039656
MONTH_6,0.018435,1.018606
OUTDEG_AIRPORT_6hr,0.017248,1.017398
HOUR_18,0.015509,1.01563


In [0]:
# Score df_val with the fitted baseline pipeline; the result keeps the input
# columns and gains rawPrediction, probability and prediction.
predictions_2018 = baseline_pipeline.transform(df_val)

In [0]:
# Evaluate the baseline model on the 2018 predictions.
# NOTE(review): the printed labels say "Test" although this cell scores the
# predictions made on df_val; the labels are kept unchanged -- confirm intent.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(predictions_2018)
print(f"Test Area under ROC curve: {area_under_curve}")

# MulticlassMetrics consumes an RDD of (prediction, label) pairs, so keep only
# those two columns (in that order) and turn each Row into a tuple.
prediction_and_label = predictions_2018.select(["prediction", target_feature])

# Create MulticlassMetrics object
metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

# Query each metric once and reuse the value in the formula and the printout.
val_precision = metrics.precision(1.0)
val_recall = metrics.recall(1.0)

# F-beta with beta = 1/sqrt(2). If precision and recall are both zero the
# denominator is zero, so report 0.0 instead of raising ZeroDivisionError.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * val_precision) + val_recall
if fbeta_denominator:
    fbeta = (1 + beta**2) * (val_precision * val_recall) / fbeta_denominator
else:
    fbeta = 0.0

# metrics
print("Test Metrics")
print("Accuracy:", metrics.accuracy)
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F Beta:", fbeta)

Test Area under ROC curve: 0.7271615903130036




Test Metrics
Accuracy: 0.7128203408079407
Precision: 0.7100300659420004
Recall: 0.44689405225314527
F Beta: 0.5935365103795329


In [0]:
# Peek at the first five scored rows (every input column plus the model's
# rawPrediction, probability and prediction columns).
display(predictions_2018.limit(5))

index,MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN_AIRPORT_ID,CRS_DEP_TIME,DISTANCE,DEP_DEL15,QUARTER,DEP_DELAY,DEST_AIRPORT_ID,FL_DATE,TAIL_NUM,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,type,ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed,HourlyPresentWeatherType,HOUR,MONTH_indexed,MONTH_vec,DAY_OF_WEEK_indexed,DAY_OF_WEEK_vec,OP_CARRIER_indexed,OP_CARRIER_vec,ORIGIN_AIRPORT_ID_indexed,ORIGIN_AIRPORT_ID_vec,HOUR_indexed,HOUR_vec,type_indexed,type_vec,HourlySkyConditions_vec,HourlyPresentWeatherType_vec,Rain,Snow,Thunder,Tornado,View Obstruction,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,HOUR_00,HOUR_01,HOUR_02,HOUR_03,HOUR_04,HOUR_05,HOUR_06,HOUR_07,HOUR_08,HOUR_09,HOUR_10,HOUR_11,HOUR_12,HOUR_13,HOUR_14,HOUR_15,HOUR_16,HOUR_17,HOUR_18,HOUR_19,HOUR_20,HOUR_21,HOUR_22,HOUR_23,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,OP_CARRIER_DL,OP_CARRIER_EV,OP_CARRIER_F9,OP_CARRIER_G4,OP_CARRIER_HA,OP_CARRIER_MQ,OP_CARRIER_NK,OP_CARRIER_OH,OP_CARRIER_OO,OP_CARRIER_QX,OP_CARRIER_UA,OP_CARRIER_US,OP_CARRIER_VX,OP_CARRIER_WN,OP_CARRIER_YV,OP_CARRIER_YX,type_large_airport,type_medium_airport,type_seaplane_base,type_small_airport,DEP_12hr,DEP_6hr,DEP_4hr,DISTANCE_LAG,REALIZED_DELAY_MIN,REALIZED_DEL15,HOLIDAY,OUTDEG_AIRPORT_6hr,YEAR,LOG_REALIZED_DELAY_MIN,numeric_features,num_feats_scaled,features,rawPrediction,probability,prediction
8445849,1,1,DL,14771,20,1589.0,0.0,1,-6.0,13487,2018-01-01,N669DN,543,-12.0,0.0,0.0,large_airport,2.4,30.17,54.0,0.0,80.0,BKN:07 160,30.16,9.94,0.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 19, indices -> List(1), values -> List(1.0))",7.0,"Map(vectorType -> sparse, length -> 372, indices -> List(7), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(55, 258), values -> List(1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.1194029850746268,0.1111111111111111,0.1071428571428571,2079.0,-1.0,0,1,0.0337078651685393,2018,5.529032183358565,"Map(vectorType -> dense, length -> 16, values -> List(1589.0, 2.4000000953674316, 30.170000076293945, 54.0, 0.0, 80.0, 30.15999984741211, 9.9399995803833, 0.0, 0.0, 2079.0, 5.529032183358565, 0.033707865168539325, 0.11940298507462686, 0.1111111111111111, 0.10714285714285714))","Map(vectorType -> dense, length -> 16, values -> List(1.2825930227283469, -0.6041399329496747, 0.23574549995903235, -0.1184572323352305, -0.29681411056900553, -0.2848881423033194, 0.53137716343071, 1.1369205965954012, -1.3777405413333004, -0.652229636289246, 2.1284301017298333, -0.03176444365678946, 0.6769743860053596, -0.4923255922718723, -0.39540686784453966, -0.4384035529200434))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 52, 63, 82, 83), values -> List(1.2825930227283469, -0.6041399329496747, 0.23574549995903235, -0.1184572323352305, -0.29681411056900553, -0.2848881423033194, 0.53137716343071, 
1.1369205965954012, -1.3777405413333004, -0.652229636289246, 2.1284301017298333, -0.03176444365678946, 0.6769743860053596, -0.4923255922718723, -0.39540686784453966, -0.4384035529200434, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.7393824318427165, -0.7393824318427165))","Map(vectorType -> dense, length -> 2, values -> List(0.6768607963274049, 0.3231392036725951))",0.0
8445850,1,1,AA,12892,30,2125.0,0.0,1,-4.0,11057,2018-01-01,N927UW,745,-30.0,0.0,0.0,large_airport,29.6,30.14,58.0,0.0,87.0,5,30.14,3.73,0.0,0.0,BR:1 ||,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 19, indices -> List(2), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 372, indices -> List(4), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(187), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(4), values -> List(1.0))",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.2669322709163346,0.32,0.2424242424242424,2279.0,14.0,0,1,0.0413450937155457,2018,5.586874056410479,"Map(vectorType -> dense, length -> 16, values -> List(2125.0, 29.600000381469727, 30.139999389648438, 58.0, 0.0, 87.0, 30.139999389648438, 3.7300000190734863, 0.0, 0.0, 2279.0, 5.586874056410479, 0.041345093715545754, 0.26693227091633465, 0.32, 0.24242424242424243))","Map(vectorType -> dense, length -> 16, values -> List(2.1867640091953926, -0.5302769590219136, 0.20505351266027816, 0.10560146332176179, -0.29681411056900553, 0.2141496509739139, 0.5297392716521283, -0.679443016770487, -1.3777405413333004, -0.652229636289246, 2.4669384964319923, 0.256170750314657, 1.0513036087432672, 0.2809301731954526, 0.5656584175579674, 0.11839475836712271))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 52, 60, 80, 82, 83), values -> List(2.1867640091953926, -0.5302769590219136, 0.20505351266027816, 0.10560146332176179, 
-0.29681411056900553, 0.2141496509739139, 0.5297392716521283, -0.679443016770487, -1.3777405413333004, -0.652229636289246, 2.4669384964319923, 0.256170750314657, 1.0513036087432672, 0.2809301731954526, 0.5656584175579674, 0.11839475836712271, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.15369338968161333, -0.15369338968161333))","Map(vectorType -> dense, length -> 2, values -> List(0.5383478903929685, 0.4616521096070315))",0.0
8445851,1,1,AS,10299,30,1542.0,0.0,1,-12.0,14057,2018-01-01,N319AS,442,-25.0,0.0,0.0,large_airport,27.4,29.59,25.0,0.0,78.0,FEW:02 70 FEW:02 110,29.59,10.0,20.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 19, indices -> List(8), values -> List(1.0))",62.0,"Map(vectorType -> sparse, length -> 372, indices -> List(62), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(11, 216, 261), values -> List(1.0, 1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,1542.0,4.0,0,1,0.0016538037486218,2018,5.548686904115838,"Map(vectorType -> dense, length -> 16, values -> List(1542.0, 27.399999618530273, 29.59000015258789, 25.0, 0.0, 78.0, 29.59000015258789, 10.0, 20.0, 0.0, 1542.0, 5.548686904115838, 0.0016538037486218302, 0.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 16, values -> List(1.2033093727956021, -0.5362511721573848, -0.3576192620548446, -1.7428827758484247, -0.29681411056900553, -0.42747036895395746, 0.48469834112683285, 1.1544701274613054, -1.1994286156581961, -0.652229636289246, 1.2195350619545362, 0.06607619557390841, -0.894115417456661, -1.1181609719810173, -0.9066118068884264, -0.879387815459479))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 52, 61, 82, 83), values -> List(1.2033093727956021, -0.5362511721573848, -0.3576192620548446, -1.7428827758484247, -0.29681411056900553, -0.42747036895395746, 0.48469834112683285, 1.1544701274613054, -1.1994286156581961, -0.652229636289246, 
1.2195350619545362, 0.06607619557390841, -0.894115417456661, -1.1181609719810173, -0.9066118068884264, -0.879387815459479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.1287970707288086, -1.1287970707288086))","Map(vectorType -> dense, length -> 2, values -> List(0.7556168344439501, 0.24438316555604989))",0.0
8445852,1,1,UA,14747,34,1721.0,0.0,1,7.0,13930,2018-01-01,N69839,607,-23.0,0.0,0.0,large_airport,112.8,30.25,35.0,0.0,82.0,BKN:07 150,30.29,9.94,20.0,10.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 19, indices -> List(4), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 372, indices -> List(10), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(46, 258), values -> List(1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.218978102189781,0.1,0.08,1874.0,6.0,0,1,0.0284575981787137,2018,5.556441886757492,"Map(vectorType -> dense, length -> 16, values -> List(1721.0, 112.80000305175781, 30.25, 35.0, 0.0, 82.0, 30.290000915527344, 9.9399995803833, 20.0, 10.0, 1874.0, 5.556441886757492, 0.028457598178713718, 0.21897810218978103, 0.1, 0.08))","Map(vectorType -> dense, length -> 16, values -> List(1.5052619970075445, -0.3043431515445256, 0.31758884812303817, -1.1827360367059438, -0.29681411056900553, -0.1423059156526813, 0.5420233037935336, 1.1369205965954012, -1.1994286156581961, 0.45538077244577413, 1.78145899716012, 0.10468027793420205, 0.4196391040543875, 0.029584569273892806, -0.44652736174892826, -0.5501195660967003))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 52, 72, 82, 83), values -> List(1.5052619970075445, -0.3043431515445256, 0.31758884812303817, -1.1827360367059438, -0.29681411056900553, -0.1423059156526813, 0.5420233037935336, 1.1369205965954012, -1.1994286156581961, 0.45538077244577413, 
1.78145899716012, 0.10468027793420205, 0.4196391040543875, 0.029584569273892806, -0.44652736174892826, -0.5501195660967003, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.6649745007876522, -0.6649745007876522))","Map(vectorType -> dense, length -> 2, values -> List(0.6603769541588272, 0.33962304584117275))",0.0
8445853,1,1,AA,14107,35,1773.0,0.0,1,-2.0,11057,2018-01-01,N519UW,557,-25.0,0.0,0.0,large_airport,337.4,30.1,52.0,0.0,47.0,FEW:02 160 SCT:04 210 BKN:07 250,30.09,10.0,110.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 19, indices -> List(2), values -> List(1.0))",6.0,"Map(vectorType -> sparse, length -> 372, indices -> List(6), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(55, 87, 106, 258, 261, 265), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.2616279069767442,0.2564102564102564,0.1666666666666666,1337.0,-4.0,0,1,0.0221969265793967,2018,5.517051209373546,"Map(vectorType -> dense, length -> 16, values -> List(1773.0, 337.3999938964844, 30.100000381469727, 52.0, 0.0, 47.0, 30.09000015258789, 10.0, 110.0, 0.0, 1337.0, 5.517051209373546, 0.0221969265793967, 0.2616279069767442, 0.2564102564102564, 0.16666666666666666))","Map(vectorType -> dense, length -> 16, values -> List(1.5929800777841983, 0.30556946218495984, 0.1641328142279446, -0.23048658016372667, -0.29681411056900553, -2.6374948820388475, 0.5256446984036314, 1.1544701274613054, -0.3970249501202278, -0.652229636289246, 0.8725639573848233, -0.09140539036760724, 0.11278010158179079, 0.2531280322508666, 0.27309189859746597, -0.19341229595369036))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 52, 60, 82, 83), values -> List(1.5929800777841983, 0.30556946218495984, 0.1641328142279446, -0.23048658016372667, 
-0.29681411056900553, -2.6374948820388475, 0.5256446984036314, 1.1544701274613054, -0.3970249501202278, -0.652229636289246, 0.8725639573848233, -0.09140539036760724, 0.11278010158179079, 0.2531280322508666, 0.27309189859746597, -0.19341229595369036, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.4241169612667326, -0.4241169612667326))","Map(vectorType -> dense, length -> 2, values -> List(0.6044679816139937, 0.39553201838600627))",0.0


In [0]:
# Score df_test with the fitted pipeline; the result keeps the input columns and
# adds the model's rawPrediction / probability / prediction columns.
predictions_2019 = baseline_pipeline.transform(df_test)

In [0]:
# different run time because I had to load the model back in to generate the test metrics for the presentation
# Evaluate the model on the test predictions.
# BinaryClassificationEvaluator's default metric is areaUnderROC.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(predictions_2019)
print(f"Test Area under ROC curve: {area_under_curve}")

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
prediction_and_label = predictions_2019.select(["prediction", target_feature])
metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

# Positive-class (label 1.0) precision and recall, fetched once and reused below.
test_precision = metrics.precision(1.0)
test_recall = metrics.recall(1.0)

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * test_precision) + test_recall
# If the model predicts no positives, precision and recall are both 0 and the
# score is undefined; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (test_precision * test_recall) / fbeta_denominator if fbeta_denominator else 0.0


# metrics
print("Test Metrics")
print("Accuracy:", metrics.accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F Beta:", fbeta)

Test Area under ROC curve: 0.7509129838119214
Test Metrics
Accuracy: 0.7509051303237437
Precision: 0.5231230008341489
Recall: 0.5701665843409288
F Beta: 0.5379172473102787


In [0]:
# Preview five rows of the scored frame (all input columns plus the model outputs).
display(predictions_2019.limit(5))

index,MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN_AIRPORT_ID,CRS_DEP_TIME,DISTANCE,DEP_DEL15,QUARTER,DEP_DELAY,DEST_AIRPORT_ID,FL_DATE,TAIL_NUM,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,type,ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed,HourlyPresentWeatherType,HOUR,MONTH_indexed,MONTH_vec,DAY_OF_WEEK_indexed,DAY_OF_WEEK_vec,OP_CARRIER_indexed,OP_CARRIER_vec,ORIGIN_AIRPORT_ID_indexed,ORIGIN_AIRPORT_ID_vec,HOUR_indexed,HOUR_vec,type_indexed,type_vec,HourlySkyConditions_vec,HourlyPresentWeatherType_vec,Rain,Snow,Thunder,Tornado,View Obstruction,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,HOUR_00,HOUR_01,HOUR_02,HOUR_03,HOUR_04,HOUR_05,HOUR_06,HOUR_07,HOUR_08,HOUR_09,HOUR_10,HOUR_11,HOUR_12,HOUR_13,HOUR_14,HOUR_15,HOUR_16,HOUR_17,HOUR_18,HOUR_19,HOUR_20,HOUR_21,HOUR_22,HOUR_23,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,OP_CARRIER_9E,OP_CARRIER_AA,OP_CARRIER_AS,OP_CARRIER_B6,OP_CARRIER_DL,OP_CARRIER_EV,OP_CARRIER_F9,OP_CARRIER_G4,OP_CARRIER_HA,OP_CARRIER_MQ,OP_CARRIER_NK,OP_CARRIER_OH,OP_CARRIER_OO,OP_CARRIER_QX,OP_CARRIER_UA,OP_CARRIER_US,OP_CARRIER_VX,OP_CARRIER_WN,OP_CARRIER_YV,OP_CARRIER_YX,type_large_airport,type_medium_airport,type_seaplane_base,type_small_airport,DEP_12hr,DEP_6hr,DEP_4hr,DISTANCE_LAG,REALIZED_DELAY_MIN,REALIZED_DEL15,HOLIDAY,OUTDEG_AIRPORT_6hr,YEAR,LOG_REALIZED_DELAY_MIN,numeric_features,num_feats_scaled,features,rawPrediction,probability,prediction
12053218,1,2,F9,11292,4,1506.0,1.0,1,21.0,15304,2019-01-01,N216FR,620,50.0,1.0,0.0,large_airport,1650.2,30.2,0.0,0.0,87.0,SCT:04 15 BKN:07 180,0.0,9.0,10.0,0.0,-SN:03 BR:1 |SN |,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",13.0,"Map(vectorType -> sparse, length -> 19, indices -> List(13), values -> List(1.0))",3.0,"Map(vectorType -> sparse, length -> 372, indices -> List(3), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(45, 67, 258, 265), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(1, 4), values -> List(1.0, 1.0))",0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.6231884057971014,0.6923076923076923,0.7,589.0,3.0,0,1,0.0337954939341421,2019,5.544786743165743,"Map(vectorType -> dense, length -> 16, values -> List(1506.0, 1650.199951171875, 30.200000762939453, 0.0, 0.0, 87.0, 0.0, 9.0, 10.0, 0.0, 589.0, 5.544786743165743, 0.033795493934142114, 0.6231884057971014, 0.6923076923076923, 0.7))","Map(vectorType -> dense, length -> 16, values -> List(1.1425814707194573, 3.8705440208853483, 0.26643748725778654, -3.1432496237046266, -0.29681411056900553, 0.2141496509739139, -1.9385070950099375, 0.8619799919253247, -1.2885845784957484, -0.652229636289246, -0.3934574388012513, 0.04666130562597214, 0.6812694005535929, 2.148200855812545, 2.278588197923483, 2.001709366464833))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 53, 65, 77, 80, 82, 83), values -> List(1.1425814707194573, 3.8705440208853483, 0.26643748725778654, 
-3.1432496237046266, -0.29681411056900553, 0.2141496509739139, -1.9385070950099375, 0.8619799919253247, -1.2885845784957484, -0.652229636289246, -0.3934574388012513, 0.04666130562597214, 0.6812694005535929, 2.148200855812545, 2.278588197923483, 2.001709366464833, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-1.091649789263326, 1.091649789263326))","Map(vectorType -> dense, length -> 2, values -> List(0.25130773963964126, 0.7486922603603587))",1.0
12053219,1,2,UA,14747,5,1874.0,0.0,1,-7.0,12266,2019-01-01,N889UA,607,-7.0,0.0,0.0,large_airport,112.8,30.43,35.0,0.0,82.0,SCT:04 140 BKN:07 220,30.46,6.84,130.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 19, indices -> List(4), values -> List(1.0))",10.0,"Map(vectorType -> sparse, length -> 372, indices -> List(10), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(39, 92, 258, 265), values -> List(1.0, 1.0, 1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.2702702702702703,0.2127659574468085,0.2105263157894736,1874.0,-5.0,0,1,0.0203023758099352,2019,5.513025439041145,"Map(vectorType -> dense, length -> 16, values -> List(1874.0, 112.80000305175781, 30.43000030517578, 35.0, 0.0, 82.0, 30.459999084472656, 6.840000152587891, 130.0, 0.0, 1874.0, 5.513025439041145, 0.020302375809935207, 0.2702702702702703, 0.2127659574468085, 0.21052631578947367))","Map(vectorType -> dense, length -> 16, values -> List(1.7633555808311603, -0.3043431515445256, 0.501736869316886, -1.1827360367059438, -0.29681411056900553, -0.1423059156526813, 0.5559449153176063, 0.23020134379805918, -0.21871302444512372, -0.652229636289246, 1.78145899716012, -0.11144556023735268, 0.01992106081784699, 0.29842586722549336, 0.07229126787646312, -0.012892422399535471))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 53, 72, 82, 83), values -> List(1.7633555808311603, -0.3043431515445256, 0.501736869316886, -1.1827360367059438, -0.29681411056900553, 
-0.1423059156526813, 0.5559449153176063, 0.23020134379805918, -0.21871302444512372, -0.652229636289246, 1.78145899716012, -0.11144556023735268, 0.01992106081784699, 0.29842586722549336, 0.07229126787646312, -0.012892422399535471, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.36910867905429157, -0.36910867905429157))","Map(vectorType -> dense, length -> 2, values -> List(0.5912435863241869, 0.40875641367581306))",0.0
12053220,1,2,F9,12889,5,1747.0,1.0,1,23.0,10397,2019-01-01,N328FR,704,7.0,0.0,0.0,large_airport,664.5,30.01,40.0,0.0,29.0,SCT:04 90,30.02,9.94,360.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",13.0,"Map(vectorType -> sparse, length -> 19, indices -> List(13), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 372, indices -> List(8), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(137, 174, 208, 245, 258, 261, 265), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))",1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0.3265306122448979,0.2884615384615384,0.2777777777777778,368.0,8.0,0,1,0.0224622030237581,2019,5.564137192142683,"Map(vectorType -> dense, length -> 16, values -> List(1747.0, 664.5, 30.010000228881836, 40.0, 0.0, 29.0, 30.020000457763672, 9.9399995803833, 360.0, 0.0, 368.0, 5.564137192142683, 0.0224622030237581, 0.32653061224489793, 0.28846153846153844, 0.2777777777777778))","Map(vectorType -> dense, length -> 16, values -> List(1.5491210373958715, 1.1938260419078348, 0.07205880363102068, -0.9026626671347036, -0.29681411056900553, -3.9207349218945904, 0.5199122333765527, 1.1369205965954012, 1.8318741208185734, -0.652229636289246, -0.767509214947137, 0.14298728861426405, 0.12578229532488647, 0.5933072092643994, 0.4205548617832025, 0.2639047170501688))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 27, 28, 50, 53, 56, 65, 69, 76, 82, 83), values -> 
List(1.5491210373958715, 1.1938260419078348, 0.07205880363102068, -0.9026626671347036, -0.29681411056900553, -3.9207349218945904, 0.5199122333765527, 1.1369205965954012, 1.8318741208185734, -0.652229636289246, -0.767509214947137, 0.14298728861426405, 0.12578229532488647, 0.5933072092643994, 0.4205548617832025, 0.2639047170501688, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))","Map(vectorType -> dense, length -> 2, values -> List(0.21093072628627252, -0.21093072628627252))","Map(vectorType -> dense, length -> 2, values -> List(0.5525380333494654, 0.4474619666505346))",0.0
12053221,1,2,F9,12889,5,1747.0,1.0,1,23.0,10397,2019-01-01,N328FR,704,7.0,0.0,0.0,large_airport,664.5,30.01,40.0,0.0,29.0,SCT:04 90,30.02,9.94,360.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",13.0,"Map(vectorType -> sparse, length -> 19, indices -> List(13), values -> List(1.0))",8.0,"Map(vectorType -> sparse, length -> 372, indices -> List(8), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(245, 265), values -> List(1.0, 1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.3265306122448979,0.2884615384615384,0.2777777777777778,1747.0,-120.0,0,1,0.0224622030237581,2019,4.8895969657192,"Map(vectorType -> dense, length -> 16, values -> List(1747.0, 664.5, 30.010000228881836, 40.0, 0.0, 29.0, 30.020000457763672, 9.9399995803833, 360.0, 0.0, 1747.0, 4.8895969657192, 0.0224622030237581, 0.32653061224489793, 0.28846153846153844, 0.2777777777777778))","Map(vectorType -> dense, length -> 16, values -> List(1.5491210373958715, 1.1938260419078348, 0.07205880363102068, -0.9026626671347036, -0.29681411056900553, -3.9207349218945904, 0.5199122333765527, 1.1369205965954012, 1.8318741208185734, -0.652229636289246, 1.5665061665242492, -3.2148547156111107, 0.12578229532488647, 0.5933072092643994, 0.4205548617832025, 0.2639047170501688))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 53, 65, 82, 83), values -> List(1.5491210373958715, 1.1938260419078348, 0.07205880363102068, -0.9026626671347036, -0.29681411056900553, -3.9207349218945904, 0.5199122333765527, 
1.1369205965954012, 1.8318741208185734, -0.652229636289246, 1.5665061665242492, -3.2148547156111107, 0.12578229532488647, 0.5933072092643994, 0.4205548617832025, 0.2639047170501688, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.21093072628627252, -0.21093072628627252))","Map(vectorType -> dense, length -> 2, values -> List(0.5525380333494654, 0.4474619666505346))",0.0
12053222,1,2,AA,12892,15,2342.0,0.0,1,3.0,13303,2019-01-01,N990AU,807,-3.0,0.0,0.0,large_airport,29.6,29.95,54.0,0.0,17.0,57,29.95,9.94,110.0,0.0,,0,6.0,"Map(vectorType -> sparse, length -> 11, indices -> List(6), values -> List(1.0))",5.0,"Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))",2.0,"Map(vectorType -> sparse, length -> 19, indices -> List(2), values -> List(1.0))",4.0,"Map(vectorType -> sparse, length -> 372, indices -> List(4), values -> List(1.0))",19.0,"Map(vectorType -> sparse, length -> 23, indices -> List(19), values -> List(1.0))",0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 269, indices -> List(197), values -> List(1.0))",,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.2433460076045627,0.3255813953488372,0.2,1242.0,-3.0,0,1,0.0388261851015801,2019,5.521060837840907,"Map(vectorType -> dense, length -> 16, values -> List(2342.0, 29.600000381469727, 29.950000762939453, 54.0, 0.0, 17.0, 29.950000762939453, 9.9399995803833, 110.0, 0.0, 1242.0, 5.521060837840907, 0.03882618510158013, 0.24334600760456274, 0.32558139534883723, 0.2))","Map(vectorType -> dense, length -> 16, values -> List(2.5528183078210436, -0.5302769590219136, 0.010676780332850987, -0.1184572323352305, -0.29681411056900553, -4.776228281798419, 0.5141797683494741, 1.1369205965954012, -0.3970249501202278, -0.652229636289246, 0.7117724699012977, -0.07144557424155, 0.9278424476937971, 0.15730580947944547, 0.591337549379707, -0.056217192052532554))","Map(vectorType -> sparse, length -> 87, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 28, 53, 60, 82, 83), values -> List(2.5528183078210436, -0.5302769590219136, 0.010676780332850987, -0.1184572323352305, -0.29681411056900553, -4.776228281798419, 0.5141797683494741, 1.1369205965954012, -0.3970249501202278, 
-0.652229636289246, 0.7117724699012977, -0.07144557424155, 0.9278424476937971, 0.15730580947944547, 0.591337549379707, -0.056217192052532554, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.2824598360744759, -0.2824598360744759))","Map(vectorType -> dense, length -> 2, values -> List(0.5701491820711609, 0.42985081792883906))",0.0


In [0]:
# Stack the 2018 and 2019 predictions, keeping only the identifying columns,
# the label, and the model outputs. Both sides select the same columns in the
# same order, so the positional union lines up.
probability_cols = ["index", "YEAR", "DEP_DEL15", "probability", "prediction"]
probabilities_df = predictions_2018.select(*probability_cols).union(
    predictions_2019.select(*probability_cols)
)
    

In [0]:
# Split the two-element probability vector (position 3 of each row) into two
# plain float columns so the result can be stored without ML vector types.
probabilities_reformat = (
    probabilities_df.rdd
    .map(lambda row: (row[0], row[1], row[2], float(row[3][0]), float(row[3][1]), row[4]))
    .toDF(['index', 'YEAR', 'DEP_DEL15', 'probability_0', 'probability_1', 'prediction'])
)

In [0]:
# Spot-check the flattened probability columns on ten rows.
display(probabilities_reformat.limit(10))

index,YEAR,DEP_DEL15,probability_0,probability_1,prediction
8445849,2018,0.0,0.6768607963274049,0.3231392036725951,0.0
8445850,2018,0.0,0.5383478903929685,0.4616521096070315,0.0
8445851,2018,0.0,0.7556168344439501,0.2443831655560498,0.0
8445852,2018,0.0,0.6603769541588272,0.3396230458411727,0.0
8445853,2018,0.0,0.6044679816139937,0.3955320183860062,0.0
8445854,2018,1.0,0.2715386838756888,0.7284613161243112,1.0
8445855,2018,1.0,0.5934826195556799,0.40651738044432,0.0
8445856,2018,1.0,0.2921414592048035,0.7078585407951965,1.0
8445857,2018,0.0,0.717412687355561,0.282587312644439,0.0
8445858,2018,1.0,0.4601307881475564,0.5398692118524435,1.0


In [0]:
# Persist the flattened probabilities to the team storage blob as parquet,
# replacing any previous output at the same path.
(
    probabilities_reformat.write
    .mode("overwrite")
    .parquet(f"{data_BASE_DIR}/logbaseline_probabilities_2018_2019")
)


## Run baseline with lasso (L1) regularization strength (`regParam`) of 0.03 on the full training set, 2015-2017

In [0]:
# Feature construction: the numeric columns are assembled into one vector and
# passed through StandardScaler (withMean=True also centers them), then combined
# with the one-hot encoded variables into the final "features" vector.
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_features",
)
scaler = StandardScaler(
    inputCol="numeric_features",
    outputCol="num_feats_scaled",
    withMean=True,
)
final_assembler = VectorAssembler(
    inputCols=["num_feats_scaled", *categorical_cols],
    outputCol="features",
)

target_feature = "DEP_DEL15"

# elasticNetParam=1.0 selects a pure L1 (lasso) penalty; regParam is its strength.
lr = LogisticRegression(
    maxIter=10,
    featuresCol="features",
    labelCol=target_feature,
    regParam=0.03,
    elasticNetParam=1.0,
)

pipeline = Pipeline(stages=[assembler, scaler, final_assembler, lr])

# Fit all stages on the training frame; the result is the fitted pipeline model.
# (Apply it with baseline_pipeline.transform(df_train) to inspect transformed training rows.)
baseline_pipeline = pipeline.fit(df_train)


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# The logistic regression is the last stage of the pipeline, so the final
# fitted stage is the trained classifier.
baseline_model = baseline_pipeline.stages[-1]

In [0]:
# Training Summary & Metrics
# Positive-class (index 1) precision and recall from the model's training summary.
training_summary = baseline_model.summary
train_precision = training_summary.precisionByLabel[1]
train_recall = training_summary.recallByLabel[1]

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * train_precision) + train_recall
# With a strong L1 penalty the model can predict no positives at all, making
# precision and recall both 0; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (train_precision * train_recall) / fbeta_denominator if fbeta_denominator else 0.0

print(f"Precision by Label: \n{training_summary.precisionByLabel}")
print(f"Recall by Label: \n{training_summary.recallByLabel}")
print(f"F-Beta Score: \n{fbeta}")


Precision by Label: 
[0.6747955256194371, 0.7364132698820335]
Recall by Label: 
[0.8391166484691664, 0.5263990386973368]
F-Beta Score: 
0.6499744783195487


In [0]:
# Model coefficients

# Names in the same order the features were assembled (numeric first, then
# categorical). Assumes every listed column fills exactly one slot of the
# feature vector -- TODO confirm if vector-valued columns are ever added.
features = numeric_cols + categorical_cols

# Pull the fitted coefficient vector into a NumPy array and report its length.
coefficients = baseline_model.coefficients.toArray()
print(coefficients.shape[0])

87


In [0]:
# Show the raw coefficient array; entries that are exactly 0 were removed by the L1 penalty.
coefficients

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.17719112,
        0.42128768,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.16916375,
       -0.09613809,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [0]:
# Coefficient table indexed by feature name, with exp(coefficient) alongside
# (the multiplicative change in the odds for a one-unit increase in the feature).
baseline_coeff_df = pd.DataFrame(
    {'Coefficients': coefficients},
    index=pd.Index(features, name='Features'),
).assign(Exp_Coefficients=lambda frame: np.exp(frame['Coefficients']))

In [0]:
# Features kept by the lasso penalty, strongest odds multiplier first.
# Test the coefficient itself against zero: comparing exp(coefficient) to 1
# would silently drop tiny non-zero coefficients whose exponential rounds to 1.0.
baseline_coeff_df[baseline_coeff_df['Coefficients'] != 0].sort_values(by='Exp_Coefficients', ascending=False)

Unnamed: 0_level_0,Coefficients,Exp_Coefficients
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
REALIZED_DEL15,0.747386,2.111473
Thunder,0.463943,1.590333
DEP_4hr,0.421288,1.523923
DEP_6hr,0.177191,1.193859
HOUR_07,-0.096138,0.908339
HOUR_06,-0.169164,0.844371


In [0]:
# Evaluate the model fitted in this section on the validation frame.
# Re-score df_val first so the metrics describe the current baseline_pipeline
# rather than predictions left over from an earlier model.
predictions_2018 = baseline_pipeline.transform(df_val)

# BinaryClassificationEvaluator's default metric is areaUnderROC.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(predictions_2018)
print(f"Validation Area under ROC curve: {area_under_curve}")

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
prediction_and_label = predictions_2018.select(["prediction", target_feature])
metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

# Positive-class (label 1.0) precision and recall, fetched once and reused below.
val_precision = metrics.precision(1.0)
val_recall = metrics.recall(1.0)

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * val_precision) + val_recall
# If the model predicts no positives, precision and recall are both 0 and the
# score is undefined; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (val_precision * val_recall) / fbeta_denominator if fbeta_denominator else 0.0


# metrics (predictions_2018 comes from df_val, so these are validation metrics)
print("Validation Metrics")
print("Accuracy:", metrics.accuracy)
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F Beta:", fbeta)

Test Area under ROC curve: 0.7181880459033703
Test Metrics
Accuracy: 0.7049190143841675
Precision: 0.7006311298250856
Recall: 0.42629755918136647
F Beta: 0.5768844188914335


In [0]:
# Evaluate the model fitted in this section on the test frame.
# Re-score df_test first so the metrics describe the current baseline_pipeline
# rather than predictions left over from an earlier model.
predictions_2019 = baseline_pipeline.transform(df_test)

# BinaryClassificationEvaluator's default metric is areaUnderROC.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(predictions_2019)
print(f"Test Area under ROC curve: {area_under_curve}")

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
prediction_and_label = predictions_2019.select(["prediction", target_feature])
metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

# Positive-class (label 1.0) precision and recall, fetched once and reused below.
test_precision = metrics.precision(1.0)
test_recall = metrics.recall(1.0)

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * test_precision) + test_recall
# If the model predicts no positives, precision and recall are both 0 and the
# score is undefined; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (test_precision * test_recall) / fbeta_denominator if fbeta_denominator else 0.0


# metrics
print("Test Metrics")
print("Accuracy:", metrics.accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F Beta:", fbeta)

Test Area under ROC curve: 0.7399542970867687
Test Metrics
Accuracy: 0.7522168181476752
Precision: 0.5274165494716483
Recall: 0.5329195570159425
F Beta: 0.5292382139522539


In [0]:
import mlflow

In [0]:

# MLflow run whose logged model is reloaded in the next cell.
# Replace the ID below with the run ID you identified in the previous step.
run_id1 = "a413d02d41ac4b8e9a8743c8e308e45b"
# "runs:/<run id>/model" addresses the "model" artifact of that run.
model_uri = f"runs:/{run_id1}/model"

In [0]:
# Reload the Spark model stored under the MLflow run referenced by model_uri.
model = mlflow.spark.load_model(model_uri)

2024/08/08 05:53:46 INFO mlflow.spark: 'runs:/a413d02d41ac4b8e9a8743c8e308e45b/model' resolved as 'dbfs:/databricks/mlflow-tracking/3133664007186385/a413d02d41ac4b8e9a8743c8e308e45b/artifacts/model'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/39 [00:00<?, ?it/s]

2024/08/08 05:53:56 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [0]:
# Name of the label column passed to the model and the evaluators.
target_feature = 'DEP_DEL15'

In [0]:
# Score df_val with the model reloaded from MLflow.
predictions_2018 = model.transform(df_val)

In [0]:
# Score df_test with the model reloaded from MLflow.
predictions_2019 = model.transform(df_test)

In [0]:
# Stack the 2018 and 2019 predictions from the reloaded model, keeping only the
# identifying columns, the label, and the model outputs. Both sides select the
# same columns in the same order, so the positional union lines up.
probability_cols = ["index", "YEAR", "DEP_DEL15", "probability", "prediction"]
probabilities_df = predictions_2018.select(*probability_cols).union(
    predictions_2019.select(*probability_cols)
)
    

In [0]:
# Split the two-element probability vector (position 3 of each row) into two
# plain float columns so the result can be stored without ML vector types.
probabilities_reformat = (
    probabilities_df.rdd
    .map(lambda row: (row[0], row[1], row[2], float(row[3][0]), float(row[3][1]), row[4]))
    .toDF(['index', 'YEAR', 'DEP_DEL15', 'probability_0', 'probability_1', 'prediction'])
)

In [0]:
# Spot-check the flattened probability columns on ten rows.
display(probabilities_reformat.limit(10))

index,YEAR,DEP_DEL15,probability_0,probability_1,prediction
8445849,2018,0.0,0.6436532512780021,0.3563467487219979,0.0
8445850,2018,0.0,0.5464634035487549,0.4535365964512451,0.0
8445851,2018,0.0,0.7042498315862907,0.2957501684137092,0.0
8445852,2018,0.0,0.6564194840724241,0.3435805159275759,0.0
8445853,2018,0.0,0.5913603964498242,0.4086396035501758,0.0
8445854,2018,1.0,0.3238118934815063,0.6761881065184936,1.0
8445855,2018,1.0,0.5740163894265249,0.4259836105734751,0.0
8445856,2018,1.0,0.3589600468656955,0.6410399531343045,1.0
8445857,2018,0.0,0.6869922835651096,0.3130077164348904,0.0
8445858,2018,1.0,0.4772507925723416,0.5227492074276583,1.0


In [0]:
# Persist the flattened probabilities to the team storage blob as parquet,
# replacing any previous output at the same path.
(
    probabilities_reformat.write
    .mode("overwrite")
    .parquet(f"{data_BASE_DIR}/logbaseline_0.03_probabilities_2018_2019v2")
)


## Run with lasso (L1) regularization strength (`regParam`) of 0.05 for test metrics

In [0]:
# Feature construction: the numeric columns are assembled into one vector and
# passed through StandardScaler (withMean=True also centers them), then combined
# with the one-hot encoded variables into the final "features" vector.
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_features",
)
scaler = StandardScaler(
    inputCol="numeric_features",
    outputCol="num_feats_scaled",
    withMean=True,
)
final_assembler = VectorAssembler(
    inputCols=["num_feats_scaled", *categorical_cols],
    outputCol="features",
)

target_feature = "DEP_DEL15"

# elasticNetParam=1.0 selects a pure L1 (lasso) penalty; regParam is its strength.
lr = LogisticRegression(
    maxIter=10,
    featuresCol="features",
    labelCol=target_feature,
    regParam=0.05,
    elasticNetParam=1.0,
)

pipeline = Pipeline(stages=[assembler, scaler, final_assembler, lr])

# Fit all stages on the training frame; the result is the fitted pipeline model.
# (Apply it with baseline_pipeline.transform(df_train) to inspect transformed training rows.)
baseline_pipeline = pipeline.fit(df_train)


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# The logistic regression is the last stage of the pipeline, so the final
# fitted stage is the trained classifier.
baseline_model = baseline_pipeline.stages[-1]

In [0]:
# Training Summary & Metrics
# Positive-class (index 1) precision and recall from the model's training summary.
training_summary = baseline_model.summary
train_precision = training_summary.precisionByLabel[1]
train_recall = training_summary.recallByLabel[1]

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * train_precision) + train_recall
# With a strong L1 penalty the model can predict no positives at all, making
# precision and recall both 0; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (train_precision * train_recall) / fbeta_denominator if fbeta_denominator else 0.0

print(f"Precision by Label: \n{training_summary.precisionByLabel}")
print(f"Recall by Label: \n{training_summary.recallByLabel}")
print(f"F-Beta Score: \n{fbeta}")


Precision by Label: 
[0.6893892224711167, 0.7529458122106284]
Recall by Label: 
[0.9377655814631798, 0.30982430276747774]
F-Beta Score: 
0.5098685560873885


In [0]:
# Model coefficients

# Names in the same order the features were assembled (numeric first, then
# categorical). Assumes every listed column fills exactly one slot of the
# feature vector -- TODO confirm if vector-valued columns are ever added.
features = numeric_cols + categorical_cols

# Pull the fitted coefficient vector into a NumPy array and report its length.
coefficients = baseline_model.coefficients.toArray()
print(coefficients.shape[0])

87


In [0]:
# Show the raw coefficient array; entries that are exactly 0 were removed by the L1 penalty.
coefficients

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.14009949,
       0.2743975 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [0]:
# Coefficient table indexed by feature name, with exp(coefficient) alongside
# (the multiplicative change in the odds for a one-unit increase in the feature).
baseline_coeff_df = pd.DataFrame(
    {'Coefficients': coefficients},
    index=pd.Index(features, name='Features'),
).assign(Exp_Coefficients=lambda frame: np.exp(frame['Coefficients']))

In [0]:
# Features kept by the lasso penalty, strongest odds multiplier first.
# Test the coefficient itself against zero: comparing exp(coefficient) to 1
# would silently drop tiny non-zero coefficients whose exponential rounds to 1.0.
baseline_coeff_df[baseline_coeff_df['Coefficients'] != 0].sort_values(by='Exp_Coefficients', ascending=False)

Unnamed: 0_level_0,Coefficients,Exp_Coefficients
Features,Unnamed: 1_level_1,Unnamed: 2_level_1
REALIZED_DEL15,0.746933,2.110518
DEP_4hr,0.274397,1.315738
DEP_6hr,0.140099,1.150388


In [0]:
# Score df_test with the pipeline fitted in this section (regParam=0.05).
predictions_2019 = baseline_pipeline.transform(df_test)

In [0]:
# Evaluate the model on the test predictions.
# BinaryClassificationEvaluator's default metric is areaUnderROC.
evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
area_under_curve = evaluator.evaluate(predictions_2019)
print(f"Test Area under ROC curve: {area_under_curve}")

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
prediction_and_label = predictions_2019.select(["prediction", target_feature])
metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

# Positive-class (label 1.0) precision and recall, fetched once and reused below.
test_precision = metrics.precision(1.0)
test_recall = metrics.recall(1.0)

# F-beta with beta = 1/sqrt(2); beta < 1 weights precision more heavily than recall.
beta = 1/(2**0.5)
fbeta_denominator = (beta**2 * test_precision) + test_recall
# If the model predicts no positives, precision and recall are both 0 and the
# score is undefined; report 0.0 instead of raising ZeroDivisionError.
fbeta = (1 + beta**2) * (test_precision * test_recall) / fbeta_denominator if fbeta_denominator else 0.0


# metrics
print("Test Metrics")
print("Accuracy:", metrics.accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F Beta:", fbeta)

Test Area under ROC curve: 0.7108467178417175
Test Metrics
Accuracy: 0.8075589137646134
Precision: 0.5438172184280455
Recall: 0.3225328435846514
F Beta: 0.4425976175385364
