In [0]:
# The following blob storage is accessible to team members only (read and write)
# access key is valid til TTL
# after that you will need to create a new SAS key and authenticate access again via DataBrick command line
blob_container  = "final-project-summer24-team3"       # The name of your container created in https://portal.azure.com
storage_account = "summer2024team3"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "summer24_team_3_2_scope"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "final-project-summer24-team3"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket

# the 261 course blob storage is mounted here.
mids261_mount_path      = "/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

# see what's in the blob storage root folder 
display(dbutils.fs.ls(f"{team_blob_url}"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/,TP/,0,1722323705000


In [0]:
from pyspark.sql.functions import col, isnan, count, when, split, concat, lit, min, row_number, lower, lpad, udf, first, countDistinct, coalesce, to_timestamp, monotonically_increasing_id, explode, array, concat_ws, unix_timestamp
from pyspark.sql.types import StringType, StructField, StructType, FloatType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime, timedelta

In [0]:
merged_3m = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_3m_cleaned2")
merged_1y = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_1y_cleaned2")
merged_all = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_all_cleaned2")

In [0]:
merged_all.withColumn("YEAR", col("FL_DATE").cast("string").substr(0, 4)).filter(col("YEAR").isin("2015", "2016", "2017","2018", "2019")).count()

31037110

In [0]:
df = merged_all

In [0]:
holidays = [# New Years
            "2015-01-01", "2016-01-01", "2017-01-01", "2018-01-01", "2019-01-01", "2020-01-01", "2021-01-01",
            # MLK Day 
            "2015-01-19", "2016-01-18", "2017-01-16", "2018-01-15", "2019-01-21", "2020-01-20", "2021-01-18",
            # President's Day
            "2015-02-16", "2016-02-15", "2017-02-20", "2018-02-19", "2019-02-18", "2020-02-17", "2021-02-15",
            # Memorial Day 
            # Juneteeth became holiday in 2021 (skip)
            # Independence Day 
            "2015-07-04", "2016-07-04", "2017-07-04", "2018-07-04", "2019-07-04", "2020-07-04", "2021-07-04",
            # Labor Day
            "2015-09-07", "2016-09-05", "2017-09-04", "2018-09-03", "2019-09-02", "2020-09-07", "2021-09-06",
            # Columbus Day 
            "2015-10-12", "2016-10-10", "2017-10-09", "2018-10-08", "2019-10-14", "2020-10-12", "2021-10-11", 
            # Veterans Day
            "2015-11-11", "2016-11-11", "2017-11-11", "2018-11-11", "2019-11-11", "2020-11-11", "2021-11-11",
            # Thanksgiving Day 
            "2015-11-26", "2016-11-24", "2017-11-23", "2018-11-22", "2019-11-28", "2020-11-26", "2021-11-25",
            # Christmas Day
            "2015-12-25", "2016-12-25", "2017-12-25", "2018-12-25", "2019-12-25", "2020-12-25", "2021-12-25"
            ]
# Convert to date 
holiday_dates = [datetime.strptime(date, "%Y-%m-%d") for date in holidays]
# Get all days within 3 days of a holiday date 
all_dates = []
for date in holiday_dates:
    start_date = date - timedelta(days=3)
    end_date = date + timedelta(days=3)
    current_date = start_date
    while current_date <= end_date:
        all_dates.append(current_date.strftime("%Y-%m-%d"))
        current_date += timedelta(days=1)


In [0]:
# Create binary indicator if flight date is scheduled within 3 days of a holiday 
df = df.withColumn("DATE", col("FL_DATE").cast("string").substr(0, 10)).withColumn("HOLIDAY", when(col("DATE").isin(all_dates), 1).otherwise(0)).drop("DATE")

In [0]:
# Removing "arrival features" -- see Leakage discussion
df = df.drop("ARR_12hr", "ARR_6hr", "ARR_4hr")

In [0]:
# generate timestamp for running window functions
def update_df_unixtime(df, timestring, datestring):
    """
    Inputs: dataframe, CRS_DEP_TIME, FL_DATE
    Outputs: dataframe with unix timestring
    """

    # Convert the military time to a time string
    df = df.withColumn("MOD_DEP_TIME", F.lpad(F.col(str(timestring)).cast("string"), 4, "0"))
    df = df.withColumn("MOD_DEP_TIME", F.concat_ws(":", df.MOD_DEP_TIME.substr(1, 2), df.MOD_DEP_TIME.substr(3, 2)))

    # Combine the date column with the time string to create a datetime string
    df = df.withColumn("MOD_DEP_DATE_TIME", F.concat_ws(" ", F.col(str(datestring)).cast("string").substr(0, 10), F.col("MOD_DEP_TIME")))

    # Convert the datetime string to a UNIX timestamp
    df = df.withColumn("DEP_TIMESTAMP_UNIX", F.unix_timestamp(F.col("MOD_DEP_DATE_TIME"), "yyyy-MM-dd HH:mm"))

    df = df.drop("MOD_DEP_TIME") \
           .drop("MOD_DEP_DATE_TIME")

    return df

df = update_df_unixtime(df, "CRS_DEP_TIME", "FL_DATE")

In [0]:
# Define a window specification based on timestamp

# Partition by origin airport & carrier
#window_spec_carr_12hr = Window.partitionBy("ORIGIN_AIRPORT_ID", "OP_CARRIER").orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(12 * 3600), Window.currentRow-(2 * 3600))
# window_spec_carr_6hr = Window.partitionBy("ORIGIN_AIRPORT_ID", "OP_CARRIER").orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(6 * 3600), Window.currentRow-(2 * 3600))

# Partition by origin airport
# window_spec_dep_12hr = Window.partitionBy("ORIGIN_AIRPORT_ID").orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(12 * 3600), Window.currentRow-(2 * 3600))
window_spec_dep_6hr = Window.partitionBy("ORIGIN_AIRPORT_ID").orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(6 * 3600), Window.currentRow-(2 * 3600))

# Timeframe only
# window_spec_12hr = Window.orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(12 * 3600), Window.currentRow-(2 * 3600))
window_spec_6hr = Window.orderBy("DEP_TIMESTAMP_UNIX").rangeBetween(Window.currentRow-(6 * 3600), Window.currentRow-(2 * 3600))

# Compute # of outgoing flights by carrier by airport within the window -- OUT-DEGREES CARRIER
# Compute # of outgoing flights by airport within the window -- OUT-DEGREES AIRPORT
# Compute total flights within window
# Compute # of delayed flights by carrier by airport within the window

df = df.withColumn("OUTDEG_AIRPORT_6hr", F.count("*").over(window_spec_dep_6hr)/F.count("*").over(window_spec_6hr))

df = df.na.fill({
        'OUTDEG_AIRPORT_6hr': 0,
})

# Drop unix timestamp column
df = df.drop('DEP_TIMESTAMP_UNIX')

In [0]:
df.write.parquet(f"{team_blob_url}/TP/flight_stations_weather_all_cleaned3")
display(dbutils.fs.ls(f"{team_blob_url}/TP"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000


In [0]:
df = spark.read.parquet(f"{team_blob_url}/TP/flight_stations_weather_all_cleaned3")

In [0]:
# Downsample data 

def undersample_majority(df, ratio=3):
    '''
    Majority class 0
    ratio is the ratio of majority to minority
    Eg. ratio 1 is equivalent to majority:minority = 1:1
    ratio 5 is equivalent to majority:minority = 5:1
    '''
    minority_count = df.filter(F.col('DEP_DEL15')==1).count()
    whole_count = df.count()
    undersampled_majority = df.filter(F.col('DEP_DEL15')==0)\
        .sample(withReplacement=False, fraction=(ratio*minority_count/whole_count),seed=88)
    undersampled_df = df.filter(F.col('DEP_DEL15')==1).union(undersampled_majority)
    
    return undersampled_df

df_train = df.withColumn("YEAR", col("FL_DATE").cast("string").substr(0, 4)).filter(col("YEAR").isin("2015", "2016", "2017", "2018"))
df_test = df.withColumn("YEAR", col("FL_DATE").cast("string").substr(0, 4)).filter(col("YEAR").isin("2019"))

# Undersample so that training set has a 2:1 ratio of not delayed to delayed 
# Only undersample training set to not affect test set 
df_train = undersample_majority(df_train, ratio = 2)

In [0]:
print("Train Count:", df_train.count())
print("Test Count:", df_test.count())

Train Count: 12053218
Test Count: 7270542


In [0]:
windowSpec = Window.orderBy("DATE_TIME")

df_train.union(df_test).withColumn("MOD_DEP_TIME", lpad(col("CRS_DEP_TIME").cast("string"), 4, "0"))\
.withColumn("MOD_DEP_TIME", concat_ws(":", merged_all.MOD_DEP_TIME.substr(1, 2), merged_all.MOD_DEP_TIME.substr(3, 2)))\
.withColumn("MOD_DEP_DATE_TIME", concat_ws(" ", col("FL_DATE").cast("string").substr(0, 10), col("MOD_DEP_TIME")))\
.withColumn("DATE_TIME", unix_timestamp(col("MOD_DEP_DATE_TIME"), "yyyy-MM-dd HH:mm"))\
.withColumn("index", row_number().over(windowSpec) - 1)\
.orderBy("index").drop("MOD_DEP_TIME", "MOD_DEP_DATE_TIME", "DATE_TIME")\
.write.mode("overwrite").parquet(f"{team_blob_url}/TP/flight_stations_weather_all_cleaned3")
display(dbutils.fs.ls(f"{team_blob_url}/TP"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000
