# Phase 3

In [None]:
blob_container = "" # The name of your container created in https://portal.azure.com
storage_account = "" # The name of your Storage account created in https://portal.azure.com
secret_scope = "" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [None]:
from pyspark.sql.functions import col, when, to_timestamp, unix_timestamp, round, concat, lit, min, max

In [None]:
# Clean the joined data set
dataset = spark.read.parquet(f"{blob_url}/joined_df_full_dest")

# Keep flights that are cancelled but ALSO ARE characterized as being delayed.
dataset = dataset.filter(~((dataset.CANCELLED == 1) & (dataset.DEP_DEL15.isNull())))

# Features to drop primarily relate to diverted flights and features that are considered after the fact.
feature_to_drop = ('FIRST_DEP_TIME', 'TOTAL_ADD_GTIME', 'LONGEST_ADD_GTIME', 'DIV_AIRPORT_LANDINGS', 'DIV_REACHED_DEST','DIV_ACTUAL_ELAPSED_TIME',
                       'DIV_ARR_DELAY','DIV_DISTANCE', 'DIV1_AIRPORT', 'DIV1_AIRPORT_ID', 'DIV1_AIRPORT_SEQ_ID', 'DIV1_WHEELS_ON',
                       'DIV1_TOTAL_GTIME','DIV1_LONGEST_GTIME', 'DIV1_WHEELS_OFF','DIV1_TAIL_NUM', 'DIV2_AIRPORT', 'DIV2_AIRPORT_ID', 'DIV2_AIRPORT_SEQ_ID',
                       'DIV2_WHEELS_ON', 'DIV2_TOTAL_GTIME','DIV2_LONGEST_GTIME', 'DIV2_WHEELS_OFF','DIV2_TAIL_NUM', 'DIV3_AIRPORT', 'DIV3_AIRPORT_ID',
                       'DIV3_AIRPORT_SEQ_ID', 'DIV3_WHEELS_ON', 'DIV3_TOTAL_GTIME','DIV3_LONGEST_GTIME', 'DIV3_WHEELS_OFF','DIV3_TAIL_NUM', 'DIV4_AIRPORT',
                       'DIV4_AIRPORT_ID','DIV4_AIRPORT_SEQ_ID','DIV4_WHEELS_ON','DIV4_TOTAL_GTIME','DIV4_LONGEST_GTIME','DIV4_WHEELS_OFF',
                       'DIV4_TAIL_NUM','DIV5_AIRPORT','DIV5_AIRPORT_ID','DIV5_AIRPORT_SEQ_ID','DIV5_WHEELS_ON','DIV5_TOTAL_GTIME','DIV5_LONGEST_GTIME',
                       'DIV5_WHEELS_OFF','DIV5_TAIL_NUM', 'OP_CARRIER','OP_CARRIER_FL_NUM','ORIGIN_AIRPORT_SEQ_ID',
                       'ORIGIN_CITY_MARKET_ID','ORIGIN_CITY_NAME','ORIGIN_STATE_FIPS','ORIGIN_STATE_NM','ORIGIN_WAC','DEST_AIRPORT_SEQ_ID',
                       'DEST_CITY_MARKET_ID','DEST_CITY_NAME','DEST_STATE_FIPS','DEST_STATE_NM','DEST_WAC','TAXI_OUT','WHEELS_OFF','WHEELS_ON',
                       'TAXI_IN','CANCELLATION_CODE','CRS_ELAPSED_TIME','ACTUAL_ELAPSED_TIME','AIR_TIME','FLIGHTS')

dataset = dataset.drop(*feature_to_drop)

# Drop observations where values cannot be imputed or inferred (i.e scheduled departure arrival times, actual arrival and departure times)

dataset = dataset.na.drop(subset=["CRS_DEP_TIME"]).na.drop(subset=["CRS_ARR_TIME"]).na.drop(subset=["ARR_TIME"]).na.drop(subset=["DEP_TIME"]).na.drop(subset=["DEP_DELAY"]).fillna(0, subset=["CARRIER_DELAY","WEATHER_DELAY","NAS_DELAY","SECURITY_DELAY","LATE_AIRCRAFT_DELAY"]).filter(col('DIVERTED') == 0)

# Remaining observations are flights where the flight arrived on-time and happen to have NULL values for ARR_DELAY data
dataset = dataset.fillna(0, subset=["ARR_DELAY","ARR_DELAY_NEW","ARR_DEL15","ARR_DELAY_GROUP"])

dataset.write.mode("overwrite").parquet(f"{blob_url}/joined_df_full_dest")

In [None]:
from datetime import  datetime, timedelta
from pytz import timezone

# Add timestamp for arrival
dataset = spark.read.parquet(f"{blob_url}/joined_df_full_dest")

def get_hour_minutes(depature_time):
    depature_time = str(depature_time)
    hour = ''
    minutes = ''
    if len(depature_time) <= 2:
        hour = '00'
        minutes = depature_time
    elif len(depature_time) == 3:
        hour = depature_time[0]
        minutes = depature_time[1:]
    elif len(depature_time) == 4:
        hour = depature_time[:2]
        minutes = depature_time[2:]
    if int(hour) > 23:
        hour = '00'
    return hour, minutes
  
def get_timestamp(year, month, day, hour_minutes, tz, dep_time):
    hour, minutes = get_hour_minutes(hour_minutes)
    utc = timezone('UTC')
    tz = timezone(tz)
    timestamp = tz.localize(datetime(int(year), int(month), int(day), hour=int(hour), minute=int(minutes)))
    arr_timestamp = timestamp.astimezone(utc)
    dep_timestamp = utc.localize(dep_time)
    if arr_timestamp < dep_timestamp:
        return (arr_timestamp + timedelta(hours=24)).strftime("%Y%m%d%H%M")
    return arr_timestamp.strftime("%Y%m%d%H%M")

get_timestamp = udf(get_timestamp)

dataset = dataset.withColumn("ARRIVAL_FLIGHT_TIMESTAMP", get_timestamp("YEAR", "MONTH", "DAY_OF_MONTH", "CRS_ARR_TIME", "DEST_airport_tz", "ORIGIN_FLIGHT_TIMESTAMP")).withColumn("ARRIVAL_FLIGHT_TIMESTAMP", to_timestamp(col("ARRIVAL_FLIGHT_TIMESTAMP"), "yyyyMMddHHmm"))

dataset.write.mode("overwrite").parquet(f"{blob_url}/dataset_timestamps")


In [None]:
from pyspark.sql.functions import lag
from pyspark.sql.window import Window

dataset = spark.read.parquet(f"{blob_url}/dataset_timestamps")

# Grouped window to calculate n hour moving average
w = Window.partitionBy('TAIL_NUM').orderBy(col("ORIGIN_FLIGHT_TIMESTAMP"))
dataset_with_prev = dataset.withColumn('PREV_DEP_DELAY', lag("DEP_DELAY_NEW",1).over(w))\
    .withColumn('PREV_DEPARTURE_TIMESTAMP', lag("ORIGIN_FLIGHT_TIMESTAMP",1).over(w))\
    .withColumn('PREV_ARR_DELAY', lag("ARR_DELAY_NEW",1).over(w))\
    .withColumn('PREV_ARRIVAL_TIMESTAMP', lag("ARRIVAL_FLIGHT_TIMESTAMP",1).over(w))

def was_delayed(dep_ts, weather_ts, is_delayed):
    if is_delayed == None:
        return 0.0
    if dep_ts <= weather_ts:
        return is_delayed
    return 0.0

was_delayed = udf(was_delayed)

dataset_with_prev = dataset_with_prev.withColumn('PREV_DEP_DELAY', was_delayed('PREV_DEPARTURE_TIMESTAMP', 'WEATHER_TIMESTAMP', 'PREV_DEP_DELAY'))
dataset_with_prev = dataset_with_prev.withColumn('PREV_ARR_DELAY', was_delayed('PREV_ARRIVAL_TIMESTAMP', 'WEATHER_TIMESTAMP', 'PREV_ARR_DELAY'))

dataset_with_prev = dataset_with_prev.withColumn("time_between_flights", round((unix_timestamp("ORIGIN_FLIGHT_TIMESTAMP") - unix_timestamp("PREV_ARRIVAL_TIMESTAMP"))/60))

dataset_with_prev.write.mode("overwrite").parquet(f"{blob_url}/clean_full_dataset")


In [None]:
from pyspark.sql.functions import lag, avg, col, when
from pyspark.sql.window import Window

dataset = spark.read.parquet(f"{blob_url}/clean_full_dataset")

hours = lambda i: i * 3600

# Grouped window to calculate n hour moving average
w_12 = Window.partitionBy('ORIGIN').orderBy(col("ORIGIN_FLIGHT_TIMESTAMP").cast('long')).rangeBetween(-hours(14), -hours(2))
w_6 = Window.partitionBy('ORIGIN').orderBy(col("ORIGIN_FLIGHT_TIMESTAMP").cast('long')).rangeBetween(-hours(8), -hours(2))
w_3 = Window.partitionBy('ORIGIN').orderBy(col("ORIGIN_FLIGHT_TIMESTAMP").cast('long')).rangeBetween(-hours(5), -hours(2))
w_1 = Window.partitionBy('ORIGIN').orderBy(col("ORIGIN_FLIGHT_TIMESTAMP").cast('long')).rangeBetween(-hours(3), -hours(2))

dataset = dataset.withColumn('PREV_DEP_15', when(col('PREV_DEP_DELAY') >= 15, 1).otherwise(0))
dataset = dataset.withColumn('AVG_DELAY_ORIGIN_LAST_12', avg("DEP_DELAY_NEW").over(w_12)).na.fill(value=0,subset=["AVG_DELAY_ORIGIN_LAST_12"])
dataset = dataset.withColumn('PER_DELAY_15_ORIGIN_LAST_12', avg("DEP_DEL15").over(w_12)).na.fill(value=0,subset=["PER_DELAY_15_ORIGIN_LAST_12"])
dataset = dataset.withColumn('AVG_DELAY_ORIGIN_LAST_6', avg("DEP_DELAY_NEW").over(w_6)).na.fill(value=0,subset=["AVG_DELAY_ORIGIN_LAST_6"])
dataset = dataset.withColumn('PER_DELAY_15_ORIGIN_LAST_6', avg("DEP_DEL15").over(w_6)).na.fill(value=0,subset=["PER_DELAY_15_ORIGIN_LAST_6"])
dataset = dataset.withColumn('AVG_DELAY_ORIGIN_LAST_3', avg("DEP_DELAY_NEW").over(w_3)).na.fill(value=0,subset=["AVG_DELAY_ORIGIN_LAST_3"])
dataset = dataset.withColumn('PER_DELAY_15_ORIGIN_LAST_3', avg("DEP_DEL15").over(w_3)).na.fill(value=0,subset=["PER_DELAY_15_ORIGIN_LAST_3"])
dataset = dataset.withColumn('AVG_DELAY_ORIGIN_LAST_1', avg("DEP_DELAY_NEW").over(w_1)).na.fill(value=0,subset=["AVG_DELAY_ORIGIN_LAST_1"])
dataset = dataset.withColumn('PER_DELAY_15_ORIGIN_LAST_1', avg("DEP_DEL15").over(w_1)).na.fill(value=0,subset=["PER_DELAY_15_ORIGIN_LAST_1"])
dataset.write.mode("overwrite").parquet(f"{blob_url}/clean_full_dataset_ori_delay")

In [None]:
dataset = spark.read.parquet(f"{blob_url}/clean_full_dataset_ori_delay")
# Split test and train data set
dataset_2021 = dataset.where("YEAR == '2021'")
dataset_2021.write.mode("overwrite").parquet(f"{blob_url}/test_set")

dataset_2015_2020 = dataset.where("YEAR != '2021'")
dataset_2015_2020.write.mode("overwrite").parquet(f"{blob_url}/train_set")

In [None]:
dataset = spark.read.parquet(f"{blob_url}/train_set")
print('Correlation with 12 hour delay ratio in Origin Airport', dataset.stat.corr("DEP_DEL15", "PER_DELAY_15_ORIGIN_LAST_12"))
print('Correlation with 6 hour delay ratio in Origin Airport', dataset.stat.corr("DEP_DEL15", "PER_DELAY_15_ORIGIN_LAST_6"))
print('Correlation with 3 hour delay ratio in Origin Airport', dataset.stat.corr("DEP_DEL15", "PER_DELAY_15_ORIGIN_LAST_3"))
print('Correlation with 1 hour delay ratio in Origin Airport', dataset.stat.corr("DEP_DEL15", "PER_DELAY_15_ORIGIN_LAST_1"))

Correlation with 12 hour delay ratio in Origin Airport 0.12216519883229544
Correlation with 6 hour delay ratio in Origin Airport 0.21077215134158905
Correlation with 3 hour delay ratio in Origin Airport 0.257984686955184
Correlation with 1 hour delay ratio in Origin Airport 0.24283098426761895


In [None]:
train_set = spark.read.parquet(f"{blob_url}/train_set")

train_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-01-01'")
dev_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-01-01'")

train_set_1.write.mode("overwrite").parquet(f"{blob_url}/train_set_1")
dev_set_1.write.mode("overwrite").parquet(f"{blob_url}/dev_set_1")

train_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-03-01'")
dev_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-05-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-03-01'")

train_set_2.write.mode("overwrite").parquet(f"{blob_url}/train_set_2")
dev_set_2.write.mode("overwrite").parquet(f"{blob_url}/dev_set_2")

train_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-07-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-05-01'")
dev_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-07-01'")

train_set_3.write.mode("overwrite").parquet(f"{blob_url}/train_set_3")
dev_set_3.write.mode("overwrite").parquet(f"{blob_url}/dev_set_3")

train_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-09-01'")
dev_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-11-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-09-01'")

train_set_4.write.mode("overwrite").parquet(f"{blob_url}/train_set_4")
dev_set_4.write.mode("overwrite").parquet(f"{blob_url}/dev_set_4")

train_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2020-10-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-11-01'")
dev_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP >= '2020-10-01'")

train_set_5.write.mode("overwrite").parquet(f"{blob_url}/train_set_5")
dev_set_5.write.mode("overwrite").parquet(f"{blob_url}/dev_set_5")

## Helper Function

In [None]:
def read_clean(parquet_string): 
    dataset = spark.read.parquet(f"{blob_url}/{parquet_string}")

    # Make sure the target variable is not null

    dataset = dataset.where("label is not NULL")

    dataset = dataset.withColumn("PREV_DEP_DELAY", col("PREV_DEP_DELAY").cast('int'))

    dataset= dataset.withColumnRenamed("DEP_DEL15", "label")

    for col_name in cont_feat:
        dataset = dataset.withColumn(col_name, col(col_name).cast('float'))

    dataset = dataset.na.drop(subset=["ORIGIN_HourlyStationPressure",
                                          "DEST_HourlyStationPressure",
                                          "ORIGIN_HourlyDryBulbTemperature",
                                          "DEST_HourlyDryBulbTemperature",
                                          "ORIGIN_HourlyVisibility",
                                          "DEST_HourlyVisibility"])\
                         .fillna(0, subset=["ORIGIN_HourlyPrecipitation",
                                            "ORIGIN_HourlyWindDirection",
                                            "ORIGIN_HourlyWindSpeed",
                                            "DEST_HourlyPrecipitation",
                                            "DEST_HourlyWindDirection",
                                            "DEST_HourlyWindSpeed"])       
        
    return dataset

## Join Joined Dataset with QRN (Javier To Include)

## Join Page Rank Features to Fully Joined Dataset

In [None]:
qrn_full_set = spark.read.parquet(f"{blob_url}/QRN_fullset_V3")

In [None]:
# TO JOIN FULL DATA SET WITH PAGE RANK FEATURES
page_rank = spark.read.parquet(f"{blob_url}/df_PageRank")
page_rank.cache().count()

qrn_full_set = spark.read.parquet(f"{blob_url}/QRN_fullset_V3")
qrn_full_set.cache().count()

joined_rank = qrn_full_set.withColumn("key_origin", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("ORIGIN")))
print(f'Dimensions of Full Set are: {joined_rank.count()}, {len(joined_rank.columns)}')
joined_rank = joined_rank.withColumn("key_dest", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("DEST")) )
joined_rank = joined_rank.join(page_rank, joined_rank.key_origin == page_rank.key)
joined_rank = joined_rank.drop("PageRank", "quarter","node", "key_origin","key").withColumnRenamed("PageRank_use", "PageRank_origin")
joined_rank = joined_rank.join(page_rank, joined_rank.key_dest == page_rank.key)
joined_rank = joined_rank.drop("PageRank", "quarter","node", "key_dest","key").withColumnRenamed("PageRank_use", "PageRank_dest")

print(f'Dimensions of Full Set are: {joined_rank.count()}, {len(joined_rank.columns)}')
joined_rank.write.mode("overwrite").parquet(f"{blob_url}/rank_set")

Dimensions of Full Set are: 41061381, 209
Dimensions of Full Set are: 41017808, 209


## RANK SPLIT

In [None]:
test_ = spark.read.parquet(f"{blob_url}/train_delay_set")

In [None]:
test_.columns

Out[11]: ['hour_stamp',
 'ORIGIN',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'OP_UNIQUE_CARRIER',
 'OP_CARRIER_AIRLINE_ID',
 'TAIL_NUM',
 'ORIGIN_AIRPORT_ID',
 'ORIGIN_STATE_ABR',
 'DEST_AIRPORT_ID',
 'DEST',
 'DEST_STATE_ABR',
 'CRS_DEP_TIME',
 'DEP_TIME',
 'DEP_DELAY',
 'DEP_DELAY_NEW',
 'DEP_DEL15',
 'DEP_DELAY_GROUP',
 'DEP_TIME_BLK',
 'CRS_ARR_TIME',
 'ARR_TIME',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'ARR_DEL15',
 'ARR_DELAY_GROUP',
 'ARR_TIME_BLK',
 'CANCELLED',
 'DIVERTED',
 'DISTANCE',
 'DISTANCE_GROUP',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'YEAR',
 'ORIGIN_icao',
 'ORIGIN_iata',
 'ORIGIN_airport_name',
 'ORIGIN_airport_city',
 'ORIGIN_airport_subd',
 'ORIGIN_airport_country',
 'ORIGIN_elevation',
 'ORIGIN_airport_lat',
 'ORIGIN_airport_lon',
 'ORIGIN_airport_tz',
 'flight_id',
 'ORIGIN_FLIGHT_TIMESTAMP',
 'HOUR_WEATHER_TIMESTAMP',
 'WEATHER_TIMESTAMP',
 'TWO_HOUR_WEATHER_TIMESTAMP',
 'THREE_HOUR_WEATHER_TIMESTAM

In [None]:
full_rank_set = spark.read.parquet(f"{blob_url}/rank_set")

In [None]:
full_set_2021 = full_rank_set.where("YEAR == '2021'")
full_set_2021.write.mode("overwrite").parquet(f"{blob_url}/test_rank_set")

full_set_2015_2020 = full_rank_set.where("YEAR != '2021'")
full_set_2015_2020.write.mode("overwrite").parquet(f"{blob_url}/train_rank_set")

In [None]:
train_set = full_set_2015_2020

train_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-01-01'")
dev_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-01-01'")

train_set_1.write.mode("overwrite").parquet(f"{blob_url}/train_qrn_set1")
dev_set_1.write.mode("overwrite").parquet(f"{blob_url}/dev_qrn_set1")

train_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-03-01'")
dev_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-05-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-03-01'")

train_set_2.write.mode("overwrite").parquet(f"{blob_url}/train_qrn_set2")
dev_set_2.write.mode("overwrite").parquet(f"{blob_url}/dev_qrn_set2")

train_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-07-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-05-01'")
dev_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-07-01'")

train_set_3.write.mode("overwrite").parquet(f"{blob_url}/train_qrn_set3")
dev_set_3.write.mode("overwrite").parquet(f"{blob_url}/dev_qrn_set3")

train_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-09-01'")
dev_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-11-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-09-01'")

train_set_4.write.mode("overwrite").parquet(f"{blob_url}/train_qrn_set4")
dev_set_4.write.mode("overwrite").parquet(f"{blob_url}/dev_qrn_set4")

train_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2020-10-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-11-01'")
dev_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP >= '2020-10-01'")

train_set_5.write.mode("overwrite").parquet(f"{blob_url}/train_qrn_set5")
dev_set_5.write.mode("overwrite").parquet(f"{blob_url}/dev_qrn_set5")

## Join Delay States

In [None]:
full_rank_set = spark.read.parquet(f"{blob_url}/rank_set")
full_rank_set.cache().count()

Out[3]: 41017808

In [None]:
delay = spark.read.parquet(f"{blob_url}/df_delay_stateCV") #df_delay_state
delay.cache().count()

delay = delay.withColumnRenamed("hour_stamp", "delay_hour_stamp")

In [None]:
full_delay_set = full_rank_set.join(delay, full_rank_set.hour_stamp == delay.delay_hour_stamp)
full_delay_set = full_delay_set.drop("delay_hour_stamp").withColumnRenamed("prediction","delay_state")
full_delay_set.cache().count()

Out[8]: 35207356

In [None]:
full_delay_set_2021 = full_delay_set.where("YEAR == '2021'")
#full_delay_set_2021.write.mode("overwrite").parquet(f"{blob_url}/test_delay_set")

full_delay_set_2015_2020 = full_delay_set.where("YEAR != '2021'")
#full_delay_set_2015_2020.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set")

## Split Delay States

In [None]:
# train_set = spark.read.parquet(f"{blob_url}/train_delay_set")
# train_set.cache().count()

In [None]:
train_set = full_delay_set_2015_2020
train_set.cache().count()

train_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-01-01'")
dev_set_1 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2016-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-01-01'")

train_set_1.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set1")
dev_set_1.write.mode("overwrite").parquet(f"{blob_url}/dev_delay_set1")

train_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-03-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2016-03-01'")
dev_set_2 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2017-05-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-03-01'")

train_set_2.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set2")
dev_set_2.write.mode("overwrite").parquet(f"{blob_url}/dev_delay_set2")

train_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-07-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2017-05-01'")
dev_set_3 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2018-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-07-01'")

train_set_3.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set3")
dev_set_3.write.mode("overwrite").parquet(f"{blob_url}/dev_delay_set3")

train_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-09-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2018-09-01'")
dev_set_4 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2019-11-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-09-01'")

train_set_4.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set4")
dev_set_4.write.mode("overwrite").parquet(f"{blob_url}/dev_delay_set4")

train_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP < '2020-10-01' and ORIGIN_FLIGHT_TIMESTAMP >= '2019-11-01'")
dev_set_5 = train_set.where("ORIGIN_FLIGHT_TIMESTAMP >= '2020-10-01'")

train_set_5.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set5")
dev_set_5.write.mode("overwrite").parquet(f"{blob_url}/dev_delay_set5")

In [None]:
# TO JOIN FULL DATA SET WITH PAGE RANK FEATURES
# page_rank = spark.read.parquet(f"{blob_url}/df_PageRank")

# for i in range(1,6):
#     split_train = read_clean(f"train_set_{i}")
#     split_dev = read_clean(f"dev_set_{i}")
    
#     print(f'Dimensions of Train Set {i} are: {split_train.count()}, {len(split_train.columns)}')
#     split_train_rank = split_train.withColumn("key_origin", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("ORIGIN")))
#     split_train_rank = split_train_rank.withColumn("key_dest", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("DEST")) )
#     split_train_rank = split_train_rank.join(page_rank, split_train_rank.key_origin == page_rank.key)
#     split_train_rank = split_train_rank.drop("PageRank", "quarter","node", "key_origin","key").withColumnRenamed("PageRank_use", "PageRank_origin")
#     split_train_rank = split_train_rank.join(page_rank, split_train_rank.key_dest == page_rank.key)
#     split_train_rank = split_train_rank.drop("PageRank", "quarter","node", "key_dest","key").withColumnRenamed("PageRank_use", "PageRank_dest")
#     print(f'Dimensions of Train Set {i} are: {split_train_rank.count()}, {len(split_train_rank.columns)}')
    
#     split_train_rank.write.mode("overwrite").parquet(f"{blob_url}/train_rank_set_{i}")
    
#     print(f'Dimensions of Test Set {i} are: {split_dev.count()}, {len(split_dev.columns)}')
#     split_dev_rank = split_dev.withColumn("key_origin", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("ORIGIN")))
#     split_dev_rank = split_dev_rank.withColumn("key_dest", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("DEST")) )
#     split_dev_rank = split_dev_rank.join(page_rank, split_dev_rank.key_origin == page_rank.key)
#     split_dev_rank = split_dev_rank.drop("PageRank", "quarter","node", "key_origin","key").withColumnRenamed("PageRank_use", "PageRank_origin")
#     split_dev_rank = split_dev_rank.join(page_rank, split_dev_rank.key_dest == page_rank.key)
#     split_dev_rank = split_dev_rank.drop("PageRank", "quarter","node", "key_dest","key").withColumnRenamed("PageRank_use", "PageRank_dest")
#     print(f'Dimensions of Test Set {i} are: {split_dev_rank.count()}, {len(split_dev_rank.columns)}')
    
#     split_dev_rank.write.mode("overwrite").parquet(f"{blob_url}/dev_rank_set_{i}")

# full_train_df = read_clean("train_set")

# train_rank = full_train_df.withColumn("key_origin", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("ORIGIN")))
# print(f'Dimensions of Full Train Set are: {train_rank.count()}, {len(train_rank.columns)}')
# train_rank = train_rank.withColumn("key_dest", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("DEST")) )
# train_rank = train_rank.join(page_rank, train_rank.key_origin == page_rank.key)
# train_rank = train_rank.drop("PageRank", "quarter","node", "key_origin","key").withColumnRenamed("PageRank_use", "PageRank_origin")
# train_rank = train_rank.join(page_rank, train_rank.key_dest == page_rank.key)
# train_rank = train_rank.drop("PageRank", "quarter","node", "key_dest","key").withColumnRenamed("PageRank_use", "PageRank_dest")
# print(f'Dimensions of Full Train Set are: {train_rank.count()}, {len(train_rank.columns)}')
# train_rank.write.mode("overwrite").parquet(f"{blob_url}/train_rank_set")

# full_test_df = read_clean("test_set")

# test_rank = full_test_df.withColumn("key_origin", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("ORIGIN")))
# print(f'Dimensions of Full Test Set are: {test_rank.count()}, {len(test_rank.columns)}')
# test_rank = test_rank.withColumn("key_dest", concat(col("QUARTER"),lit('-'),col("YEAR"),lit('-'),col("DEST")) )
# test_rank = test_rank.join(page_rank, test_rank.key_origin == page_rank.key)
# test_rank = test_rank.drop("PageRank", "quarter","node", "key_origin","key").withColumnRenamed("PageRank_use", "PageRank_origin")
# test_rank = test_rank.join(page_rank, test_rank.key_dest == page_rank.key)
# test_rank = test_rank.drop("PageRank", "quarter","node", "key_dest","key").withColumnRenamed("PageRank_use", "PageRank_dest")
# print(f'Dimensions of Full Test Set are: {test_rank.count()}, {len(test_rank.columns)}')
# test_rank.write.mode("overwrite").parquet(f"{blob_url}/test_rank_set")

In [None]:
df_delay_cv
df_delay_full

In [None]:
delay_state_1, delay_state_2, delay_state_3
delay_state_full

In [None]:
delay_state_full.join(df_delay_full) --> feature: delay_state

In [None]:
delay_state_1.join(df_delay_cv) --> feature: delay_state

###Full train and test set delay

In [None]:
delay = spark.read.parquet(f"{blob_url}/df_delay_state_2015_2020") #df_delay_state
delay.cache().count()

delay = delay.withColumnRenamed("hour_stamp", "delay_hour_stamp")

full_delay_set = full_rank_set.join(delay, full_rank_set.hour_stamp == delay.delay_hour_stamp)
full_delay_set = full_delay_set.drop("delay_hour_stamp").withColumnRenamed("prediction","delay_state")
full_delay_set.cache().count()

full_delay_set_2021 = full_delay_set.where("YEAR == '2021'")


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-1215577238251273>[0m in [0;36m<cell line: 11>[0;34m()[0m
[1;32m      9[0m [0;34m[0m[0m
[1;32m     10[0m [0mfull_delay_set_2021[0m [0;34m=[0m [0mfull_delay_set[0m[0;34m.[0m[0mwhere[0m[0;34m([0m[0;34m"YEAR == '2021'"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 11[0;31m [0mfull_delay_set_2021[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mmode[0m[0;34m([0m[0;34m"overwrite"[0m[0;34m)[0m[0;34m.[0m[0mparquet[0m[0;34m([0m[0;34mf"{blob_url}/test_delay_set"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     12[0m [0;34m[0m[0m
[1;32m     13[0m [0mfull_delay_set_2015_2020[0m [0;34m=[0m [0mfull_delay_set[0m[0;34m.[0m[0mwhere[0m[0;34m([0m[0;34m"YEAR != '2021'"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspar

In [None]:
#full_delay_set_2021.count()
min_date, max_date = full_delay_set_2021.select(min("hour_stamp"), max("hour_stamp")).first()
min_date, max_date

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-1215577238251688>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m#full_delay_set_2021.count()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mmin_date[0m[0;34m,[0m [0mmax_date[0m [0;34m=[0m [0mfull_delay_set_2021[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0mmin[0m[0;34m([0m[0;34m"hour_stamp"[0m[0;34m)[0m[0;34m,[0m [0mmax[0m[0;34m([0m[0;34m"hour_stamp"[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mfirst[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0mmin_date[0m[0;34m,[0m [0mmax_date[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py[0m in [0;36mwrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m             [0mstart[0m [0;34m=[0m [0mtime[0m[0;34m.[

In [None]:
display(full_delay_set_2021)

hour_stamp,ORIGIN,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,TAIL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,DEST_STATE_ABR,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,YEAR,ORIGIN_icao,ORIGIN_iata,ORIGIN_airport_name,ORIGIN_airport_city,ORIGIN_airport_subd,ORIGIN_airport_country,ORIGIN_elevation,ORIGIN_airport_lat,ORIGIN_airport_lon,ORIGIN_airport_tz,flight_id,ORIGIN_FLIGHT_TIMESTAMP,HOUR_WEATHER_TIMESTAMP,WEATHER_TIMESTAMP,TWO_HOUR_WEATHER_TIMESTAMP,THREE_HOUR_WEATHER_TIMESTAMP,ORIGIN_STATION_ID,ORIGIN_STATION_NAME,ORIGIN_STATION_ELEVATION,ORIGIN_STATION_LATITUDE,ORIGIN_STATION_LONGITUDE,ORIGIN_WEATHER_DATE,ORIGIN_WEATHER_REPORT_TYPE,ORIGIN_WEATHER_SOURCE,ORIGIN_HourlyPrecipitation,ORIGIN_HourlyPresentWeatherType,ORIGIN_STATION_TIMESTAMP,HOUR_TIMESTAMP,ORIGIN_HourlyDewPointTemperature,ORIGIN_HourlyDryBulbTemperature,ORIGIN_HourlyWetBulbTemperature,ORIGIN_HourlyStationPressure,ORIGIN_HourlyWindDirection,ORIGIN_HourlyWindSpeed,ORIGIN_HourlyVisibility,ORIGIN_AU_code,ORIGIN_AW_code,ORIGIN_MW_code,ORIGIN_AU_TS,ORIGIN_AU_ICE,ORIGIN_AU_SNOW,ORIGIN_AU_FOG,ORIGIN_AW_TS,ORIGIN_AW_ICE,ORIGIN_AW_SNOW,ORIGIN_AW_FOG,ORIGIN_MW_TS,ORIGIN_MW_ICE,ORIGIN_MW_SNOW,ORIGIN_MW_FOG,ORIGIN_TS,ORIGIN_ICE,ORIGIN_SNOW,ORIGIN_FOG,DEST_icao,DEST_iata,DEST_airport_name,DEST_airport_city,DEST_airport_subd,DEST_airport_country,DEST_elevation,DEST_airport_lat,DEST_airport_lon,DEST_airport_tz,DEST_STATION_ID,DEST_STATION_NAME,DEST_STATION_ELEVATION,DEST_STATION_LATITUDE,DEST_STATION_LONGITUDE,DEST_WEATHER_DATE,DEST_WEATHER_REPORT_TYPE,DEST_WEATHER_SOURCE,DEST_HourlyPrecipitation,DEST_HourlyPresentWeatherType,DEST_STATION_TIMESTAMP,DEST_HourlyDewPointTemperature,DEST_HourlyDryBulbTemperature,DEST_HourlyWetBulbTemperature,DEST_HourlyStationPressure,DEST_HourlyWindDirection,DEST_HourlyWindSpeed,DEST_HourlyVisibility,DEST_AU_code,DEST_AW_code,DEST_MW_code,DEST_AU_TS,DEST_AU_ICE,DEST_AU_SNOW,DEST_AU_FOG,DEST_AW_TS,DEST_AW_ICE,DEST_AW_SNOW,DEST_AW_FOG,DEST_MW_TS,DEST_MW_ICE,DEST_MW_SNOW,DEST_MW_FOG,DEST_TS,DEST_ICE,DEST_SNOW,DEST_FOG,ARRIVAL_FLIGHT_TIMESTAMP,PREV_DEP_DELAY,PREV_DEPARTURE_TIMESTAMP,PREV_ARR_DELAY,PREV_ARRIVAL_TIMESTAMP,time_between_flights,PREV_DEP_15,AVG_DELAY_ORIGIN_LAST_12,PER_DELAY_15_ORIGIN_LAST_12,AVG_DELAY_ORIGIN_LAST_6,PER_DELAY_15_ORIGIN_LAST_6,AVG_DELAY_ORIGIN_LAST_3,PER_DELAY_15_ORIGIN_LAST_3,AVG_DELAY_ORIGIN_LAST_1,PER_DELAY_15_ORIGIN_LAST_1,ORIGIN_FLIGHT_TIMESTAMP_ACT,hour_stamp_act,Q,Qs,Q1h_sum,Qs1h_sum,R1h,Q2h_sum,Qs2h_sum,R2h,Q3h_sum,Qs3h_sum,R3h,Q4h_sum,Qs4h_sum,R4h,Q5h_sum,Qs5h_sum,R5h,Q6h_sum,Qs6h_sum,R6h,Q7h_sum,Qs7h_sum,R7h,Q8h_sum,Qs8h_sum,R8h,Q9h_sum,Qs9h_sum,R9h,Q10h_sum,Qs10h_sum,R10h,Q11h_sum,Qs11h_sum,R11h,Q12h_sum,Qs12h_sum,R12h,N,N1h_sum,N2h_sum,N3h_sum,N4h_sum,N5h_sum,N6h_sum,N7h_sum,N8h_sum,N9h_sum,N10h_sum,N11h_sum,N12h_sum,PageRank_origin,PageRank_dest,delay_state
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,DL,19790,N697DL,15304,FL,10397,ATL,GA,600,709,69.0,69.0,1.0,4,0600-0659,730,849,79.0,79.0,1.0,5,0700-0759,0.0,0.0,406.0,2,69.0,0.0,10.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,223338831361,2021-01-01T11:00:00.000+0000,2021010109,2021-01-01T09:00:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KATL,ATL,Hartsfield Jackson Atlanta International Airport,Atlanta,Georgia,US,1026.0,33.6366996765,-84.4281005859,America/New_York,72219013874,"ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPORT, GA US",308.2,33.62972,-84.44224,2021-01-01T03:52:00,FM-15,7,0.01,-RA:02 BR:1 |s RA s |RA s,2021-01-01T08:52:00.000+0000,57.0,58.0,57.0,28.97,120.0,9.0,0.25,-RA:02 BR:1,s RA s,RA s,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T12:30:00.000+0000,0.0,2021-01-01T01:45:00.000+0000,0.0,2021-01-01T03:05:00.000+0000,475.0,0,8.393939393939394,0.1515151515151515,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T12:09:00.000+0000,2021-01-01T12:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0608647377369676,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,NK,20416,N638NK,15304,FL,10397,ATL,GA,610,630,20.0,20.0,1.0,1,0600-0659,737,751,14.0,14.0,0.0,0,0700-0759,0.0,0.0,406.0,2,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,180388945850,2021-01-01T11:10:00.000+0000,2021010109,2021-01-01T09:10:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KATL,ATL,Hartsfield Jackson Atlanta International Airport,Atlanta,Georgia,US,1026.0,33.6366996765,-84.4281005859,America/New_York,72219013874,"ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPORT, GA US",308.2,33.62972,-84.44224,2021-01-01T03:52:00,FM-15,7,0.01,-RA:02 BR:1 |s RA s |RA s,2021-01-01T08:52:00.000+0000,57.0,58.0,57.0,28.97,120.0,9.0,0.25,-RA:02 BR:1,s RA s,RA s,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T12:37:00.000+0000,0.0,2021-01-01T06:10:00.000+0000,0.0,2021-01-01T10:25:00.000+0000,45.0,0,9.233333333333333,0.1666666666666666,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:30:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0608647377369676,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,AA,19805,N582UW,15304,FL,11057,CLT,NC,615,610,-5.0,0.0,0.0,-1,0600-0659,800,745,-15.0,0.0,0.0,-1,0800-0859,0.0,0.0,507.0,3,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,266288081116,2021-01-01T11:15:00.000+0000,2021010109,2021-01-01T09:15:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KCLT,CLT,Charlotte Douglas International Airport,Charlotte,North Carolina,US,748.0,35.2140007019,-80.9430999756,America/New_York,72314013881,"CHARLOTTE DOUGLAS AIRPORT, NC US",222.6,35.22254,-80.95433,2021-01-01T03:52:00,FM-15,7,0.0,FG:2 |FG |,2021-01-01T08:52:00.000+0000,53.0,54.0,53.0,29.41,30.0,6.0,0.25,FG:2,FG,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,2021-01-01T13:00:00.000+0000,0.0,2020-12-31T21:47:00.000+0000,0.0,2020-12-31T23:35:00.000+0000,700.0,0,9.551724137931034,0.1724137931034483,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:10:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0379795703388318,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,NK,20416,N620NK,15304,FL,11697,FLL,FL,630,627,-3.0,0.0,0.0,-1,0600-0659,735,738,3.0,3.0,0.0,0,0700-0759,0.0,0.0,197.0,1,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,94489286872,2021-01-01T11:30:00.000+0000,2021010109,2021-01-01T09:30:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KFLL,FLL,Fort Lauderdale Hollywood International Airport,Fort Lauderdale,Florida,US,9.0,26.072599411,-80.1527023315,America/New_York,74783012849,"FORT LAUDERDALE INTERNATIONAL AIRPORT, FL US",1.0,26.07875,-80.16223,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,71.0,75.0,72.0,30.12,130.0,16.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T12:35:00.000+0000,0.0,2021-01-01T02:20:00.000+0000,0.0,2021-01-01T04:59:00.000+0000,391.0,0,9.551724137931034,0.1724137931034483,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:27:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0142532671699507,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,AA,19805,N163AA,15304,FL,13303,MIA,FL,630,621,-9.0,0.0,0.0,-1,0600-0659,731,714,-17.0,0.0,0.0,-2,0700-0759,0.0,0.0,204.0,1,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,240518423745,2021-01-01T11:30:00.000+0000,2021010109,2021-01-01T09:30:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KMIA,MIA,Miami International Airport,Miami,Florida,US,8.0,25.7931995392,-80.2906036377,America/New_York,72202012839,"MIAMI INTERNATIONAL AIRPORT, FL US",1.4,25.78805,-80.31694,2021-01-01T04:27:00,FM-16,7,0.0,,2021-01-01T09:27:00.000+0000,69.0,75.0,71.0,30.11,130.0,11.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T12:31:00.000+0000,6.0,2021-01-01T01:11:00.000+0000,0.0,2021-01-01T02:21:00.000+0000,549.0,0,9.551724137931034,0.1724137931034483,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:21:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.01163132339679,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,AA,19805,N868NN,15304,FL,14100,PHL,PA,630,622,-8.0,0.0,0.0,-1,0600-0659,855,836,-19.0,0.0,0.0,-2,0800-0859,0.0,0.0,920.0,4,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,214748538450,2021-01-01T11:30:00.000+0000,2021010109,2021-01-01T09:30:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T08:53:00.000+0000,2021010108,66.0,71.0,68.0,30.1,130.0,8.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KPHL,PHL,Philadelphia International Airport,Philadelphia,Pennsylvania,US,36.0,39.8718986511,-75.2410964966,America/New_York,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2.1,39.87326,-75.22681,2021-01-01T03:54:00,FM-15,7,0.0,,2021-01-01T08:54:00.000+0000,21.0,30.0,27.0,30.45,360.0,6.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T13:55:00.000+0000,0.0,2020-12-31T20:45:00.000+0000,6.0,2020-12-31T23:07:00.000+0000,743.0,0,9.551724137931034,0.1724137931034483,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:22:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0143716560629373,0
2021-01-01T11:00:00.000+0000,TPA,1,1,5,2021-01-01 00:00:00,DL,19790,N874DN,15304,FL,12892,LAX,CA,655,655,0.0,0.0,0.0,0,0600-0659,920,921,1.0,1.0,0.0,0,0900-0959,0.0,0.0,2158.0,9,0.0,0.0,0.0,0.0,0.0,2021,KTPA,TPA,Tampa International Airport,Tampa,Florida,US,26.0,27.9755001068,-82.533203125,America/New_York,171799943403,2021-01-01T11:55:00.000+0000,2021010109,2021-01-01T09:55:00.000+0000,2021010108,2021010107,72211012842,"TAMPA INTERNATIONAL AIRPORT, FL US",1.8,27.96331,-82.54,2021-01-01T04:53:00,FM-15,7,0.0,,2021-01-01T09:53:00.000+0000,2021010109,66.0,71.0,68.0,30.1,130.0,9.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KLAX,LAX,Los Angeles International Airport,Los Angeles,California,US,125.0,33.94250107,-118.4079971,America/Los_Angeles,72295023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",29.7,33.93816,-118.3866,2021-01-01T01:53:00,FM-15,7,0.0,,2021-01-01T09:53:00.000+0000,33.0,49.0,42.0,29.65,100.0,3.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T17:20:00.000+0000,0.0,2020-12-31T19:00:00.000+0000,0.0,2020-12-31T23:23:00.000+0000,752.0,0,10.653846153846152,0.1923076923076923,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T11:55:00.000+0000,2021-01-01T11:00:00.000+0000,11,7,0,0,,0,0,,0,0,,0,0,,0,0,,1,0,,3,2,1.5,4,4,1.0,11,10,1.1,16,16,1.0,24,25,0.96,33,33,1.0,301,19,51,82,114,154,284,558,802,1132,1666,2188,2833,0.011367584658505,0.0259226989181112,0
2021-01-01T12:00:00.000+0000,ABR,1,1,5,2021-01-01 00:00:00,OO,20304,N446SW,10141,SD,13487,MSP,MN,610,615,5.0,5.0,0.0,0,0600-0659,727,721,-6.0,0.0,0.0,-1,0700-0759,0.0,0.0,257.0,2,0.0,0.0,0.0,0.0,0.0,2021,KABR,ABR,Aberdeen Regional Airport,Aberdeen,South Dakota,US,1302.0,45.4491004944,-98.4217987061,America/Chicago,85900030351,2021-01-01T12:10:00.000+0000,2021010110,2021-01-01T10:10:00.000+0000,2021010109,2021010108,72659014929,"ABERDEEN REGIONAL AIRPORT, SD US",395.6,45.44358,-98.41384,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T09:53:00.000+0000,2021010109,1.0,5.0,4.0,28.69,180.0,3.0,9.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KMSP,MSP,Minneapolis-St Paul International/Wold-Chamberlain Airport,Minneapolis,Minnesota,US,841.0,44.8819999695,-93.2218017578,America/Chicago,72658014922,"MINNEAPOLIS ST. PAUL INTERNATIONAL AIRPORT, MN US",254.5,44.88523,-93.23133,2021-01-01T03:53:00,FM-15,7,0.0,,2021-01-01T09:53:00.000+0000,12.0,15.0,14.0,29.3,120.0,9.0,7.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T13:27:00.000+0000,0.0,2020-12-31T21:27:00.000+0000,0.0,2020-12-31T22:45:00.000+0000,805.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T12:15:00.000+0000,2021-01-01T12:00:00.000+0000,1,1,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,404,18,37,69,100,132,172,302,576,820,1150,1684,2206,0.0002129666915478473,0.0239024711971578,0
2021-01-01T12:00:00.000+0000,DVL,1,1,5,2021-01-01 00:00:00,OO,20304,N910SW,11447,ND,12519,JMS,ND,630,622,-8.0,0.0,0.0,-1,0600-0659,727,719,-8.0,0.0,0.0,-1,0700-0759,0.0,0.0,83.0,1,0.0,0.0,0.0,0.0,0.0,2021,KDVL,DVL,Devils Lake Regional Airport,Devils Lake,North Dakota,US,1456.0,48.11420059,-98.90879822,America/Chicago,120259271509,2021-01-01T12:30:00.000+0000,2021010110,2021-01-01T10:30:00.000+0000,2021010109,2021010108,72757394928,"DEVILS LAKE MUNICIPAL AIRPORT, ND US",439.2,48.11667,-98.9,2021-01-01T04:08:00,FM-15,7,0.0,BR:1 ||,2021-01-01T10:08:00.000+0000,2021010110,27.0,28.0,28.0,28.48,300.0,5.0,5.0,BR:1,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KJMS,JMS,Jamestown Regional Airport,Jamestown,North Dakota,US,1500.0,46.92969894,-98.67819977,America/Chicago,72753514919,"JAMESTOWN MUNICIPAL AIRPORT, ND US",455.7,46.92586,-98.67037,2021-01-01T04:12:00,FM-16,7,0.0,,2021-01-01T10:12:00.000+0000,23.0,27.0,26.0,28.43,320.0,7.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T13:27:00.000+0000,0.0,2021-01-01T04:46:00.000+0000,0.0,2021-01-01T05:32:00.000+0000,418.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T12:22:00.000+0000,2021-01-01T12:00:00.000+0000,1,1,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,0,0,,404,18,37,69,100,132,172,302,576,820,1150,1684,2206,0.0002084750936330215,0.0002878991479299584,0
2021-01-01T13:00:00.000+0000,CLE,1,1,5,2021-01-01 00:00:00,WN,19393,N417WN,11042,OH,14635,RSW,FL,800,757,-3.0,0.0,0.0,-1,0800-0859,1055,1100,5.0,5.0,0.0,0,1000-1059,0.0,0.0,1025.0,5,0.0,0.0,0.0,0.0,0.0,2021,KCLE,CLE,Cleveland Hopkins International Airport,Cleveland,Ohio,US,791.0,41.4117012024,-81.8498001099,America/New_York,146029774887,2021-01-01T13:00:00.000+0000,2021010111,2021-01-01T11:00:00.000+0000,2021010110,2021010109,72524014820,"CLEVELAND HOPKINS INTERNATIONAL AIRPORT, OH US",236.8,41.40568,-81.85191,2021-01-01T05:51:00,FM-15,7,0.0,,2021-01-01T10:51:00.000+0000,2021010110,23.0,28.0,26.0,29.43,70.0,7.0,10.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,KRSW,RSW,Southwest Florida International Airport,Fort Myers,Florida,US,30.0,26.5361995697,-81.7552032471,America/New_York,72210812894,"FORT MYERS SW FLORIDA REGIONAL AIRPORT, FL US",8.2,26.53805,-81.75674,2021-01-01T05:53:00,FM-15,7,0.0,,2021-01-01T10:53:00.000+0000,66.0,68.0,67.0,30.1,100.0,6.0,7.0,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021-01-01T15:55:00.000+0000,6.0,2020-12-31T21:35:00.000+0000,0.0,2020-12-31T23:00:00.000+0000,840.0,0,1.1666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-01T12:57:00.000+0000,2021-01-01T12:00:00.000+0000,7,11,1,0,,1,0,,1,0,,1,0,,1,0,,1,0,,1,0,,1,0,,1,1,1.0,3,2,1.5,5,4,1.25,5,4,1.25,566,80,98,117,149,180,212,252,382,656,900,1230,1764,0.0054912438432515,0.0071780330946512,0


Output can only be rendered in Databricks

In [None]:
full_delay_set_2021.write.mode("overwrite").parquet(f"{blob_url}/test_delay_set")

full_delay_set_2015_2020 = full_delay_set.where("YEAR != '2021'")
full_delay_set_2015_2020.write.mode("overwrite").parquet(f"{blob_url}/train_delay_set")

In [None]:
REFRESH [TABLE] full_delay_set_2021

[0;36m  File [0;32m"<command-1215577238251882>"[0;36m, line [0;32m1[0m
[0;31m    REFRESH [TABLE] full_delay_set_2021[0m
[0m                    ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax
