In [0]:
# The following blob storage is accessible to team members only (read and write)
# access key is valid til TTL
# after that you will need to create a new SAS key and authenticate access again via DataBrick command line
blob_container  = "final-project-summer24-team3"       # The name of your container created in https://portal.azure.com
storage_account = "summer2024team3"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "summer24_team_3_2_scope"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "final-project-summer24-team3"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket

# the 261 course blob storage is mounted here.
mids261_mount_path      = "/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

# see what's in the blob storage root folder 
display(dbutils.fs.ls(f"{team_blob_url}"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/,TP/,0,1722707375000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/df_clean_transformed_ian/,df_clean_transformed_ian/,0,1722606719000


In [0]:
from pyspark.sql.functions import col, isnan, count, when, split, concat, lit, min, row_number, lower, lpad, udf, first, countDistinct, coalesce, to_timestamp, monotonically_increasing_id, explode, array
from pyspark.sql.types import StringType, StructField, StructType, FloatType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime, timedelta
import re 

In [0]:
#Create UDF to subtract 3 hours from a date/time 
def subtract_hours(hour_str, year_str, month_str, day_str, num_hours = 2):
    '''
    Subtracts num_hours from a given date/time of the initial flight departure. If the return time is before 00:00, the date returned will be the previous day 
    
    Inputs:
        hour_str (str): hour of the flight departure time
        year_str (str): year of the date of the flight departure
        month_str (str): month of the date of the flight departure
        day_str (str): day of the date of the flight departure
        num_hours (int): number of hours to subtract
    Outputs:
        year (str): year of the date of departure num hours prior
        month (str): month of the date of departure num hours prior (2 digit format)
        day (str): day of the date of departure num hours prior (2 digit format)
        hour (str): hour of the departure num hours prior (24 hour format)

    ''' 
    date_str = f"{year_str}-{month_str}-{day_str}"
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    # if the hour of departure is before 2am, subtract a day 
    if int(hour_str) < num_hours:
        dt = dt - timedelta(days=1)
    new_hour = (int(hour_str) - num_hours) % 24
    return (f"{dt.year}", f"{dt.month:02d}", f"{dt.day:02d}", f"{new_hour:02d}")

# Create schema for the UDF 
schema = StructType([
        StructField("year", StringType()),
        StructField("month", StringType()),
        StructField("day", StringType()),
        StructField("hour", StringType())
])
subtract_hours_udf = udf(subtract_hours, schema)

def translate_weather_type(s): 
    if s is None:
        return None
    s = s.strip().split("|")
    # weather type separated by pipe, if there are 3 groups, first 2 groups are from a machine and last group is human entered. If human entered, use their input, otherwise combine machine entered 
    if len(s) < 3 or ((len(s) == 3) and (len(s[-1]) < 2)):
        conditions = " ".join(s)
    else:
        conditions = s[-1]
    # Keep running list of weather conditions and combine together at the end 
    translated_conditions = []
    if "TS" in conditions: #Thunderstorm
        translated_conditions.append("Thunder")
    if "FC" in conditions: #Funnel cloud, waterspout, tornado
        translated_conditions.append("Tornado")
    if "WIND" in conditions: #High winds
        translated_conditions.append("Wind")
    # Weather conditions groupings 
    if any(["PL" in conditions, # ice pellets
            "GR" in conditions, # hail
            "GL" in conditions, # Glaze or rime 
            "SN" in conditions, # snow
            "SG" in conditions, # snow grains 
            "IC" in conditions, # Ice crystals
            "GS" in conditions # Small hail 
            ]): 
        translated_conditions.append("Snow") 
    if any(["DU" in conditions, # Widespread dust
            "HZ" in conditions, # haze 
            "FU" in conditions, # Smoke
            "VA" in conditions, #volcanic ash
            "SA" in conditions, #sandstorm
            "PO" in conditions, # Well developed dust/sand whirls 
            "SS" in conditions, #sandstorm
            "DS" in conditions, #duststorm
            "FG" in conditions, # fog  
            "BR" in conditions # Mist
            ]): 
        translated_conditions.append("View Obstruction")
    if any(["PY" in conditions, #spray 
            "SH" in conditions, #showers
            "RA" in conditions, # rain
            "DZ" in conditions #drizzle
            ]):
        translated_conditions.append("Rain")
    translated_string = ",".join(translated_conditions)
    if len(translated_string) == 0:
        return None
    return translated_string    

translate_weather_type_udf = udf(translate_weather_type, StringType())

def clean_weather(s):
    s = str(s)
    #extract digits and decimals 
    val = re.findall("\d+\.?\d+", s)
    # if there's no digits, return 0  
    if len(val) == 0: 
        return 0
    else:
        #take first number / decimal
        return float(val[0])
    # if s is None: 
    #     return None
    # s = s.strip().lower()
    # if 't' in s or s == "*":
    #     return 0
    # if s.endswith('s') or s.endswith("v"):
    #     val = s.replace('s', '')
    #     val = val.replace('v', '')
    #     d = val.find('.')
    #     if d != -1:
    #         val = val[:d+1] + val[d:].replace('.','')
    #     val = float(val)
    #     return val/2
    # else:
    #     d = s.find('.')
    #     if d != -1:
    #         s = s[:d+1] + s[d:].replace('.','')
    #     val = float(s)
    #     return val
clean_weather_udf = udf(clean_weather, FloatType())

In [0]:
merged_3m = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_3m")
merged_1y = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_1y")
merged_all = spark.read.parquet("wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/flight_stations_weather_all")

weather_3m = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_3m/")
weather_1y = spark.read.parquet("dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data_1y/")
weather_all = spark.read.parquet("dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_weather_data/")

stations = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/stations_with_neighbors.parquet/")

In [0]:
n_null_3m = merged_3m.select(
    *[(count(when(col(c).isNull() | isnan(c), c))).alias(c)
    for c in merged_3m.columns]
)
n_null_1y = merged_1y.select(
    *[(count(when(col(c).isNull() | isnan(c), c))).alias(c)
    for c in merged_1y.columns]
)
n_null_all = merged_all.select(
    *[(count(when(col(c).isNull() | isnan(c), c))).alias(c)
    for c in merged_all.columns]
)


perc_null_3m = merged_3m.select(
    *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
    for c in merged_3m.columns]
)
perc_null_1y = merged_1y.select(
    *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
    for c in merged_1y.columns]
)
perc_null_all = merged_all.select(
    *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
    for c in merged_all.columns]
)


In [0]:
display(n_null_3m)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0,0,0,0,0,0,0,0,8066,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42072,42072,42072,42072,0,42875,42875,44057,44057,0,44057,46253,46253,46253,46253,0,0,1326493,0,2,46253,46253,0,0,0,1088449,1088449,1088449,1088449,1088449,1360895,1360895,1360895,0,1366648,1367501,1367501,1366648,1366319,1366319,1366319,1366319,1366319,1366319,1367453,1367453,1369640,1369640,1369640,1369640,1369640,1369640,1369686,1369686,1369695,1369695,1369695,1369695,1369695,1369695,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,0,0,0,0,291,291,291,291,291,291,291,273621,4795,4622,387357,1160501,732548,732548,4876,127600,185954,18662,4465,19184,12012,1225004,4894,1369696,1369696,1369697,1369697,1369697,1369697,1369696,1369697,1369696,1369697,1369697,1369697,1369697,1369697,1369697,1369696,1369696,1369696,1369697,1369696,1369696,1369696,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,1369697,4047,886848,886848,886848,849095,1023226,901974,1005721,1005721,827928,477696


In [0]:
display(perc_null_3m)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0058888936750244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0307162825062769,0.0307162825062769,0.0307162825062769,0.0307162825062769,0.0,0.0313025435552534,0.0313025435552534,0.0321655081379312,0.0321655081379312,0.0,0.0321655081379312,0.0337687824387437,0.0337687824387437,0.0337687824387437,0.0337687824387437,0.0,0.0,0.9684572573350164,0.0,1.4601769588456424e-06,0.0337687824387437,0.0337687824387437,0.0,0.0,0.0,0.7946640753392904,0.7946640753392904,0.7946640753392904,0.7946640753392904,0.7946640753392904,0.9935737612041204,0.9935737612041204,0.9935737612041204,0.0,0.9977739602262398,0.9983967256991876,0.9983967256991876,0.9977739602262398,0.9975337611165096,0.9975337611165096,0.9975337611165096,0.9975337611165096,0.9975337611165096,0.9975337611165096,0.9983616814521752,0.9983616814521752,0.9999583849566728,0.9999583849566728,0.9999583849566728,0.9999583849566728,0.9999583849566728,0.9999583849566728,0.9999919690267264,0.9999919690267264,0.9999985398230412,0.9999985398230412,0.9999985398230412,0.9999985398230412,0.9999985398230412,0.9999985398230412,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000212455747512041,0.000212455747512041,0.000212455747512041,0.000212455747512041,0.000212455747512041,0.000212455747512041,0.000212455747512041,0.1997675398281517,0.0035007742588324,0.0033744689518922,0.2828048831237857,0.8472684104586635,0.5348248554242289,0.5348248554242289,0.0035599114256656,0.093159289974352,0.1357628731025913,0.0136249112029886,0.0032598450606228,0.0140060173892474,0.0087698228148269,0.8943613076468737,0.0035730530182952,0.9999992699115204,0.9999992699115204,1.0,1.0,1.0,1.0,0.9999992699115204,1.0,0.9999992699115204,1.0,1.0,1.0,1.0,1.0,1.0,0.9999992699115204,0.9999992699115204,0.9999992699115204,1.0,0.9999992699115204,0.9999992699115204,0.9999992699115204,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0029546680762241,0.6474775077991701,0.6474775077991701,0.6474775077991701,0.6199144774355204,0.7470455144458957,0.6585208261389198,0.7342653156135992,0.7342653156135992,0.6044606945915776,0.348760346266364


In [0]:
display(n_null_1y)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,municipality,station_state,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0,0,0,0,0,0,0,0,17614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129512,129512,129512,129512,0,133346,133346,136974,136974,0,136973,152909,152909,152909,152909,0,0,7135752,0,135,152909,152909,0,0,0,5901073,5901073,5901073,5901073,5901073,7219665,7219666,7219666,0,7251407,7254099,7254098,7251413,7250471,7250471,7250471,7250471,7250471,7250471,7254009,7254009,7269884,7269883,7269883,7269884,7269884,7269884,7269971,7269971,7270031,7270031,7270031,7270031,7270031,7270031,7270033,7270033,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,0,0,0,0,0,0,1872,1872,1872,1872,1872,1872,1872,1464477,19070,17505,2158337,6440300,3810710,3810710,19231,629216,863411,126700,16719,129398,81556,6495690,22067,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,7270034,15644,4463996,4463996,4463996,4134171,5368891,4510012,5251873,5251873,4051273,1978753


In [0]:
display(perc_null_1y)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,municipality,station_state,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0024228222316429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0178144971536584,0.0178144971536584,0.0178144971536584,0.0178144971536584,0.0,0.0183418674520641,0.0183418674520641,0.0188409022571283,0.0188409022571283,0.0,0.0188407647061898,0.0210327764629436,0.0210327764629436,0.0210327764629436,0.0210327764629436,0.0,0.0,0.9815293848694516,0.0,1.8569376704428062e-05,0.0210327764629436,0.0210327764629436,0.0,0.0,0.0,0.8116981296098478,0.8116981296098478,0.8116981296098478,0.8116981296098478,0.8116981296098478,0.9930716967761086,0.9930718343270472,0.9930718343270472,0.0,0.9974378386676046,0.9978081257941848,0.9978079882432462,0.997438663973236,0.9973090909891206,0.9973090909891206,0.9973090909891206,0.9973090909891206,0.9973090909891206,0.9973090909891206,0.9977957462097152,0.9977957462097152,0.9999793673592172,0.9999792298082788,0.9999792298082788,0.9999793673592172,0.9999793673592172,0.9999793673592172,0.9999913342908712,0.9999913342908712,0.9999995873471844,0.9999995873471844,0.9999995873471844,0.9999995873471844,0.9999995873471844,0.9999995873471844,0.9999998624490616,0.9999998624490616,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0002574953569680692,0.0002574953569680692,0.0002574953569680692,0.0002574953569680692,0.0002574953569680692,0.0002574953569680692,0.0002574953569680692,0.20144018583682,0.0026230963981736,0.0024078291793408,0.2968812800600382,0.8858693095520599,0.5241667370468969,0.5241667370468969,0.0026452420992804,0.0865492513515067,0.1187629934055329,0.0174277039144521,0.0022997141416395,0.0177988163466635,0.0112181043444913,0.8934882560384174,0.0030353365610119,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0021518468826968,0.6140268394893339,0.6140268394893339,0.6140268394893339,0.5686591011816451,0.7384959960297297,0.6203563834777114,0.7224000603023315,0.7224000603023315,0.5572564034776178,0.2721793323112381


In [0]:
display(n_null_all)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,municipality,station_state,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0,0,0,0,0,0,0,0,237133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,847610,847610,847610,847610,0,858846,858841,877342,877342,0,877322,963933,963933,963933,963933,0,0,40620330,0,170,961335,961335,0,0,0,34358878,34358878,34358878,34358878,34358878,41225916,41225930,41225930,91,41385780,41399767,41399716,41385790,41381137,41381137,41381137,41381138,41381137,41381137,41399205,41399206,41482864,41482857,41482857,41482864,41482864,41482864,41483380,41483380,41483721,41483721,41483721,41483721,41483721,41483721,41483727,41483727,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,0,0,0,0,0,0,10431,10431,10431,10431,10431,10431,10431,8318268,237762,229241,12035476,36860703,21856927,21856927,239234,3548962,5046017,681104,227048,697645,572012,37119243,251504,41483714,41483714,41483717,41483715,41483717,41483717,41483716,41483717,41483716,41483715,41483715,41483715,41483715,41483715,41483717,41483716,41483714,41483721,41483722,41483714,41483714,41483718,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,41483728,5965124,28156042,22407930,25918027,24518316,29890066,25906037,29859682,29361194,24521002,14134224


In [0]:
display(perc_null_all)

QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,YEAR,iata_code,type,municipality,station_state,station_id,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,SOURCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Sunrise,Sunset,DailyAverageDewPointTemperature,DailyAverageDryBulbTemperature,DailyAverageRelativeHumidity,DailyAverageSeaLevelPressure,DailyAverageStationPressure,DailyAverageWetBulbTemperature,DailyAverageWindSpeed,DailyCoolingDegreeDays,DailyDepartureFromNormalAverageTemperature,DailyHeatingDegreeDays,DailyMaximumDryBulbTemperature,DailyMinimumDryBulbTemperature,DailyPeakWindDirection,DailyPeakWindSpeed,DailyPrecipitation,DailySnowDepth,DailySnowfall,DailySustainedWindDirection,DailySustainedWindSpeed,DailyWeather,MonthlyAverageRH,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,MonthlyDaysWithGT32Temp,MonthlyDaysWithGT90Temp,MonthlyDaysWithLT0Temp,MonthlyDaysWithLT32Temp,MonthlyDepartureFromNormalAverageTemperature,MonthlyDepartureFromNormalCoolingDegreeDays,MonthlyDepartureFromNormalHeatingDegreeDays,MonthlyDepartureFromNormalMaximumTemperature,MonthlyDepartureFromNormalMinimumTemperature,MonthlyDepartureFromNormalPrecipitation,MonthlyDewpointTemperature,MonthlyGreatestPrecip,MonthlyGreatestPrecipDate,MonthlyGreatestSnowDepth,MonthlyGreatestSnowDepthDate,MonthlyGreatestSnowfall,MonthlyGreatestSnowfallDate,MonthlyMaxSeaLevelPressureValue,MonthlyMaxSeaLevelPressureValueDate,MonthlyMaxSeaLevelPressureValueTime,MonthlyMaximumTemperature,MonthlyMeanTemperature,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb,AWND,CDSD,CLDD,DSNW,HDSD,HTDD,NormalsCoolingDegreeDay,NormalsHeatingDegreeDay,ShortDurationEndDate005,ShortDurationEndDate010,ShortDurationEndDate015,ShortDurationEndDate020,ShortDurationEndDate030,ShortDurationEndDate045,ShortDurationEndDate060,ShortDurationEndDate080,ShortDurationEndDate100,ShortDurationEndDate120,ShortDurationEndDate150,ShortDurationEndDate180,ShortDurationPrecipitationValue005,ShortDurationPrecipitationValue010,ShortDurationPrecipitationValue015,ShortDurationPrecipitationValue020,ShortDurationPrecipitationValue030,ShortDurationPrecipitationValue045,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,REM,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,BackupEquipment,BackupLatitude,BackupLongitude,BackupName,WindEquipmentChangeDate
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0057162895292342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02043234879951,0.02043234879951,0.02043234879951,0.02043234879951,0.0,0.020703201988018,0.0207030814588312,0.0211490635557151,0.0211490635557151,0.0,0.021148581438968,0.0232364121180237,0.0232364121180237,0.0232364121180237,0.0232364121180237,0.0,0.0,0.9791870682403472,0.0,4.097992350157151e-06,0.0231737851525783,0.0231737851525783,0.0,0.0,0.0,0.8282495247293107,0.8282495247293107,0.8282495247293107,0.8282495247293107,0.8282495247293107,0.9937852258601252,0.9937855633418482,0.9937855633418482,2.193631199201769e-06,0.9976388814428636,0.997976049789932,0.9979748203922272,0.9976391225012372,0.9975269580400296,0.9975269580400296,0.9975269580400296,0.9975269821458668,0.9975269580400296,0.9975269580400296,0.9979625023093392,0.9979625264151766,0.9999791725565262,0.9999790038156648,0.9999790038156648,0.9999791725565262,0.9999791725565262,0.9999791725565262,0.9999916111686008,0.9999916111686008,0.9999998312591384,0.9999998312591384,0.9999998312591384,0.9999998312591384,0.9999998312591384,0.9999998312591384,0.9999999758941628,0.9999999758941628,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000251447989438172,0.000251447989438172,0.000251447989438172,0.000251447989438172,0.000251447989438172,0.000251447989438172,0.000251447989438172,0.2005188154738648,0.0057314521009297,0.0055260462608375,0.2901252269323529,0.8885581112671456,0.5268795273173135,0.5268795273173135,0.0057669358935146,0.0855507007470495,0.1216384650868408,0.0164185822450672,0.0054731821595204,0.0168173169007375,0.0137888282364593,0.8947904344566139,0.0060627145178466,0.9999996625182772,0.9999996625182772,0.9999997348357892,0.9999996866241144,0.9999997348357892,0.9999997348357892,0.9999997107299518,0.9999997348357892,0.9999997107299518,0.9999996866241144,0.9999996866241144,0.9999996866241144,0.9999996866241144,0.9999996866241144,0.9999997348357892,0.9999997107299518,0.9999996625182772,0.9999998312591384,0.999999855364976,0.9999996625182772,0.9999996625182772,0.9999997589416264,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.1437943089396401,0.6787249689806085,0.5401619160168054,0.624775743395097,0.5910345376866805,0.7205250694923079,0.6244867144052242,0.7197926377301481,0.7077761670792944,0.591099285965813,0.3407173048671035


In [0]:
# First stage, filter to just relevant columns 
# Time Period 
    # Not necessary to know all time period cols, cols of interest are month and day of week 
# Airline 
    # Only need to keep track of 1 ID column for carrier, remove extraneous id columns 
    # Only need 1 column of the origin airport, could be useful to know destination airport, remove extraneous id columns
# Departure Performance
    # Keep departure time and outcome variable of whether there is a delay 
# Arrival Performance 
    # Don't need to know additional departure performances nor any arrival performances 
# Cancellations and Diversions 
    # Don't need to distinguish cancellations or diversions 
# Flight Summaries 
    # Distance between airport could be useful 
# Remove columns that have high NA percentage 
# Remove HourlyDewPointTemperature and HourlyWetBulbTemperature because highly correlated with HourlyDryBulbTemperature
# Choose HourlyDryBulbTemperature because it is the standard air temperature (most interpretable)
# Remove HourlyStationPressure because highly correlated with ELEVATION, choose ELEVATION because no NA values 

#Weather features (13 features)
weather_cols = ["ELEVATION", "HourlyAltimeterSetting", "HourlyDryBulbTemperature", "HourlyPrecipitation", "HourlyRelativeHumidity", "HourlySkyConditions", "HourlySeaLevelPressure", "HourlyVisibility", "HourlyWindDirection", "HourlyWindSpeed", "HourlyPresentWeatherType"]
weather_cols_to_clean = ["ELEVATION", "HourlyAltimeterSetting", "HourlyDryBulbTemperature", "HourlyPrecipitation", "HourlyRelativeHumidity", "HourlySeaLevelPressure", "HourlyVisibility", "HourlyWindDirection", "HourlyWindSpeed"]
                
cols_to_keep = [#Flight features (11 + 6 + 13 + 301 + 23 features once dummy coded)
                "MONTH", "DAY_OF_WEEK", "OP_CARRIER",  "ORIGIN_AIRPORT_ID", "CRS_DEP_TIME", "DISTANCE", "DEP_DEL15",
                # Additional features for EDA + data engineering
                "QUARTER", "DEP_DELAY", "DEST_AIRPORT_ID", "FL_DATE", "TAIL_NUM", "ARR_TIME", "ARR_DELAY", "ARR_DEL15", "CANCELLED",
                #Stations features (2 features once dummy coded)
                "type"
                ] + weather_cols 
cols_to_encode = ["MONTH", "DAY_OF_WEEK", "OP_CARRIER",  "ORIGIN_AIRPORT_ID", "HOUR", "type"]


In [0]:
merged_df = merged_all.distinct()
# Cancelled flights = delayed 
merged_df = merged_df.fillna({"DEP_DEL15": 1, "ARR_DEL15": 1})
merged_df = merged_df.withColumn("index", monotonically_increasing_id()).cache()
weather_df = weather_all

In [0]:
stations_w_missing_data = merged_df.withColumn("NA_COUNT", sum(
    F.when(F.col(c).isNull() | F.isnan(F.col(c)), 1).otherwise(0) for c in weather_cols
)).filter(col("NA_COUNT") >= 1).select("STATION").distinct().rdd.flatMap(lambda x: x).collect()

# Get the neighbor station with the smallest distance to station with missing data 
reduced_stations = stations.filter((col("station_id").isin(stations_w_missing_data)) & 
                                   (col("neighbor_id") != col("station_id"))) 
windowSpec = Window.partitionBy("station_id")
reduced_stations_join_w_row_num = reduced_stations.withColumn("row_number", row_number().over(windowSpec.orderBy(col("distance_to_neighbor"))))
min_distance_neighbor_station = reduced_stations_join_w_row_num.filter(col("row_number") == 1).drop("row_number")
min_distance_neighbor_station = min_distance_neighbor_station.filter(col("distance_to_neighbor") < 75)
merge_neighbor_station = min_distance_neighbor_station.select(["station_id", "neighbor_id"])

# merge in neighbor station id with main dataset 
merged_df_w_neighbor = merged_df.join(merge_neighbor_station, "station_id", "left").drop("STATION")
# Rename weather columns to use for coalescing later 
rename_orig = {weather_col: weather_col + "_orig" for weather_col in weather_cols}
merged_df_w_neighbor = merged_df_w_neighbor.withColumnsRenamed(rename_orig)

# Create weather date and time columns to merge into main dataset 
weather_merge = weather_df.filter(col("STATION").isin(stations_w_missing_data))\
                .withColumn("date_split", split(col("DATE"), "[- T:]+"))\
                .withColumn("weather_year", col("date_split")[0])\
                .withColumn("weather_month", col("date_split")[1])\
                .withColumn("weather_day", col("date_split")[2])\
                .withColumn("weather_hour", col("date_split")[3])
#There are multiple rows with the same station and date and time, take first element of each
group_by_cols = ["STATION", "weather_year", "weather_month", "weather_day", "weather_hour"]
first_expr = [first(col_name).alias(col_name) for col_name in weather_df.columns if col_name not in group_by_cols]
weather_merge = weather_merge.groupBy(group_by_cols).agg(*first_expr).select(["STATION", "weather_year", "weather_month", "weather_day", "weather_hour"] + weather_cols)
# Rename weather columns to use for coalescing later 
rename_3hr = {weather_col: weather_col + "_3hr" for weather_col in weather_cols}
rename_neighbor = {weather_col: weather_col + "_neighbor" for weather_col in weather_cols}
weather_merge_3hr = weather_merge.withColumnsRenamed(rename_3hr)
weather_merge_neighbor = weather_merge.withColumnsRenamed(rename_neighbor)

# Create date and time columns to merge in with weather data for the station 3 hour before flight and 
# weather data for the nearest neighbor station 2 hour before flight 
merged_df_3hr_neighbor = merged_df_w_neighbor.withColumn("dep_time", lpad(col("CRS_DEP_TIME").cast("string"), 4, "0"))\
        .withColumn("HOUR", col("dep_time").substr(0, 2))\
        .withColumn("adjusted_date_2h", subtract_hours_udf(hour_str = col("HOUR").cast("string"), 
                                                            year_str = col("YEAR").cast("string"), 
                                                            month_str = col("MONTH").cast("string"), 
                                                            day_str = col("DAY_OF_MONTH").cast("string"), 
                                                            num_hours = lit(2)))\
        .withColumn("adjusted_date_3h", subtract_hours_udf(hour_str = col("HOUR").cast("string"), 
                                                            year_str = col("YEAR").cast("string"), 
                                                            month_str = col("MONTH").cast("string"), 
                                                            day_str = col("DAY_OF_MONTH").cast("string"), 
                                                            num_hours = lit(3)))\
        .withColumn("merge_year_3h", col("adjusted_date_3h.year"))\
        .withColumn("merge_month_3h", col("adjusted_date_3h.month"))\
        .withColumn("merge_day_3h", col("adjusted_date_3h.day"))\
        .withColumn("merge_hour_3h", col("adjusted_date_3h.hour"))\
        .withColumn("merge_year_neighbor", col("adjusted_date_2h.year"))\
        .withColumn("merge_month_neighbor", col("adjusted_date_2h.month"))\
        .withColumn("merge_day_neighbor", col("adjusted_date_2h.day"))\
        .withColumn("merge_hour_neighbor", col("adjusted_date_2h.hour"))#.drop("dep_time")

# Merge in 3 hour weather data for the same station  
final_merged = merged_df_3hr_neighbor\
            .join(weather_merge_3hr, 
            (merged_df_3hr_neighbor.station_id == weather_merge_3hr.STATION) & 
            (merged_df_3hr_neighbor.merge_year_3h == weather_merge_3hr.weather_year) &
            (merged_df_3hr_neighbor.merge_month_3h == weather_merge_3hr.weather_month) &
            (merged_df_3hr_neighbor.merge_day_3h == weather_merge_3hr.weather_day) &
            (merged_df_3hr_neighbor.merge_hour_3h == weather_merge_3hr.weather_hour), 
            "left")\
        .drop("adjusted_date_3h", "merge_year_3h", "merge_month_3h", "merge_day_3h", "merge_hour_3h", 
            "STATION", "weather_year", "weather_month", "weather_day", "weather_hour")
# Merge in 2 hour weather data for the nearest neighbor station 
final_merged = final_merged\
            .join(weather_merge_neighbor,
            (final_merged.neighbor_id == weather_merge_neighbor.STATION) & 
            (final_merged.merge_year_neighbor == weather_merge_neighbor.weather_year) &
            (final_merged.merge_month_neighbor == weather_merge_neighbor.weather_month) &
            (final_merged.merge_day_neighbor == weather_merge_neighbor.weather_day) &
            (final_merged.merge_hour_neighbor == weather_merge_neighbor.weather_hour), 
            "left")\
        .drop("neighbor_id", "adjusted_date_2h", "merge_year_neighbor", "merge_month_neighbor", "merge_day_neighbor", "merge_hour_neighbor", 
            "STATION", "weather_year", "weather_month", "weather_day", "weather_hour")

# Coalesce original weather values from 2 hour before departure, then weather data from 3 hour before departure at the station, then weather data from 2 hour before departure at the closest neighbor station if available   
for weather_col in weather_cols: 
    final_merged = final_merged.withColumn(weather_col, coalesce(final_merged[f"{weather_col}_orig"], 
                                                                 final_merged[f"{weather_col}_3hr"],
                                                                 final_merged[f"{weather_col}_neighbor"]))

In [0]:
display(final_merged.select(
    *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
    for c in weather_cols]
))


ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed,HourlyPresentWeatherType
7.549948259230703e-05,0.0056069695568344,0.0024719571972895,0.0734363121848643,0.0025684046525423,0.0057305601849476,0.0515129209216683,0.0024557098629129,0.0025453353661946,0.0024600007019619,0.8496532423508321


In [0]:
# Clean up weather columns after coalescing
for weather_col in weather_cols_to_clean: 
    final_merged = final_merged.withColumn(weather_col, clean_weather_udf(col(weather_col)))

# Filter down on interested columns and one-hot encode categorical variables
final_merged = final_merged.select(cols_to_keep + ["index", "dep_time", "HOUR"])

# Create one hot encoded sparse vector for more complex models later 
stages = [] 
for col_name in cols_to_encode:
    string_indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_indexed")
    # OneHotEncoder for the column
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[col_name + "_vec"])
    
    stages += [string_indexer, encoder]

pipeline = Pipeline(stages=stages)

model = pipeline.fit(final_merged)

# Transforming the DataFrame
transformed_df = model.transform(final_merged).cache()

# One-hot encode the hourly sky conditions, contains multiple values separated by space
sky_cond_exploded = transformed_df.select("index", "HourlySkyConditions")\
                .withColumn("value", explode(split(col("HourlySkyConditions"), " ")))
sky_cond_unique = sky_cond_exploded.dropDuplicates(["index", "value"])
sky_cond_grouped = sky_cond_unique.groupBy("index").pivot("value").count()
sky_cond_grouped = sky_cond_grouped.fillna(0)
dummy_cols_sky_cond = sky_cond_grouped.drop("index", "*").columns
assembler_sky = VectorAssembler(inputCols=dummy_cols_sky_cond, outputCol="HourlySkyConditions_vec")
sky_cond_merge = assembler_sky.transform(sky_cond_grouped).select(["index", "HourlySkyConditions_vec"])

# Create individual one hot encoded columns to more easily pull out coefficients for logistic regression 
# One-hot encode weather conditions, contains multiple values separated by pipe and space
weather_cond_exploded = transformed_df.select("index", "HourlyPresentWeatherType")\
                .withColumn("HourlyPresentWeatherType", translate_weather_type_udf(col("HourlyPresentWeatherType")))\
                .withColumn("value", explode(split(col("HourlyPresentWeatherType"), ",")))
weather_cond_unique = weather_cond_exploded.dropDuplicates(["index", "value"])
weather_cond_grouped = weather_cond_unique.groupBy("index").pivot("value").count()
weather_cond_grouped = weather_cond_grouped.fillna(0)
dummy_cols_weather_cond = weather_cond_grouped.drop("index", "*").columns
assembler_weather = VectorAssembler(inputCols=dummy_cols_weather_cond, outputCol="HourlyPresentWeatherType_vec")
weather_cond_merge = assembler_weather.transform(weather_cond_grouped).select(["index", "HourlyPresentWeatherType_vec"] + dummy_cols_weather_cond)

# Add additional one-hot features for logistic regression 
month_exploded = transformed_df.select("index", "MONTH")\
            .withColumn("value", explode(array(col("MONTH"))))
month_grouped = month_exploded.groupBy("index").pivot("value").count()
month_grouped = month_grouped.fillna(0)
dummy_month = month_grouped.drop("index", "*").columns
month_grouped = month_grouped.withColumnsRenamed({i: f"MONTH_{i}" for i in dummy_month})
dummy_month = [f"MONTH_{i}" for i in dummy_month]
month_merge = month_grouped.select(["index"] + dummy_month)

hour_exploded = transformed_df.select("index", "HOUR")\
            .withColumn("value", explode(array(col("HOUR"))))
hour_grouped = hour_exploded.groupBy("index").pivot("value").count()
hour_grouped = hour_grouped.fillna(0)
dummy_hour = hour_grouped.drop("index", "*").columns
hour_grouped = hour_grouped.withColumnsRenamed({i: f"HOUR_{i}" for i in dummy_hour})
dummy_hour = [f"HOUR_{i}" for i in dummy_hour]
hour_merge = hour_grouped.select(["index"] + dummy_hour)

dow_exploded = transformed_df.select("index", "DAY_OF_WEEK")\
            .withColumn("value", explode(array(col("DAY_OF_WEEK"))))
dow_grouped = dow_exploded.groupBy("index").pivot("value").count()
dow_grouped = dow_grouped.fillna(0)
dummy_dow = dow_grouped.drop("index", "*").columns
dow_grouped = dow_grouped.withColumnsRenamed({i: f"DAY_OF_WEEK_{i}" for i in dummy_dow})
dummy_dow = [f"DAY_OF_WEEK_{i}" for i in dummy_dow]
dow_merge = dow_grouped.select(["index"] + dummy_dow)

carrier_exploded = transformed_df.select("index", "OP_CARRIER")\
            .withColumn("value", explode(array(col("OP_CARRIER"))))
carrier_grouped = carrier_exploded.groupBy("index").pivot("value").count()
carrier_grouped = carrier_grouped.fillna(0)
dummy_carrier = carrier_grouped.drop("index", "*").columns
carrier_grouped = carrier_grouped.withColumnsRenamed({i: f"OP_CARRIER_{i}" for i in dummy_carrier})
dummy_carrier = [f"OP_CARRIER_{i}" for i in dummy_carrier]
carrier_merge = carrier_grouped.select(["index"] + dummy_carrier)

type_exploded = transformed_df.select("index", "type")\
            .withColumn("value", explode(array(col("type"))))
type_grouped = type_exploded.groupBy("index").pivot("value").count()
type_grouped = type_grouped.fillna(0)
dummy_type = type_grouped.drop("index", "*").columns
type_grouped = type_grouped.withColumnsRenamed({i: f"type_{i}" for i in dummy_type})
dummy_type = [f"type_{i}" for i in dummy_type]
type_merge = type_grouped.select(["index"] + dummy_type)

# Join all dummy variables to transformed dataframe 
transformed_df = transformed_df.join(sky_cond_merge, "index", "left")\
                                .join(weather_cond_merge, "index", "left")\
                                .join(month_merge, "index", "left")\
                                .join(hour_merge, "index", "left")\
                                .join(dow_merge, "index", "left")\
                                .join(carrier_merge, "index", "left")\
                                .join(type_merge, "index", "left").cache()

# Fill in NA's with 0 for the weather conditions since dummy variable 
transformed_df = transformed_df.na.fill({i: 0 for i in dummy_cols_weather_cond + weather_cols_to_clean}) 

Downloading artifacts:   0%|          | 0/86 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
#Sort by expected departure date and time 
#Drop existing index column that uses monotonically increasing function and create index column for use in cross validation later

#formatted_crs_dep_time = F.format_string("%04d", col("CRS_DEP_TIME"))
#datetime_str = concat(col("FL_DATE"), lit(" "), formatted_crs_dep_time.substr(1, 2), lit(":"), formatted_crs_dep_time.substr(3, 2))
#datetime_col = to_timestamp(datetime_str, 'yyyy-MM-dd HH:mm')

windowSpec = Window.orderBy("DATE_TIME")


transformed_df.withColumn("DATE_TIME", to_timestamp(concat(col("FL_DATE").cast("string").substr(0, 10), lit(" "), col("dep_time")), "yyyy-MM-dd HH:mm"))\
    .withColumn("index", row_number().over(windowSpec) - 1)\
    .orderBy("index").drop("dep_time", "DATE_TIME")\
    .write.mode("overwrite").parquet(f"{team_blob_url}/TP/flight_stations_weather_all_cleaned")
display(dbutils.fs.ls(f"{team_blob_url}/TP"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000


In [0]:
# Count number of null values for each weather column after cleaning  
df_cleaned = spark.read.parquet(f"{team_blob_url}/TP/flight_stations_weather_1y_cleaned")
null_counts_final_df = df_cleaned.select([count(when(col(c).isNull(), c)).alias(c) for c in weather_cols])
display(null_counts_final_df)

ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed
0,229001,99319,3037869,103309,233897,2129948,98661,102350,98815


In [0]:
# Count number of null values for each weather column after cleaning  
#df_cleaned_3m = spark.read.parquet(f"{team_blob_url}/TP/flight_stations_weather_3m_cleaned")
#df_cleaned_1y = spark.read.parquet(f"{team_blob_url}/TP/flight_stations_weather_1y_cleaned")
df_cleaned_all = spark.read.parquet(f"{team_blob_url}/TP/flight_stations_weather_all_cleaned")

# perc_null_cleaned_3m = df_cleaned_3m.select(
#     *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
#     for c in weather_cols]
# )
# perc_null_cleaned_1y = df_cleaned_1y.select(
#     *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
#     for c in weather_cols]
# )

perc_null_cleaned_all = df_cleaned_all.select(
    *[(count(when(col(c).isNull() | isnan(c), c)) / count("*")).alias(c)
    for c in weather_cols]
)
display(perc_null_cleaned_all)

ELEVATION,HourlyAltimeterSetting,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlySeaLevelPressure,HourlyVisibility,HourlyWindDirection,HourlyWindSpeed,HourlyPresentWeatherType
0.0,0.0,0.0,0.0,0.0,0.0057296657410523,0.0,0.0,0.0,0.0,0.8496681006299576
