In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning / deprecation messages
# that may point at real problems in the cells below.
warnings.simplefilter("ignore")

In [2]:
# Load the flight records; the path is relative to the working directory.
df = pd.read_csv('newark_flights.csv')
# Show the available column names as the cell output.
df.columns

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [3]:
# Clean up null

# 'Unnamed: 0' is not used by any later cell
# (presumably a row index saved with the CSV -- confirm).
df = df.drop(columns='Unnamed: 0')

# Rows that cannot be analysed: departures from EWR without a departure time,
# arrivals at EWR without an arrival time, and flights without an arrival delay.
no_departure_time = (df['ORIGIN_AIRPORT'] == 'EWR') & df['DEPARTURE_TIME'].isna()
no_arrival_time = (df['DESTINATION_AIRPORT'] == 'EWR') & df['ARRIVAL_TIME'].isna()
no_arrival_delay = df['ARRIVAL_DELAY'].isna()
rows_to_drop = no_departure_time | no_arrival_time | no_arrival_delay
df = df.drop(index=df.index[rows_to_drop])

# The last six columns (CANCELLATION_REASON plus the five delay-cause columns
# in the layout printed by df.columns above) use NaN for "not applicable".
delay = list(df.columns[-6:])
# Assign the filled frame back explicitly. `df[col].fillna(..., inplace=True)`
# operates on a temporary object under pandas Copy-on-Write and would leave
# `df` unchanged.
df[delay] = df[delay].fillna(0)

In [4]:
# df.isna().sum()

In [5]:
# Clean up date and time

def _split_hhmm(hhmm):
    """Split integer-coded HHMM clock values into ``(hour, minute)`` Series.

    Integer arithmetic is used instead of string slicing so that values below
    100 (00:00-00:59, stored as 0-59) yield hour 0 rather than the first digit
    of the minutes (e.g. 30 -> 00:30, not 03:30).
    """
    clock = hhmm.astype(int)
    return clock // 100, clock % 100


flight_date = df[['YEAR', 'MONTH', 'DAY']]
df['DATE'] = pd.to_datetime(flight_date)

arr_hour, arr_minute = _split_hhmm(df['SCHEDULED_ARRIVAL'])
dep_hour, dep_minute = _split_hhmm(df['SCHEDULED_DEPARTURE'])

# pd.to_datetime assembles timestamps from YEAR/MONTH/DAY[/HOUR[/MINUTE]] columns.
# NOTE(review): both timestamps reuse the row's YEAR/MONTH/DAY. If that is the
# departure date, an arrival scheduled after midnight is stamped one day early
# -- confirm whether this matters for the weather join.
df['SCHEDULED_ARRIVAL'] = pd.to_datetime(flight_date.assign(HOUR=arr_hour, MINUTE=arr_minute))
df['SCHEDULED_DEPARTURE'] = pd.to_datetime(flight_date.assign(HOUR=dep_hour, MINUTE=dep_minute))

# Scheduled time truncated to the hour, taken from the Newark side of the
# flight; used later as the key for the weather join.
arrival_hour_start = pd.to_datetime(flight_date.assign(HOUR=arr_hour))
departure_hour_start = pd.to_datetime(flight_date.assign(HOUR=dep_hour))
to_newark = df['DESTINATION_AIRPORT'] == 'EWR'
df['NYC_TIME_TEMP'] = arrival_hour_start.where(to_newark, departure_hour_start)

# Combine United Express (EV) as mainline United (UA)
df['AIRLINE'] = df['AIRLINE'].replace('EV', 'UA')

# Dummy variables for reasons of delay: 1 when the recorded value is positive,
# otherwise 0 (missing values compare False and therefore become 0).
delay_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
              'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df[delay_cols] = (df[delay_cols] > 0).astype(int)

# Dummy variables for Arrival or Departure: 0 = arriving at EWR, 1 = otherwise
df['DEPARTURE'] = (~to_newark).astype(int)
is_departure = df['DEPARTURE'] == 1

# Time in reference to Newark only (this overwrites the original SCHEDULED_TIME column)
df['SCHEDULED_TIME'] = df['SCHEDULED_DEPARTURE'].where(is_departure, df['SCHEDULED_ARRIVAL'])

# Time in reference to Newark only - Hour
df['SCHEDULED_HOUR'] = df['SCHEDULED_TIME'].dt.hour

# Dummy variables for Delay target: a flight is delayed when the delay on its
# Newark side exceeds the threshold (same unit as the *_DELAY columns).
DELAY_THRESHOLD = 15
late_departure = df['DEPARTURE_DELAY'] > DELAY_THRESHOLD
late_arrival = df['ARRIVAL_DELAY'] > DELAY_THRESHOLD
df['DELAY'] = late_departure.where(is_departure, late_arrival).astype(int)

In [6]:
# Restrict the frame to the analysis columns, plus two helper columns that are
# still needed as join keys and are removed after the joins.
temp_columns = ['NYC_TIME_TEMP', 'TAIL_NUMBER']
keep_columns = [
    'DATE', 'DAY_OF_WEEK', 'SCHEDULED_HOUR', 'DEPARTURE', 'AIRLINE',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE', 'DELAY',
] + list(delay_cols)
df = df[[*keep_columns, *temp_columns]]

In [7]:
# Import weather database

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper.
weather = pd.read_csv('newark_weather.csv').drop(columns='Unnamed: 0')
# Built-in round() is kept (rather than Series.round) so the rounded values
# are exactly those produced before.
weather['temperature'] = weather['temperature'].map(lambda value: round(value, 1))
weather['nyc_time'] = pd.to_datetime(weather['nyc_time'])
weather = weather.rename(columns={'icon': 'weather'}).set_index('nyc_time')

# Positional: discard the last four columns of the file.
# NOTE(review): this depends on the CSV column order -- confirm which columns these are.
weather = weather.iloc[:, :-4]

# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return bool columns, which df.describe() skips and which do not match the
# 0/1 values shown in this notebook's outputs.
weather = pd.get_dummies(weather, prefix='weather', columns=['weather'], dtype=int)
# Two categories are dropped instead of using drop_first.
weather = weather.drop(columns=['weather_clear', 'weather_partly-cloudy'])
weather.head()

Unnamed: 0_level_0,precipIntensity,temperature,windSpeed,visibility,weather_cloudy,weather_fog,weather_rain,weather_sleet,weather_snow
nyc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01 00:00:00,0.0,-4.4,1.66,9.997,0,0,0,0,0
2015-01-01 01:00:00,0.0,-4.4,1.91,9.997,0,0,0,0,0
2015-01-01 02:00:00,0.0,-4.4,2.38,9.997,0,0,0,0,0
2015-01-01 03:00:00,0.0,-5.0,1.85,9.997,0,0,0,0,0
2015-01-01 04:00:00,0.0,-4.4,2.77,9.997,0,0,0,0,0


In [8]:
# Attach the weather row whose 'nyc_time' index equals each flight's
# NYC_TIME_TEMP, then discard the helper key column.
df = df.join(weather, on='NYC_TIME_TEMP').drop(columns='NYC_TIME_TEMP')

In [9]:
# Load the plane registration table and index it by 'N-Number'
# so that it can be joined on the flights' TAIL_NUMBER.
plane_reg = (
    pd.read_csv('newark_plane_reg.csv')
    .drop(columns='Unnamed: 0')
    .set_index('N-Number')
)
plane_reg.head()

Unnamed: 0_level_0,Registered
N-Number,Unnamed: 1_level_1
N438WN,2003-07-21
N68061,2002-03-08
N914UY,2014-11-07
N446UA,1998-07-02
N18120,2005-02-25


In [10]:
# Attach each aircraft's registration date, discard flights whose tail number
# has no registration record, and derive the plane age at flight date.
df = df.join(plane_reg, on='TAIL_NUMBER')
unregistered = df.index[df['Registered'].isna()]
df = df.drop(index=unregistered).drop(columns='TAIL_NUMBER')

# Age in 365-day years between registration and flight date;
# anything that is not strictly positive is recorded as 0.
elapsed_seconds = (df['DATE'] - pd.to_datetime(df['Registered'])).dt.total_seconds()
elapsed_years = elapsed_seconds / 365 / 24 / 60 / 60
df['planeAge'] = elapsed_years.where(elapsed_seconds > 0, 0)

# Remove the raw registration date and any row still containing a missing value.
df = df.drop(columns='Registered').dropna()
df.head()

Unnamed: 0,DATE,DAY_OF_WEEK,SCHEDULED_HOUR,DEPARTURE,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE,DELAY,AIR_SYSTEM_DELAY,...,precipIntensity,temperature,windSpeed,visibility,weather_cloudy,weather_fog,weather_rain,weather_sleet,weather_snow,planeAge
0,2015-01-01,4,5,0,UA,SJU,EWR,1608,0,0,...,0.0,-4.4,0.0,9.997,0.0,0.0,0.0,0.0,0.0,4.961644
1,2015-01-01,4,5,0,B6,SJU,EWR,1608,0,0,...,0.0,-4.4,0.0,9.997,0.0,0.0,0.0,0.0,0.0,10.147945
2,2015-01-01,4,5,0,UA,BQN,EWR,1585,0,0,...,0.0,-4.4,0.0,9.997,0.0,0.0,0.0,0.0,0.0,9.49589
3,2015-01-01,4,7,0,UA,JAX,EWR,820,0,0,...,0.0,-5.6,1.46,9.997,0.0,0.0,0.0,0.0,0.0,11.2
4,2015-01-01,4,7,0,UA,CHS,EWR,628,0,0,...,0.0,-5.6,1.46,9.997,0.0,0.0,0.0,0.0,0.0,13.706849


In [12]:
# Assign df0 for all columns for analysis and visualization
df0 = df.copy()

# Create dummy variables for categorical variables
# United = 1 for airline code 'UA', 0 for every other carrier.
df['United'] = (df['AIRLINE'] == 'UA').astype(int)
# dtype=int keeps the indicators numeric (0/1). pandas >= 2.0 defaults to bool,
# which df.describe() would leave out of the summary shown below.
df = pd.get_dummies(df, prefix=['Day', 'Hour'],
                    columns=['DAY_OF_WEEK', 'SCHEDULED_HOUR'], dtype=int)

# Assign df using only columns relevant to modelling
drop_list = ['DATE', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
             'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
             'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df = df.drop(columns=drop_list)

In [14]:
# for col in df.columns:
#     print(col, '\n', df[col].value_counts(normalize=True), '\n\n')

In [23]:
# Summary statistics (count, mean, std, quartiles, min/max) of the modelling frame.
df.describe()

Unnamed: 0,DEPARTURE,DISTANCE,DELAY,precipIntensity,temperature,windSpeed,visibility,weather_cloudy,weather_fog,weather_rain,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
count,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,...,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0,185142.0
mean,0.50041,1096.003846,0.224082,0.002041,14.444606,4.285667,9.325998,0.232319,0.017424,0.043972,...,0.059592,0.066625,0.07336,0.059905,0.055844,0.056162,0.064718,0.051037,0.029658,0.022475
std,0.500001,758.003069,0.416977,0.014739,11.150732,3.324627,1.961256,0.422312,0.130847,0.205033,...,0.23673,0.249371,0.260727,0.237312,0.22962,0.230235,0.246028,0.220073,0.169643,0.148222
min,0.0,80.0,0.0,0.0,-17.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,533.0,0.0,0.0,5.6,1.8,9.997,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,937.0,0.0,0.0,15.5,3.54,9.997,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1411.0,0.0,0.0,23.9,5.82,9.997,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,4962.0,1.0,0.2993,36.1,21.93,9.997,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# Target = Delay [0/1]

# Predictors:
# 1. Departure [0/1]
# 2. Distance
# 3. PrecipIntensity
# 4. Temperature
# 5. WindSpeed
# 6. Visibility
# 7. PlaneAge
# 8. United [0/1]

# Predictors from categorical variables (one 0/1 column per category):
# 9.  Weather (5 types) [0/1]
# 10. Day of the week (7 days) [0/1]
# 11. Hour of the day (up to 24 hours) [0/1]
#     NOTE(review): get_dummies only creates a column for hour values present
#     in the data; the 44 columns reported below leave room for 23 Hour_*
#     columns, not 24 -- confirm which hour is absent.

# (rows, columns) of the modelling frame, target column included.
df.shape

(185142, 44)

In [25]:
# Separate the predictors (every column except DELAY) from the binary target.
X = df.drop(columns=['DELAY'])
y = df['DELAY']