In [95]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")

In [96]:
# Load the flight records and show which columns are available.
FLIGHTS_CSV = 'newark_flights.csv'
df = pd.read_csv(FLIGHTS_CSV)
df.columns

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [97]:
# Clean up null

# Drop the unnamed leftover index column that came in with the CSV.
df = df.drop(columns='Unnamed: 0')

# Rows that cannot be used:
#   * flights leaving EWR without a recorded departure time,
#   * flights arriving at EWR without a recorded arrival time,
#   * any flight without an arrival delay.
# All three conditions are row-wise, so they can be combined into one filter.
missing_ewr_departure = (df['ORIGIN_AIRPORT'] == 'EWR') & df['DEPARTURE_TIME'].isna()
missing_ewr_arrival = (df['DESTINATION_AIRPORT'] == 'EWR') & df['ARRIVAL_TIME'].isna()
missing_arrival_delay = df['ARRIVAL_DELAY'].isna()
unusable = missing_ewr_departure | missing_ewr_arrival | missing_arrival_delay
# .copy() so later column assignments act on an independent frame.
df = df.loc[~unusable].copy()

# The last six columns are CANCELLATION_REASON plus the five delay-cause
# columns; a missing value is replaced by 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained form updates a temporary object (and therefore does
# nothing) when pandas Copy-on-Write is active.
delay = list(df.columns[-6:])
df[delay] = df[delay].fillna(0)

In [98]:
# df.isna().sum()

In [99]:
# Clean up date and time
df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])


def _hhmm_offsets(hhmm):
    """Split HHMM-encoded clock values into (hours, minutes) timedelta Series.

    The split is arithmetic (``// 100`` and ``% 100``) rather than based on
    the number of digits, so values below 100 are handled correctly:
    30 -> 0 h 30 min and 5 -> 0 h 5 min. Slicing the string form would turn
    30 into hour 3 and 5 into hour 5.
    """
    hhmm = hhmm.astype(int)
    return pd.to_timedelta(hhmm // 100, unit='h'), pd.to_timedelta(hhmm % 100, unit='m')


arr_hours, arr_minutes = _hhmm_offsets(df['SCHEDULED_ARRIVAL'])
dep_hours, dep_minutes = _hhmm_offsets(df['SCHEDULED_DEPARTURE'])

# Full scheduled timestamps, built on the row's YEAR/MONTH/DAY.
# NOTE(review): the arrival timestamp reuses the same calendar date as the
# departure; if YEAR/MONTH/DAY is the departure date, overnight arrivals end
# up one day early - confirm whether that matters downstream.
df['SCHEDULED_ARRIVAL'] = df['DATE'] + arr_hours + arr_minutes
df['SCHEDULED_DEPARTURE'] = df['DATE'] + dep_hours + dep_minutes

# True for flights landing at Newark, False for flights leaving it.
is_arrival = df['DESTINATION_AIRPORT'] == 'EWR'

# Scheduled time at Newark truncated to the hour (used later as a join key).
df['NYC_TIME_TEMP'] = (df['DATE'] + arr_hours).where(is_arrival, df['DATE'] + dep_hours)

# Combine United Express (EV) as mainline United (UA)
df['AIRLINE'] = df['AIRLINE'].replace('EV', 'UA')

# Dummy variables for reasons of delay: 1 when the recorded value is > 0, else 0
delay_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
              'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df[delay_cols] = (df[delay_cols] > 0).astype(int)

# Dummy variables for Arrival or Departure (0 = arrives at EWR, 1 = otherwise)
df['DEPARTURE'] = (~is_arrival).astype(int)

# Time in reference to Newark only (overwrites the original SCHEDULED_TIME column)
df['SCHEDULED_TIME'] = df['SCHEDULED_ARRIVAL'].where(is_arrival, df['SCHEDULED_DEPARTURE'])

# Time in reference to Newark only - Hour
df['SCHEDULED_HOUR'] = df['SCHEDULED_TIME'].dt.hour

# Dummy variables for Delay target: compare the Newark-side delay (arrival
# delay for arrivals, departure delay for departures) with the threshold.
# A missing delay compares as False and therefore yields 0.
DELAY_THRESHOLD = 15  # presumably minutes - confirm against the data dictionary
newark_delay = df['ARRIVAL_DELAY'].where(is_arrival, df['DEPARTURE_DELAY'])
df['DELAY'] = (newark_delay > DELAY_THRESHOLD).astype(int)

In [100]:
# Narrow the frame to the columns used from here on, plus two temporary
# columns that are still needed as join keys.
temp_columns = ['NYC_TIME_TEMP', 'TAIL_NUMBER']
keep_columns = [
    'DATE', 'DAY_OF_WEEK', 'SCHEDULED_HOUR', 'AIRLINE',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE', 'DELAY',
]
keep_columns.extend(delay_cols)
df = df.loc[:, [*keep_columns, *temp_columns]]

In [101]:
# Load the hourly weather table, indexed by its 'nyc_time' timestamp.
weather = (
    pd.read_csv('newark_weather.csv')
    .drop(columns='Unnamed: 0')
    .assign(
        # Python's round() is used element-wise to keep one decimal place.
        temperature=lambda w: w['temperature'].map(lambda t: round(t, 1)),
        nyc_time=lambda w: pd.to_datetime(w['nyc_time']),
    )
    .rename(columns={'icon': 'weather'})
    .set_index('nyc_time')
    # Discard the four right-most columns.
    .iloc[:, :-4]
)
weather.head()

Unnamed: 0_level_0,weather,precipIntensity,temperature,windSpeed,visibility
nyc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01 00:00:00,clear,0.0,-4.4,1.66,9.997
2015-01-01 01:00:00,clear,0.0,-4.4,1.91,9.997
2015-01-01 02:00:00,clear,0.0,-4.4,2.38,9.997
2015-01-01 03:00:00,clear,0.0,-5.0,1.85,9.997
2015-01-01 04:00:00,clear,0.0,-4.4,2.77,9.997


In [102]:
# Attach the weather columns to every flight by looking up its hourly
# timestamp in the weather index (left join), then discard the lookup key.
df = df.join(weather, on='NYC_TIME_TEMP').drop(columns='NYC_TIME_TEMP')

In [103]:
# Load the aircraft registration table, indexed by N-Number.
plane_reg = (
    pd.read_csv('newark_plane_reg.csv')
    .drop(columns='Unnamed: 0')
    .set_index('N-Number')
)
plane_reg.head()

Unnamed: 0_level_0,Registered
N-Number,Unnamed: 1_level_1
N438WN,2003-07-21
N68061,2002-03-08
N914UY,2014-11-07
N446UA,1998-07-02
N18120,2005-02-25


In [104]:
# Join main data with plane registration database

# Look up each flight's registration date by tail number (left join), discard
# flights whose aircraft has no registration record, then drop the lookup key.
df = df.join(plane_reg, on='TAIL_NUMBER')
df = df.dropna(subset=['Registered']).drop(columns='TAIL_NUMBER')

# Aircraft age at the flight date, in 365-day years, rounded to 2 decimals.
# The previous form `apply(lambda row: round(...), 2)` handed the 2 to
# Series.apply instead of round(), so ages were rounded to whole years.
SECONDS_PER_YEAR = 365 * 24 * 60 * 60
time_since_registration = df['DATE'] - pd.to_datetime(df['Registered'])
df['planeAge'] = (time_since_registration.dt.total_seconds() / SECONDS_PER_YEAR).round(2)
df = df.drop(columns='Registered')

In [105]:
# For every column, print the five most frequent values with their share of rows.
for column_name, column_values in df.items():
    top_shares = column_values.value_counts(normalize=True).head()
    print(column_name, '\n', top_shares, '\n\n')

DATE 
 2015-12-03    0.003651
2015-12-10    0.003619
2015-12-11    0.003565
2015-12-04    0.003554
2015-11-20    0.003527
Name: DATE, dtype: float64 


DAY_OF_WEEK 
 3    0.153448
4    0.151055
5    0.150326
1    0.150018
2    0.147447
Name: DAY_OF_WEEK, dtype: float64 


SCHEDULED_HOUR 
 16    0.073359
15    0.066624
20    0.064717
13    0.061244
17    0.059905
Name: SCHEDULED_HOUR, dtype: float64 


AIRLINE 
 UA    0.758804
B6    0.065036
WN    0.055465
DL    0.052521
AA    0.021815
Name: AIRLINE, dtype: float64 


ORIGIN_AIRPORT 
 EWR    0.500405
ATL    0.026471
SFO    0.026282
MCO    0.024732
LAX    0.024106
Name: ORIGIN_AIRPORT, dtype: float64 


DESTINATION_AIRPORT 
 EWR    0.499595
ATL    0.026482
SFO    0.026315
MCO    0.024797
LAX    0.024122
Name: DESTINATION_AIRPORT, dtype: float64 


DISTANCE 
 746     0.052953
2565    0.052597
937     0.049529
2454    0.048227
200     0.046018
Name: DISTANCE, dtype: float64 


DELAY 
 0    0.77592
1    0.22408
Name: DELAY, dtype: float64 
