In [197]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session, so
# pandas deprecation / chained-assignment warnings that later cells may raise
# will not be shown.
warnings.simplefilter("ignore")

In [198]:
# Load the flight records from CSV into the working DataFrame `df`.
df = pd.read_csv('newark_flights.csv')
# Display the column index to see which fields are available.
df.columns

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [199]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,37,2015,1,1,4,UA,1528,N76519,SJU,EWR,...,458.0,-11.0,0,0,,,,,,
1,45,2015,1,1,4,B6,1990,N597JB,SJU,EWR,...,516.0,4.0,0,0,,,,,,
2,49,2015,1,1,4,UA,1162,N37293,BQN,EWR,...,605.0,6.0,0,0,,,,,,
3,110,2015,1,1,4,EV,4160,N11150,JAX,EWR,...,743.0,-14.0,0,0,,,,,,
4,125,2015,1,1,4,EV,4646,N29917,CHS,EWR,...,742.0,1.0,0,0,,,,,,


In [200]:
# Clean up null
# 'Unnamed: 0' is dropped as it is not used by the analysis (presumably a row
# index saved with the CSV -- TODO confirm). errors='ignore' keeps this cell
# re-runnable once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Rows removed: a departure from EWR without a departure time, an arrival at
# EWR without an arrival time, or any flight without an arrival delay.
no_departure = (df['ORIGIN_AIRPORT'] == 'EWR') & df['DEPARTURE_TIME'].isna()
no_arrival = (df['DESTINATION_AIRPORT'] == 'EWR') & df['ARRIVAL_TIME'].isna()
no_arrival_delay = df['ARRIVAL_DELAY'].isna()
# .copy() gives an independent frame so the column assignment below never
# writes into a view of the unfiltered data.
df = df.loc[~(no_departure | no_arrival | no_arrival_delay)].copy()

# Columns whose missing values become 0. These are the six trailing columns of
# the raw file, listed by name so the selection does not shift when later cells
# append new columns. CANCELLATION_REASON is included because the original
# positional slice covered it as well.
delay = [
    'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# that chained form operates on a temporary object under pandas Copy-on-Write
# and would leave df unchanged.
df[delay] = df[delay].fillna(0)

In [201]:
# Count the remaining missing values per column after the clean-up.
df.isna().sum()

YEAR                   0
MONTH                  0
DAY                    0
DAY_OF_WEEK            0
AIRLINE                0
FLIGHT_NUMBER          0
TAIL_NUMBER            0
ORIGIN_AIRPORT         0
DESTINATION_AIRPORT    0
SCHEDULED_DEPARTURE    0
DEPARTURE_TIME         0
DEPARTURE_DELAY        0
TAXI_OUT               0
WHEELS_OFF             0
SCHEDULED_TIME         0
ELAPSED_TIME           0
AIR_TIME               0
DISTANCE               0
WHEELS_ON              0
TAXI_IN                0
SCHEDULED_ARRIVAL      0
ARRIVAL_TIME           0
ARRIVAL_DELAY          0
DIVERTED               0
CANCELLED              0
CANCELLATION_REASON    0
AIR_SYSTEM_DELAY       0
SECURITY_DELAY         0
AIRLINE_DELAY          0
LATE_AIRCRAFT_DELAY    0
WEATHER_DELAY          0
dtype: int64

In [203]:
# Clean up column
# ARRIVAL_TIME / DEPARTURE_TIME hold clock times encoded as HHMM numbers stored
# as floats (e.g. 458.0 -> hour 4, minute 58). Split them arithmetically:
# slicing the string form mis-reads times before 01:00, where the value has
# fewer than three digits (45.0 would yield hour 4 instead of 0).
for prefix in ("ARRIVAL", "DEPARTURE"):
    hhmm = df[f"{prefix}_TIME"].astype("int64")
    df[f"{prefix}_MIN"] = hhmm % 100
    df[f"{prefix}_HOUR"] = hhmm // 100

In [204]:
# Import plane registration database

# Read the registration table, discard the 'Unnamed: 0' column and key the
# rows by 'N-Number' so the table can be joined on that value.
plane_reg = (
    pd.read_csv('newark_plane_reg.csv')
    .drop(columns='Unnamed: 0')
    .set_index('N-Number')
)
plane_reg.head()

Unnamed: 0_level_0,Reg_year,Reg_month
N-Number,Unnamed: 1_level_1,Unnamed: 2_level_1
N438WN,2003,7
N68061,2002,3
N914UY,2014,11
N446UA,1998,7
N18120,2005,2


In [205]:
# Join main data with plane registration database
# Left-join on the flight's TAIL_NUMBER against the registration index, then
# remove every index label that has a row without a registration year.
joined = df.join(plane_reg, on='TAIL_NUMBER')
unregistered_labels = joined.index[joined['Reg_year'].isna()]
df = joined.drop(index=unregistered_labels)

In [206]:
# Inspect the column dtypes after the registration join.
df.dtypes

YEAR                     int64
MONTH                    int64
DAY                      int64
DAY_OF_WEEK              int64
AIRLINE                 object
FLIGHT_NUMBER            int64
TAIL_NUMBER             object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
SCHEDULED_DEPARTURE      int64
DEPARTURE_TIME         float64
DEPARTURE_DELAY        float64
TAXI_OUT               float64
WHEELS_OFF             float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE                 int64
WHEELS_ON              float64
TAXI_IN                float64
SCHEDULED_ARRIVAL        int64
ARRIVAL_TIME           float64
ARRIVAL_DELAY          float64
DIVERTED                 int64
CANCELLED                int64
CANCELLATION_REASON      int64
AIR_SYSTEM_DELAY       float64
SECURITY_DELAY         float64
AIRLINE_DELAY          float64
LATE_AIRCRAFT_DELAY    float64
WEATHER_DELAY          float64
ARRIVAL_MIN              int64
ARRIVAL_

In [207]:
# Preview the joined frame, including the derived time and registration columns.
df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,ARRIVAL_MIN,ARRIVAL_HOUR,DEPARTURE_MIN,DEPARTURE_HOUR,Reg_year,Reg_month
0,2015,1,1,4,UA,1528,N76519,SJU,EWR,154,...,0.0,0.0,0.0,0.0,58,4,57,1,2010.0,1.0
1,2015,1,1,4,B6,1990,N597JB,SJU,EWR,206,...,0.0,0.0,0.0,0.0,16,5,52,1,2004.0,11.0
2,2015,1,1,4,UA,1162,N37293,BQN,EWR,259,...,0.0,0.0,0.0,0.0,5,6,58,2,2005.0,7.0
3,2015,1,1,4,EV,4160,N11150,JAX,EWR,540,...,0.0,0.0,0.0,0.0,43,7,31,5,2003.0,10.0
4,2015,1,1,4,EV,4646,N29917,CHS,EWR,545,...,0.0,0.0,0.0,0.0,42,7,40,5,2001.0,4.0


In [208]:
# Load the weather observations and discard the 'Unnamed: 0' column.
weather = pd.read_csv('newark_weather.csv').drop(columns='Unnamed: 0')
# Round each temperature to one decimal place with Python's built-in round,
# applied element by element.
weather['temperature'] = weather['temperature'].apply(round, ndigits=1)
weather.head()

Unnamed: 0,icon,precipIntensity,temperature,windSpeed,visibility,nyc_time,year,month,day,hour
0,clear,0.0,-4.4,1.66,9.997,01/01/2015 00:00,2015,1,1,0
1,clear,0.0,-4.4,1.91,9.997,01/01/2015 01:00,2015,1,1,1
2,clear,0.0,-4.4,2.38,9.997,01/01/2015 02:00,2015,1,1,2
3,clear,0.0,-5.0,1.85,9.997,01/01/2015 03:00,2015,1,1,3
4,clear,0.0,-4.4,2.77,9.997,01/01/2015 04:00,2015,1,1,4


In [209]:
# Inspect the dtypes of the weather columns.
weather.dtypes

icon                object
precipIntensity    float64
temperature        float64
windSpeed          float64
visibility         float64
nyc_time            object
year                 int64
month                int64
day                  int64
hour                 int64
dtype: object