In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from datetime import datetime, timedelta

pd.set_option('max_columns', None)
pd.set_option('max_rows', 50)

# Weather Data

### Read Data from CSV

In [2]:
raw = pd.read_csv('wxkslc.csv')
raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Station_ID,Date_Time,altimeter_set_1,air_temp_set_1,dew_point_temperature_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,snow_depth_set_1,sea_level_pressure_set_1,weather_cond_code_set_1,cloud_layer_3_code_set_1,pressure_tendency_set_1,qc_set_1,precip_accum_one_hour_set_1,precip_accum_three_hour_set_1,metar_origin_set_1,cloud_layer_1_code_set_1,cloud_layer_2_code_set_1,precip_accum_six_hour_set_1,precip_accum_24_hour_set_1,visibility_set_1,metar_remark_set_1,metar_set_1,air_temp_high_6_hour_set_1,air_temp_low_6_hour_set_1,peak_wind_speed_set_1,ceiling_set_1,pressure_change_code_set_1,air_temp_high_24_hour_set_1,air_temp_low_24_hour_set_1,peak_wind_direction_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,wind_cardinal_direction_set_1d,pressure_set_1d,sea_level_pressure_set_1d,heat_index_set_1d,weather_condition_set_1d,weather_condition_set_2d,weather_condition_set_3d
0,,,INHG,Fahrenheit,Fahrenheit,%,knots,Degrees,knots,Inches,Millibars,code,code,code,code,Inches,Inches,code,code,code,Inches,Inches,Statute miles,text,text,Fahrenheit,Fahrenheit,knots,Feet,code,Fahrenheit,Fahrenheit,Degrees,Fahrenheit,Fahrenheit,Code,Millibars,Millibars,Fahrenheit,Code,,
1,KSLC,2016-01-01T00:00:00Z,30.45,12.02,6.08,76.51,9.0,360.0,,,,,,,,,,,1.0,,,,9.0,,,,,,,,,,,5.98,2.9,N,883.05,1027.38,,Clear,,
2,KSLC,2016-01-01T00:05:00Z,30.45,12.02,6.08,76.51,7.0,360.0,,,,,,,,,,,1.0,,,,10.0,,,,,,,,,,,5.98,,N,883.05,1027.38,,Clear,,
3,KSLC,2016-01-01T00:10:00Z,30.45,12.02,6.08,76.51,7.0,360.0,,,,,,,,,,,1.0,,,,10.0,,,,,,,,,,,5.98,,N,883.05,1027.38,,Clear,,
4,KSLC,2016-01-01T00:15:00Z,30.45,12.02,5.0,72.82,7.0,360.0,,,,,,,,,,,1.0,,,,10.0,,,,,,,,,,,4.88,,N,883.05,1027.41,,Clear,,


### Select Relevant Features

In [3]:
selectCol = raw[['Date_Time','air_temp_set_1','dew_point_temperature_set_1d',
                 'wind_speed_set_1','wind_direction_set_1','wind_gust_set_1',
                 'snow_depth_set_1','cloud_layer_1_code_set_1',
                 'cloud_layer_2_code_set_1','cloud_layer_3_code_set_1',
                 'visibility_set_1','ceiling_set_1','weather_condition_set_1d',
                 'weather_condition_set_2d','weather_condition_set_3d']]

In [4]:
selectRow = selectCol.iloc[1:]
selectRow.head()

Unnamed: 0,Date_Time,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,snow_depth_set_1,cloud_layer_1_code_set_1,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1,visibility_set_1,ceiling_set_1,weather_condition_set_1d,weather_condition_set_2d,weather_condition_set_3d
1,2016-01-01T00:00:00Z,12.02,5.98,9.0,360.0,,,1.0,,,9.0,,Clear,,
2,2016-01-01T00:05:00Z,12.02,5.98,7.0,360.0,,,1.0,,,10.0,,Clear,,
3,2016-01-01T00:10:00Z,12.02,5.98,7.0,360.0,,,1.0,,,10.0,,Clear,,
4,2016-01-01T00:15:00Z,12.02,4.88,7.0,360.0,,,1.0,,,10.0,,Clear,,
5,2016-01-01T00:20:00Z,10.94,4.9,7.0,350.0,,,1.0,,,10.0,,Clear,,


### Dealing with Missing Values

First take a look at the rows with most values missing and the missing value rate for each feature.

In [7]:
#rows with most features missing
mostNaN = selectRow.loc[selectRow.apply(lambda x:x.isnull().sum()>=14, axis=1)]

In [8]:
#missing value rate
selectRow.apply(lambda x: x.isnull().sum()/x.size, axis=0)

Date_Time                       0.000000
air_temp_set_1                  0.000805
dew_point_temperature_set_1d    0.000805
wind_speed_set_1                0.005159
wind_direction_set_1            0.017921
wind_gust_set_1                 0.924559
snow_depth_set_1                0.996531
cloud_layer_1_code_set_1        0.007405
cloud_layer_2_code_set_1        0.781717
cloud_layer_3_code_set_1        0.905902
visibility_set_1                0.001004
ceiling_set_1                   0.691088
weather_condition_set_1d        0.007415
weather_condition_set_2d        0.973809
weather_condition_set_3d        0.998549
dtype: float64

For feature 'Date_Time', there are no missing values.

In [9]:
perfect = selectRow['Date_Time']

For features that have values in most of the rows, use the strategy that replacing the missing value with the last valid observation in the time serie.

In [27]:
common_raw = selectRow[['air_temp_set_1','dew_point_temperature_set_1d','wind_speed_set_1',
                        'wind_direction_set_1','cloud_layer_1_code_set_1','visibility_set_1',
                        'weather_condition_set_1d']]
common = common_raw.fillna(method='ffill')
common.head()

Unnamed: 0,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,cloud_layer_1_code_set_1,visibility_set_1,weather_condition_set_1d
1,12.02,5.98,9.0,360.0,1.0,9.0,Clear
2,12.02,5.98,7.0,360.0,1.0,10.0,Clear
3,12.02,5.98,7.0,360.0,1.0,10.0,Clear
4,12.02,4.88,7.0,360.0,1.0,10.0,Clear
5,10.94,4.9,7.0,350.0,1.0,10.0,Clear


In [28]:
common1 = common.iloc[:,0:-1]
common1.head()

Unnamed: 0,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,cloud_layer_1_code_set_1,visibility_set_1
1,12.02,5.98,9.0,360.0,1.0,9.0
2,12.02,5.98,7.0,360.0,1.0,10.0
3,12.02,5.98,7.0,360.0,1.0,10.0
4,12.02,4.88,7.0,360.0,1.0,10.0
5,10.94,4.9,7.0,350.0,1.0,10.0


In [29]:
common2 = common.iloc[:,-1]
common2.head()

1    Clear
2    Clear
3    Clear
4    Clear
5    Clear
Name: weather_condition_set_1d, dtype: object

For features that miss values in most of the rows, use different strategy respectively.

In [26]:
rare = selectRow[['wind_gust_set_1','snow_depth_set_1','cloud_layer_2_code_set_1',
                  'cloud_layer_3_code_set_1','ceiling_set_1','weather_condition_set_2d',
                  'weather_condition_set_3d']]
rare.head()

Unnamed: 0,wind_gust_set_1,snow_depth_set_1,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1,ceiling_set_1,weather_condition_set_2d,weather_condition_set_3d
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,


For Wind Gust and Snow Depth, replace with 0. 

In [25]:
rare1 = rare[['wind_gust_set_1','snow_depth_set_1']].fillna(0)
rare1.head()

Unnamed: 0,wind_gust_set_1,snow_depth_set_1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0


For Cloud, replace with mode 1.0 (which means clear sky).

In [15]:
pd.to_numeric(common['cloud_layer_1_code_set_1']).max()

2506.0

In [16]:
pd.to_numeric(rare['cloud_layer_2_code_set_1']).max()

2806.0

In [17]:
pd.to_numeric(rare['cloud_layer_3_code_set_1']).max()

2803.0

In [24]:
rare2 = rare[['cloud_layer_2_code_set_1','cloud_layer_3_code_set_1']].fillna(1.0)
rare2.head()

Unnamed: 0,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1


For Ceiling, replace with a large number which is greater than the maximum.

In [19]:
pd.to_numeric(rare['ceiling_set_1']).max()

28000.0

In [23]:
rare3 = rare[['ceiling_set_1']].fillna(50000)
rare3.head()

Unnamed: 0,ceiling_set_1
1,50000
2,50000
3,50000
4,50000
5,50000


For Weather Condition, do nothing.

In [22]:
rare4 = rare[['weather_condition_set_2d','weather_condition_set_3d']]
rare4.head()

Unnamed: 0,weather_condition_set_2d,weather_condition_set_3d
1,,
2,,
3,,
4,,
5,,


### Feature Encoding 

Change date and time features to DateTime format.

In [30]:
perfect_dt = perfect.apply(lambda x:datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ'))
perfect_dt.head()

1   2016-01-01 00:00:00
2   2016-01-01 00:05:00
3   2016-01-01 00:10:00
4   2016-01-01 00:15:00
5   2016-01-01 00:20:00
Name: Date_Time, dtype: datetime64[ns]

Change numeric features to float format.

In [31]:
common1_num = common1.apply(pd.to_numeric, errors='raise')
common1_num.head()

Unnamed: 0,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,cloud_layer_1_code_set_1,visibility_set_1
1,12.02,5.98,9.0,360.0,1.0,9.0
2,12.02,5.98,7.0,360.0,1.0,10.0
3,12.02,5.98,7.0,360.0,1.0,10.0
4,12.02,4.88,7.0,360.0,1.0,10.0
5,10.94,4.9,7.0,350.0,1.0,10.0


In [32]:
rare1_num = rare1.apply(pd.to_numeric, errors='raise')
rare1_num.head()

Unnamed: 0,wind_gust_set_1,snow_depth_set_1
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0


In [33]:
rare2_num = rare2.apply(pd.to_numeric, errors='raise')
rare2_num.head()

Unnamed: 0,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,1.0,1.0
5,1.0,1.0


In [34]:
rare3_num = rare3.apply(pd.to_numeric, errors='raise')
rare3_num.head()

Unnamed: 0,ceiling_set_1
1,50000.0
2,50000.0
3,50000.0
4,50000.0
5,50000.0


Deal with categorical feature 'Weather Condition'.

In [35]:
wx_cond_set1 = set(common2.unique())

In [36]:
wx_cond_set2 = set(rare4.iloc[:,0].unique())

In [37]:
wx_cond_set3 = set(rare4.iloc[:,1].unique())

In [40]:
wx_bin = pd.get_dummies(common2)
wx_bin.head()

Unnamed: 0,Blowing Dust,Blowing Snow,Clear,Fog,Frz Rain,Hail,Haze,Heavy Rain,Heavy Rain/Thunderstorm,Heavy Snow,Ice Fog,Ice pellets,Light Frz Rain,Light Ice Pellets,Light Rain,Light Rain/Thunderstorm,Light Snow,Light Snow Pellets,Mostly Clear,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Snow,Snow Pellets,Squalls,Thunder,Thunderstorm,Unknown Precip
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
wx_bin2 = pd.get_dummies(rare4)
wx_bin2.head()

Unnamed: 0,weather_condition_set_2d_Fog,weather_condition_set_2d_Haze,weather_condition_set_2d_Ice Fog,weather_condition_set_2d_Light Rain,weather_condition_set_2d_Light Snow,weather_condition_set_2d_Light Snow Pellets,weather_condition_set_2d_Light Snow Shower,weather_condition_set_2d_Rain,weather_condition_set_2d_Snow,weather_condition_set_2d_Snow Pellets,weather_condition_set_2d_Squalls,weather_condition_set_3d_Fog,weather_condition_set_3d_Rain
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
wx_bin['Light Snow Shower'] = wx_bin2['weather_condition_set_2d_Light Snow Shower']

In [43]:
for wx in wx_cond_set2:
    name = str(wx)
    if(name == 'nan'):
        continue
    wx_bin[name] = wx_bin[name] | wx_bin2['weather_condition_set_2d_'+name]

for wx in wx_cond_set3:
    name = str(wx)
    if(name == 'nan'):
        continue
    wx_bin[name] = wx_bin[name] | wx_bin2['weather_condition_set_3d_'+name]

In [44]:
wx_bin.head()

Unnamed: 0,Blowing Dust,Blowing Snow,Clear,Fog,Frz Rain,Hail,Haze,Heavy Rain,Heavy Rain/Thunderstorm,Heavy Snow,Ice Fog,Ice pellets,Light Frz Rain,Light Ice Pellets,Light Rain,Light Rain/Thunderstorm,Light Snow,Light Snow Pellets,Mostly Clear,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Snow,Snow Pellets,Squalls,Thunder,Thunderstorm,Unknown Precip,Light Snow Shower
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Concatenate Together 

In [45]:
wx = pd.concat([perfect_dt, common1_num, rare1_num, rare2_num, rare3_num, wx_bin], axis=1)

In [46]:
wx.head()

Unnamed: 0,Date_Time,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,cloud_layer_1_code_set_1,visibility_set_1,wind_gust_set_1,snow_depth_set_1,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1,ceiling_set_1,Blowing Dust,Blowing Snow,Clear,Fog,Frz Rain,Hail,Haze,Heavy Rain,Heavy Rain/Thunderstorm,Heavy Snow,Ice Fog,Ice pellets,Light Frz Rain,Light Ice Pellets,Light Rain,Light Rain/Thunderstorm,Light Snow,Light Snow Pellets,Mostly Clear,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Snow,Snow Pellets,Squalls,Thunder,Thunderstorm,Unknown Precip,Light Snow Shower
1,2016-01-01 00:00:00,12.02,5.98,9.0,360.0,1.0,9.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2016-01-01 00:05:00,12.02,5.98,7.0,360.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2016-01-01 00:10:00,12.02,5.98,7.0,360.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2016-01-01 00:15:00,12.02,4.88,7.0,360.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2016-01-01 00:20:00,10.94,4.9,7.0,350.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Flight Data

### Read Data from CSV

In [48]:
to_raw = pd.read_csv('toslc.csv', index_col=0)
to_raw.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,UniqueCarrier,AirlineID,Carrier,TailNum,FlightNum,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2016,1,1,1,5,2016-01-01,AA,19805,AA,N3JAAA,79,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,2230,2316.0,46.0,46.0,1.0,3.0,2200-2259,11.0,2327.0,44.0,5.0,20,49.0,29.0,29.0,1.0,1.0,0001-0559,0.0,,0.0,170.0,153.0,137.0,1.0,989.0,4,0.0,0.0,0.0,0.0,29.0,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2016,1,1,2,6,2016-01-02,AA,19805,AA,N3ALAA,79,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,2230,2231.0,1.0,1.0,0.0,0.0,2200-2259,18.0,2249.0,10.0,47.0,20,57.0,37.0,37.0,1.0,2.0,0001-0559,0.0,,0.0,170.0,206.0,141.0,1.0,989.0,4,1.0,0.0,36.0,0.0,0.0,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2016,1,1,3,7,2016-01-03,AA,19805,AA,N3EPAA,79,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,2230,2335.0,65.0,65.0,1.0,4.0,2200-2259,11.0,2346.0,58.0,12.0,20,110.0,50.0,50.0,1.0,3.0,0001-0559,0.0,,0.0,170.0,155.0,132.0,1.0,989.0,4,0.0,0.0,0.0,0.0,50.0,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2016,1,1,4,1,2016-01-04,AA,19805,AA,N3GLAA,79,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,2230,2227.0,-3.0,0.0,0.0,-1.0,2200-2259,14.0,2241.0,2349.0,6.0,20,2355.0,-25.0,0.0,0.0,-2.0,0001-0559,0.0,,0.0,170.0,148.0,128.0,1.0,989.0,4,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2016,1,1,5,2,2016-01-05,AA,19805,AA,N3KSAA,79,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,2230,2225.0,-5.0,0.0,0.0,-1.0,2200-2259,12.0,2237.0,2358.0,3.0,18,1.0,-17.0,0.0,0.0,-2.0,0001-0559,0.0,,0.0,168.0,156.0,141.0,1.0,989.0,4,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Select Relevant Features

In [49]:
to_select = to_raw[['Month','DayofMonth','DayOfWeek','AirlineID','TailNum','FlightNum',
                    'OriginAirportID','CRSDepTime','FlightDate','CRSArrTime','Distance',
                    'ArrDel15']]
to_select.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,AirlineID,TailNum,FlightNum,OriginAirportID,CRSDepTime,FlightDate,CRSArrTime,Distance,ArrDel15
0,1,1,5,19805,N3JAAA,79,11298,2230,2016-01-01,20,989.0,1.0
1,1,2,6,19805,N3ALAA,79,11298,2230,2016-01-02,20,989.0,1.0
2,1,3,7,19805,N3EPAA,79,11298,2230,2016-01-03,20,989.0,1.0
3,1,4,1,19805,N3GLAA,79,11298,2230,2016-01-04,20,989.0,0.0
4,1,5,2,19805,N3KSAA,79,11298,2230,2016-01-05,18,989.0,0.0


### Drop Missing Values

In [51]:
to = to_select.dropna(axis=0, how='any')

### Feature Encoding 

Change circraft tail number to numberic.

In [52]:
le = preprocessing.LabelEncoder()
code = le.fit_transform(to['TailNum'])

In [53]:
to['TailNum'] = code

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Convert departure time and arriving time to 4-digit string.

In [54]:
to['CRSDepTime'] = to['CRSDepTime'].apply(lambda x:str(x).zfill(4))
to['CRSArrTime'] = to['CRSArrTime'].apply(lambda x:str(x).zfill(4))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Create another feature "DateTime" by concatenating flight date and arring time for joining with weather data.

In [55]:
to['DateTime'] = to['FlightDate'].astype(str)+'T'+to['CRSArrTime'].astype(str)
to.drop('FlightDate', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [56]:
to['DateTime'] = to['DateTime'].apply(lambda x:datetime.strptime(x, '%Y-%m-%dT%H%M'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Convert departure time and arriving time to numeric.

In [57]:
to['CRSDepTime'] = to['CRSDepTime'].apply(pd.to_numeric, errors='raise')
to['CRSArrTime'] = to['CRSArrTime'].apply(pd.to_numeric, errors='raise')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[Important]

For flights with an arriving time between 12:00am and 05:00am, add one day from the original datetime to make sure we assign the corresponding weather info to them. It is because the original date is the departure date which is the previous day of the arriving date.

In [58]:
to1 = to[to['CRSArrTime']>500]

In [59]:
to2 = to[to['CRSArrTime']<500]

In [60]:
to2['DateTime'] = to2['DateTime'] + timedelta(days=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
to = pd.concat([to1, to2])
to = to.sort_index()

In [62]:
to.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,AirlineID,TailNum,FlightNum,OriginAirportID,CRSDepTime,CRSArrTime,Distance,ArrDel15,DateTime
0,1,1,5,19805,954,79,11298,2230,20,989.0,1.0,2016-01-02 00:20:00
1,1,2,6,19805,789,79,11298,2230,20,989.0,1.0,2016-01-03 00:20:00
2,1,3,7,19805,879,79,11298,2230,20,989.0,1.0,2016-01-04 00:20:00
3,1,4,1,19805,920,79,11298,2230,20,989.0,0.0,2016-01-05 00:20:00
4,1,5,2,19805,991,79,11298,2230,18,989.0,0.0,2016-01-06 00:18:00


# Join 

Round-up the arriving time to every 5 minutes in order to join with weather data.

In [63]:
to['DateTime'] = to['DateTime'].apply(lambda x:x - timedelta(minutes=x.minute % 5))

Join the flight operation data and weather data.

In [64]:
join = pd.merge(to, wx, left_on='DateTime', right_on='Date_Time', how='inner')
join = join.rename(columns={'ArrDel15':'temp'})
join['ArrDel15'] = join['temp']
join = join.drop(['DateTime', 'Date_Time', 'temp'], axis=1)

In [65]:
join.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,AirlineID,TailNum,FlightNum,OriginAirportID,CRSDepTime,CRSArrTime,Distance,air_temp_set_1,dew_point_temperature_set_1d,wind_speed_set_1,wind_direction_set_1,cloud_layer_1_code_set_1,visibility_set_1,wind_gust_set_1,snow_depth_set_1,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1,ceiling_set_1,Blowing Dust,Blowing Snow,Clear,Fog,Frz Rain,Hail,Haze,Heavy Rain,Heavy Rain/Thunderstorm,Heavy Snow,Ice Fog,Ice pellets,Light Frz Rain,Light Ice Pellets,Light Rain,Light Rain/Thunderstorm,Light Snow,Light Snow Pellets,Mostly Clear,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Snow,Snow Pellets,Squalls,Thunder,Thunderstorm,Unknown Precip,Light Snow Shower,ArrDel15
0,1,1,5,19805,954,79,11298,2230,20,989.0,14.0,4.85,2.99,290.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
1,1,2,6,19805,789,79,11298,2230,20,989.0,19.94,10.79,0.0,0.0,1.0,10.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
2,1,3,7,19805,879,79,11298,2230,20,989.0,24.98,17.84,4.0,340.0,1.0,8.0,0.0,0.0,1.0,1.0,50000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
3,1,4,1,19805,920,79,11298,2230,20,989.0,28.04,21.82,0.0,0.0,652.0,5.0,0.0,0.0,804.0,1.0,8000.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,1,4,1,19393,1124,883,13796,2145,20,588.0,28.04,21.82,0.0,0.0,652.0,5.0,0.0,0.0,804.0,1.0,8000.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0


# Prediction

In [None]:
total = join.as_matrix()

In [None]:
X = total[:,:-1]
y = total[:,-1]

In [None]:
trainX, testX, trainy, testy = model_selection.train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier(max_depth=4)
predy = dtc.fit(trainX, trainy).predict(testX)

In [None]:
print('Accuracy: {0:.5f}'.format((predy==testy).sum() / testy.size))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=30, max_depth=20)
predy = rfc.fit(trainX, trainy).predict(testX)

In [None]:
print('Accuracy: {0:.5f}'.format((predy==testy).sum() / testy.size))

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
perc = Perceptron()
predy = perc.fit(trainX, trainy).predict(testX)

In [None]:
print('Accuracy: {0:.5f}'.format((predy==testy).sum() / testy.size))

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()
predy = gnb.fit(trainX, trainy).predict(testX)

In [None]:
print('Accuracy: {0:.5f}'.format((predy==testy).sum() / testy.size))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlpc = MLPClassifier()
predy = mlpc.fit(trainX, trainy).predict(testX)

In [None]:
print('Accuracy: {0:.5f}'.format((predy==testy).sum() / testy.size))