## Load Libraries

In [76]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from pylab import figure, show
import holidays
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import glob
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from sklearn.grid_search import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

## Concatenate data from each month into one big dataframe

In [77]:
path ="C:\\Data\\Airline\\" # use your path
allFiles = glob.glob(path + "/*.csv")
delay = pd.DataFrame()
list_ = []

#Concatenate all the .csv files in path into one data frame
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
delay = pd.concat(list_)
delay.shape

(5819079, 110)

## Load dataset with plane model information and dataset with airport (latitude, longitude) info

In [78]:
#Load the dataset with plane model information
planes = pd.read_csv("C:\\Data\\plane-data.csv")

#Change the "tailnum" column to "TailNum"
planes=planes.rename(columns = {'tailnum':'TailNum'})

print ("Length of dataframe - ", len(planes))
planes.tail()

Length of dataframe -  5029


Unnamed: 0,TailNum,type,manufacturer,issue_date,model,status,aircraft_type,engine_type,year
5024,N997DL,Corporation,MCDONNELL DOUGLAS AIRCRAFT CO,03/11/1992,MD-88,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1992
5025,N998AT,Corporation,BOEING,01/23/2003,717-200,Valid,Fixed Wing Multi-Engine,Turbo-Fan,2002
5026,N998DL,Corporation,MCDONNELL DOUGLAS CORPORATION,04/02/1992,MD-88,Valid,Fixed Wing Multi-Engine,Turbo-Jet,1992
5027,N999CA,Foreign Corporation,CANADAIR,07/09/2008,CL-600-2B19,Valid,Fixed Wing Multi-Engine,Turbo-Jet,1998
5028,N999DN,Corporation,MCDONNELL DOUGLAS CORPORATION,04/02/1992,MD-88,Valid,Fixed Wing Multi-Engine,Turbo-Jet,1992


In [79]:
#Load the dataset with plane model information
airport = pd.read_csv("C:\\Data\\airports.csv")

print ("Length of dataframe - ", len(airport))
airport.head()

Length of dataframe -  3376


Unnamed: 0,iata,airport,city,state,country,lat,long
0,00M,Thigpen,Bay Springs,MS,USA,31.953765,-89.234505
1,00R,Livingston Municipal,Livingston,TX,USA,30.685861,-95.017928
2,00V,Meadow Lake,Colorado Springs,CO,USA,38.945749,-104.569893
3,01G,Perry-Warsaw,Perry,NY,USA,42.741347,-78.052081
4,01J,Hilliard Airpark,Hilliard,FL,USA,30.688012,-81.905944


## Add holiday feature

In [80]:
#Add a holiday variable to the delay dataset based on the variable "FlightDate"
us_holidays = holidays.UnitedStates()
holiday = delay.FlightDate.apply(lambda x: x in us_holidays) # True/False
delay['Holiday'] = LabelEncoder().fit_transform(holiday)
us_holidays

{datetime.date(2015, 1, 1): "New Year's Day",
 datetime.date(2015, 1, 19): 'Martin Luther King, Jr. Day',
 datetime.date(2015, 2, 16): "Washington's Birthday",
 datetime.date(2015, 5, 25): 'Memorial Day',
 datetime.date(2015, 7, 3): 'Independence Day (Observed)',
 datetime.date(2015, 7, 4): 'Independence Day',
 datetime.date(2015, 9, 7): 'Labor Day',
 datetime.date(2015, 10, 12): 'Columbus Day',
 datetime.date(2015, 11, 11): 'Veterans Day',
 datetime.date(2015, 11, 26): 'Thanksgiving',
 datetime.date(2015, 12, 25): 'Christmas Day'}

## Merge all three datasets into one

In [81]:
#Merge the the delay dataset with the planes dataset by "TailNum"
delay_planes = delay.merge(planes, left_on='TailNum', right_on='TailNum', how='inner')

#Merge the airport information to get the Lat-Long information on the origin airports
delay_planes.rename(columns={'Origin' : 'iata'}, inplace=True)
delay_planes_origin = pd.merge(delay_planes, airport, on='iata', how = 'inner')
delay_planes_origin.rename(columns={'iata' : 'Origin', 'airport' : 'Origin_airport', 'city' : 'Origin_city', 'state' : 'Origin_state', 'country' : 'Origin_country', 'lat' : 'Origin_lat', 'long' : 'Origin_long'}, inplace='True')

#Merge the airport information to get the Lat-Long information on the destination airports
delay_planes_origin.rename(columns={'Dest' : 'iata'}, inplace=True)
final = pd.merge(delay_planes_origin, airport, on='iata', how = 'inner')
final.rename(columns={'iata' : 'Dest', 'airport' : 'Dest_airport', 'city' : 'Dest_city', 'state' : 'Dest_state', 'country' : 'Dest_country', 'lat' : 'Dest_lat', 'long' : 'Dest_long'}, inplace='True')

In [82]:
final = final.rename(columns = {'year':'PlaneYear', 'manufacturer':'Manufacturer', 'model':'PlaneModel', 'engine_type':'EngineType'})
final = final[['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'Holiday', 'Carrier', 'DepTime', 'FlightNum', 
               'Origin', 'Dest', 'DistanceGroup', 'PlaneYear', 'Manufacturer', 'PlaneModel', 'EngineType', 
               'Origin_lat', 'Origin_long', 'Dest_lat', 'Dest_long', 'DepDelay', 'DepDel15']]
final.head(10)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Holiday,Carrier,DepTime,FlightNum,Origin,...,PlaneYear,Manufacturer,PlaneModel,EngineType,Origin_lat,Origin_long,Dest_lat,Dest_long,DepDelay,DepDel15
0,2015,1,1,1,4,1,AA,1754,5,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,289,1
1,2015,1,1,19,1,1,AA,1051,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,-4,0
2,2015,4,10,2,5,0,AA,1046,5,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,1,0
3,2015,4,10,9,5,0,AA,1042,5,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,-3,0
4,2015,4,10,1,4,0,AA,918,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,13,0
5,2015,4,10,3,6,0,AA,900,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,-5,0
6,2015,4,10,12,1,1,AA,905,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,0,0
7,2015,4,10,17,6,0,AA,1128,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,143,1
8,2015,4,10,19,1,0,AA,905,123,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,0,0
9,2015,4,11,23,1,0,AA,1048,5,DFW,...,1978,PIPER,PA-32RT-300,Reciprocating,32.895951,-97.0372,21.318691,-157.922407,-7,0


## Visualize top 100 most popular flight route on the map

In [58]:
from IPython.display import HTML
import folium

def embed_map(map, path="map.html"):
    """
    Embeds a linked iframe to the map into the IPython notebook.
    
    Note: this method will not capture the source of the map into the notebook.
    This method should work for all maps (as long as they use relative urls).
    """
    map.create_map(path=path)
    return HTML('<iframe src="files/{path}" style="width: 100%; height: 510px; border: none"></iframe>'.format(path=path))

In [70]:
#Aggregate to find most popular route
popular_route=final.groupby(['Origin','Dest','Origin_lat','Origin_long','Dest_lat','Dest_long'])['FlightNum'].agg([len]).reset_index()

popular_route.reset_index(inplace=True)

popular_route=popular_route.sort_values(by='len', ascending=False)

In [75]:
#Plot the 100 most popular routes on the map
m = folium.Map(location=[41.9, -97.3], zoom_start=4)

flag=0

for index, row in popular_route.iterrows():
    orig=np.array([row['Origin_lat'],row['Origin_long']], dtype=float)
    dest=np.array([row['Dest_lat'],row['Dest_long']], dtype=float)
    flag=flag+1
    #  Create the map and add the line
    lines = folium.features.PolyLine([orig,dest], color='#883399', weight=2)
    m.add_children(lines)
    if flag==100:
        break

embed_map(m)



# Prediction model

In [83]:
#Drop rows with NaN values
final = final.dropna()
print (final.shape)

(3588437, 22)


## Shuffle the rows

In [84]:
# Set the randomizer seed so results are the same each time
np.random.seed(0)

#Shuffle the order of the rows
final = final.reindex(np.random.permutation(final.index))

#Check that the data frame is properly shuffled
final.head(10)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Holiday,Carrier,DepTime,FlightNum,Origin,...,PlaneYear,Manufacturer,PlaneModel,EngineType,Origin_lat,Origin_long,Dest_lat,Dest_long,DepDelay,DepDel15
289470,2015,1,3,17,2,0,EV,1029,5921,ROA,...,2000,EMBRAER,EMB-145LR,Turbo-Fan,37.325468,-79.975428,41.979595,-87.904464,7,0
85630,2015,4,11,1,7,0,UA,1622,529,SNA,...,2001,AIRBUS INDUSTRIE,A319-131,Turbo-Jet,33.675659,-117.868223,37.619002,-122.374843,-4,0
2342714,2015,3,9,10,4,0,DL,556,2535,XNA,...,1993,MCDONNELL DOUGLAS AIRCRAFT CO,MD-88,Turbo-Fan,36.281869,-94.306811,33.640444,-84.426944,-4,0
861835,2015,3,8,10,1,0,B6,939,518,JFK,...,2006,EMBRAER,ERJ 190-100 IGW,Turbo-Fan,40.639751,-73.778926,42.364348,-71.005179,-9,0
1395031,2015,1,1,30,5,0,US,826,1937,TPA,...,2001,AIRBUS INDUSTRIE,A321-211,Turbo-Jet,27.975472,-82.53325,35.214011,-80.943126,-4,0
2294673,2015,2,4,8,3,0,DL,541,2260,PBI,...,1986,BOEING,757-232,Turbo-Jet,26.683162,-80.095594,33.640444,-84.426944,-1,0
189327,2015,2,6,24,3,0,UA,856,1252,LAX,...,1994,BOEING,757-224,Turbo-Jet,33.942536,-118.408074,41.979595,-87.904464,6,0
1049027,2015,3,7,2,4,0,UA,2018,465,MCO,...,1994,AIRBUS INDUSTRIE,A320-232,Turbo-Jet,28.428889,-81.316028,39.858408,-104.667002,56,1
1024579,2015,3,7,13,1,0,DL,2119,1657,ATL,...,2002,BOEING,757-351,Turbo-Jet,33.640444,-84.426944,26.072583,-80.15275,19,1
773095,2015,2,5,21,4,0,DL,1025,2069,ATL,...,1988,BOEING,757-251,Turbo-Jet,33.640444,-84.426944,28.428889,-81.316028,30,1


## Recode categorical variables to numerical variables

In [85]:
#Recode string variables into numerical categorical variables
enc = LabelEncoder()
final['Carrier'] = enc.fit_transform(final.Carrier)
final['Origin'] = enc.fit_transform(final.Origin)
final['Dest'] = enc.fit_transform(final.Dest)
final['PlaneYear'] = enc.fit_transform(final.PlaneYear)
final['Manufacturer'] = enc.fit_transform(final.Manufacturer)
final['PlaneModel'] = enc.fit_transform(final.PlaneModel)
final['EngineType'] = enc.fit_transform(final.EngineType)

## Split dataset into train, dev, and test set

In [97]:
#Separate the dataset into input and output columns
X = final[['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Holiday', 'Carrier', 'DepTime', 'FlightNum', 
           'Origin', 'Dest', 'DistanceGroup', 'PlaneYear', 'Manufacturer', 'PlaneModel', 'EngineType']]
Y = final.DepDel15

test_data, test_labels = X[2500000:], Y[2500000:]
dev_data, dev_labels = X[2500000:2510000], Y[2500000:2510000]
train_data, train_labels = X[:2500000], Y[:2500000]

In [98]:
#Check to see if the data is in the right shape
print ('Train data shape: ', train_data.shape)
print ('Test data shape: ', test_data.shape)

#Check percentage of delays in dev and test data, accuracy must be greater than these numbers
print ('Accuracy to beat(dev): ', 1-np.mean(dev_labels))
print ('Accuracy to beat(test): ', 1-np.mean(test_labels))

Train data shape:  (2500000, 15)
Test data shape:  (1088437, 15)
Accuracy to beat(dev):  0.8142
Accuracy to beat(test):  0.8175356038061918


## Random forest with random gridsearch cross-validation

In [86]:
# This seed is required for each gridsearch to be identical
np.random.seed(0)

# Define the ML pipe with cross validation and parameters to optimize
pipe = Pipeline([("RF", RandomForestClassifier(n_jobs=1))])

params = {"RF__n_estimators": sp_randint(100, 200),
          "RF__max_depth": sp_randint(15, 50),
          "RF__min_samples_split": sp_randint(25, 45),
          "RF__min_samples_leaf": sp_randint(1, 30),
          "RF__max_leaf_nodes": sp_randint(5000, 7000),
          "RF__max_features": ["auto", .25, .5, .3]
         }

#Run a grid-search that chooses random paramaters based on the given range
gridsearch = RandomizedSearchCV(pipe, params, fit_params=None, n_iter=10, scoring="accuracy", cv=2, n_jobs=1)
gridsearch.fit(train_data, train_labels)

RandomizedSearchCV(cv=2, error_score='raise',
          estimator=Pipeline(steps=[('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'RF__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000002868DDAC8>, 'RF__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000002868DDE80>, 'RF__max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000... 'RF__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000028F43A0B8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          sco

In [87]:
#Check the accuracy and f1-score on the dev_data
pred = gridsearch.predict(dev_data)
print ("Accuracy:", metrics.accuracy_score(dev_labels, pred))
print ("F1 score:", metrics.f1_score(dev_labels, pred))

Accuracy: 0.8366
F1 score: 0.214423076923


In [95]:
rf = RandomForestClassifier(n_estimators=500, max_depth=20)
rf.fit(train_data, train_labels)
print ("Accuracy", rf.score(dev_data, dev_labels))
pred = rf.predict(dev_data)
print ("F1 score", metrics.f1_score(dev_labels, pred))

Accuracy 0.8399
F1 score 0.229918229918


## Assess predictive power of each feature

In [96]:
feature_importance = pd.DataFrame(rf.feature_importances_, columns=["Feature Importance"])
feature_importance.index = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Holiday', 'Carrier', 'DepTime', 'FlightNum', 
                            'Origin', 'Dest', 'DistanceGroup', 'PlaneYear', 'Manufacturer', 'PlaneModel', 'EngineType']
feature_importance.sort_values(by='Feature Importance', ascending=False)

Unnamed: 0,Feature Importance
DepTime,0.317024
FlightNum,0.117331
DayofMonth,0.092098
Origin,0.082699
Dest,0.080193
Month,0.067544
PlaneYear,0.057574
DayOfWeek,0.049461
DistanceGroup,0.047216
PlaneModel,0.038338


## Prediction example

In [99]:
predict = np.asarray([2015, 3, 4, 3, 0, 5, 1200, 111, 
                      25, 30, 11, 10, 1, 7, 5])

print ("Probability of delay:" , rf.predict_proba(predict.reshape(1,15))[0][1])

Probability of delay: 0.406014214606


## Future goals
1. Deploy the prediction model onto an API where users can enter information on their upcoming flight and get a probability that the flight will be delayed. (See example above)

2. Identify a robust weather data source and incorporate that into our existing model.

3. Improve our current prediction model through future machine learning coursework in the MIDS program.