In [158]:
import pandas as pd
import numpy as np
import os
import datetime

#Load flight data
flights_1 = pd.read_csv('flights.csv', header=0, sep=',')

#load airline data
airlines = pd.read_csv('airlines.csv', header=0, sep=',')


#merge airline data to flight data
flights_2 = pd.merge(flights_1, 
                     airlines, 
                     how='left', 
                     left_on='AIRLINE',
                     right_on='IATA_CODE')


#load airport data
airports = pd.read_csv('airports.csv', header=0, sep=',')

#merge airport data to flight data
flights_3 = pd.merge(flights_2, 
                     airports, 
                     how='left', 
                     left_on='ORIGIN_AIRPORT', 
                     right_on='IATA_CODE')


flights_4 = pd.merge(flights_3, 
                     airports, 
                     how='left', 
                     left_on='DESTINATION_AIRPORT', 
                     right_on='IATA_CODE', 
                     suffixes=('_ORIGIN', '_DESTINATION'))

#drop duplicate columns from merging
flights = flights_4.drop(['IATA_CODE_y','IATA_CODE_x','IATA_CODE'],axis = 1)

#rename columns changed in merging
flights = flights.rename(columns = {'AIRLINE_x':'AIRLINE_CODE', 'AIRLINE_y':'AIRLINE_NAME'})

#replace NaN with 0
flights = flights.fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)


In [159]:
#makes hourly bins for departure and arrival times format "0000" 
hour_bins = np.arange(0,2500,100)
Hours = np.arange(0,2400,100)

time_columns = ['SCHEDULED_DEPARTURE','DEPARTURE_TIME', 'SCHEDULED_TIME', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME']

for i in time_columns:
    flights[i.lower()] = pd.cut(flights[i], 
                                        hour_bins, 
                                        labels=Hours)
    if i[0] == 'S':
        flights[i + '_SINE'] = flights[i].apply(lambda x: np.sin(2*np.pi*x/2400))
        flights[i + '_COSINE'] = flights[i].apply(lambda x: np.cos(2*np.pi*x/2400))

In [173]:
# Weather data processing
directory = os.getcwd() + os.sep + 'weather_data' + os.sep
file_after = '_201501010000_201601010000.txt'
all_weather_list = []

for IATA in airports['IATA_CODE'].unique():
    airport_weather = pd.read_csv(directory + IATA + file_after, skiprows = 5,
                                 na_values = 'M', parse_dates = ['valid'])
    
    airport_weather['IATA'] = IATA
    airport_weather['MONTH'] = airport_weather['valid'].apply(lambda x: x.month).astype(int)
    airport_weather['DAY'] = airport_weather['valid'].apply(lambda x: x.day).astype(int)
    airport_weather['HOUR'] = airport_weather['valid'].apply(lambda x: x.hour * 100).astype(int)
    airport_weather['hour'] = pd.cut(airport_weather['HOUR'], hour_bins, labels = Hours, include_lowest = True)

    airport_weather.drop(labels = ['station',' mslp',' skyc2',' skyc3',' skyc4',' skyl1',
                                   ' skyl2',' skyl3',' skyl4',' presentwx',' metar'],
                                     axis = 1, inplace = True)
    #Take the first instance when there are multiple overlapping points for one interval
    airport_weather.drop_duplicates(['MONTH','DAY','hour'], inplace = True)
    all_weather_list.append(airport_weather)

weather = pd.concat(all_weather_list)

#sub NA values
weather[' gust'].fillna(0, inplace = True)
for col in ['tmpf',' dwpf',' relh',' drct',' sknt',' p01i',' alti',' vsby']:
    weather[col].fillna(-1, inplace = True)

In [179]:
#Merge the weather data to the relevant airports
flights_weather = pd.merge(flights,
                           weather,
                           how = 'left',
                           left_on = ['ORIGIN_AIRPORT', 'MONTH', 'DAY','departure_time'],
                           right_on = ['IATA', 'MONTH', 'DAY', 'hour'])

flights_weather = pd.merge(flights_weather,
                           weather,
                           how = 'left',
                           left_on = ['DESTINATION_AIRPORT', 'MONTH', 'DAY','departure_time'],
                           right_on = ['IATA', 'MONTH', 'DAY', 'hour'],
                           suffixes = ('_ORIGIN', '_DESTINATION'))


In [196]:
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold

subset = flights[['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK','SCHEDULED_TIME', 'DISTANCE', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'SCHEDULED_DEPARTURE_SINE', 'SCHEDULED_DEPARTURE_COSINE',
            'SCHEDULED_ARRIVAL_SINE', 'SCHEDULED_ARRIVAL_COSINE','CANCELLED']]

weather_subset = flights_weather[['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK','SCHEDULED_TIME', 'DISTANCE', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'SCHEDULED_DEPARTURE_SINE', 'SCHEDULED_DEPARTURE_COSINE',
            'SCHEDULED_ARRIVAL_SINE', 'SCHEDULED_ARRIVAL_COSINE','tmpf_ORIGIN',' dwpf_ORIGIN',' relh_ORIGIN',' drct_ORIGIN',
            ' sknt_ORIGIN',' p01i_ORIGIN',' alti_ORIGIN',' vsby_ORIGIN','tmpf_DESTINATION',' dwpf_DESTINATION',
            ' relh_DESTINATION',' drct_DESTINATION',' sknt_DESTINATION',' p01i_DESTINATION',' alti_DESTINATION',' vsby_DESTINATION', 'CANCELLED']]

weather_subset = weather_subset.dropna()

X = subset.drop('CANCELLED', axis = 1)
X_weather = weather_subset.drop('CANCELLED', axis = 1)

Y = subset['CANCELLED']
Y_weather = weather_subset['CANCELLED']

#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
#                                   test_size=0.25, random_state=10)

#X_w_train,X_w_test,Y_w_train,Y_w_test = train_test_split(X_weather,Y_weather,
#                                       test_size = 0.25, random_state = 10)

kfolds = KFold(X.shape[0], n_folds = 5)
kfolds_weather = KFold(X_weather.shape[0], n_folds = 5)

In [198]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

steps = [('scaler', StandardScaler()),
        ('lr',LogisticRegression())]

pipeline = Pipeline(steps)

parameters = {'lr__C' : [10**i for i in range(-3,3)],
             'lr__penalty': ['l1','l2']}

LR_grid_search = GridSearchCV(pipeline, param_grid = parameters, cv = kfolds, scoring = 'roc_auc')
LR_grid_search.fit(X, Y)


LR_weather_grid_search = GridSearchCV(pipeline, param_grid = parameters, cv = kfolds_weather, scoring = 'roc_auc')
LR_weather_grid_search.fit(X_weather, Y_weather)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=5159515, n_folds=5, shuffle=False, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [202]:
print("Best CV AUC = %.3f" % LR_grid_search.best_score_)

print("Weather Data:")
print("Best CV AUC = %.3f" % LR_weather_grid_search.best_score_)



Best CV AUC = 0.621
Weather Data:
Best CV AUC = 0.654
