In [158]:
import pandas as pd
import numpy as np
import os
import datetime

#Load flight data
flights_1 = pd.read_csv('flights.csv', header=0, sep=',')

#load airline data
airlines = pd.read_csv('airlines.csv', header=0, sep=',')


#merge airline data to flight data
flights_2 = pd.merge(flights_1, 
                     airlines, 
                     how='left', 
                     left_on='AIRLINE',
                     right_on='IATA_CODE')


#load airport data
airports = pd.read_csv('airports.csv', header=0, sep=',')

#merge airport data to flight data
flights_3 = pd.merge(flights_2, 
                     airports, 
                     how='left', 
                     left_on='ORIGIN_AIRPORT', 
                     right_on='IATA_CODE')


flights_4 = pd.merge(flights_3, 
                     airports, 
                     how='left', 
                     left_on='DESTINATION_AIRPORT', 
                     right_on='IATA_CODE', 
                     suffixes=('_ORIGIN', '_DESTINATION'))

#drop duplicate columns from merging
flights = flights_4.drop(['IATA_CODE_y','IATA_CODE_x','IATA_CODE'],axis = 1)

#rename columns changed in merging
flights = flights.rename(columns = {'AIRLINE_x':'AIRLINE_CODE', 'AIRLINE_y':'AIRLINE_NAME'})

#replace NaN with 0
flights = flights.fillna(0)




  interactivity=interactivity, compiler=compiler, result=result)


In [159]:
#Create name friendly columns
DayOfWeek = {1: 'Monday', 
             2: 'Tuesday', 
             3: 'Wednesday', 
             4: 'Thursday', 
             5: 'Friday', 
             6: 'Saturday', 
             7: 'Sunday'}

MonthName = {1: 'January', 
              2: 'February', 
              3: 'March', 
              4: 'April', 
              5: 'May', 
              6: 'June', 
              7: 'July', 
              8: 'August', 
              9: 'September', 
              10: 'October', 
              11: 'November', 
              12: 'December'}

CancellationReason = {'A': 'Airline/Carrier',
                      'B': 'Weather',
                      'C': 'National Air System',
                      'D': 'Security'}

flights['Day_Of_Week'] = flights['DAY_OF_WEEK'].map(lambda x: DayOfWeek[x])
flights['Month_Name'] = flights['MONTH'].map(lambda x: MonthName[x])
flights['Cancellation_Reason'] = flights['CANCELLATION_REASON'].map(lambda x: 'N/A' if x == 0 else CancellationReason[x])


#make dataframe for only cancelled flights
flights_c = flights[flights['CANCELLED']==1]

#make dataframe for flights with departure delays
flights_ddelays = flights[flights['DEPARTURE_DELAY'] > 0]

#make dataframe for flights with arrival delays
flights_adelays = flights[flights['ARRIVAL_DELAY'] > 0]


#makes hourly bins for departure and arrival times format "0000" 
hour_bins = np.arange(0,2500,100)
hour_bins

Hours = np.arange(0,2400,100)

time_columns = ['SCHEDULED_DEPARTURE','DEPARTURE_TIME', 'SCHEDULED_TIME', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME']

for i in time_columns:
    flights[i.lower()] = pd.cut(flights[i], 
                                        hour_bins, 
                                        labels=Hours)
    if i[0] == 'S':
        flights[i + '_SINE'] = flights[i].apply(lambda x: np.sin(2*np.pi*x/2400))
        flights[i + '_COSINE'] = flights[i].apply(lambda x: np.cos(2*np.pi*x/2400))


In [173]:
# Weather data processing
directory = os.getcwd() + os.sep + 'weather_data' + os.sep
file_after = '_201501010000_201601010000.txt'
all_weather_list = []

for IATA in airports['IATA_CODE'].unique():
    airport_weather = pd.read_csv(directory + IATA + file_after, skiprows = 5,
                                 na_values = 'M', parse_dates = ['valid'])
    
    airport_weather['IATA'] = IATA
    airport_weather['MONTH'] = airport_weather['valid'].apply(lambda x: x.month).astype(int)
    airport_weather['DAY'] = airport_weather['valid'].apply(lambda x: x.day).astype(int)
    airport_weather['HOUR'] = airport_weather['valid'].apply(lambda x: x.hour * 100).astype(int)
    airport_weather['hour'] = pd.cut(airport_weather['HOUR'], hour_bins, labels = Hours, include_lowest = True)

    airport_weather.drop(labels = ['station',' mslp',' skyc2',' skyc3',' skyc4',' skyl1',
                                   ' skyl2',' skyl3',' skyl4',' presentwx',' metar'],
                                     axis = 1, inplace = True)
    #Take the first instance when there are multiple overlapping points for one interval
    airport_weather.drop_duplicates(['MONTH','DAY','hour'], inplace = True)
    all_weather_list.append(airport_weather)

weather = pd.concat(all_weather_list)

#sub NA values
weather[' gust'].fillna(0, inplace = True)
for col in ['tmpf',' dwpf',' relh',' drct',' sknt',' p01i',' alti',' vsby']:
    weather[col].fillna(-1, inplace = True)

In [179]:
#Merge the weather data to the relevant airports
flights_weather = pd.merge(flights,
                           weather,
                           how = 'left',
                           left_on = ['ORIGIN_AIRPORT', 'MONTH', 'DAY','departure_time'],
                           right_on = ['IATA', 'MONTH', 'DAY', 'hour'])

flights_weather = pd.merge(flights_weather,
                           weather,
                           how = 'left',
                           left_on = ['DESTINATION_AIRPORT', 'MONTH', 'DAY','departure_time'],
                           right_on = ['IATA', 'MONTH', 'DAY', 'hour'],
                           suffixes = ('_ORIGIN', '_DESTINATION'))


In [181]:
flights_weather.isnull().sum()

YEAR                        0
MONTH                       0
DAY                         0
DAY_OF_WEEK                 0
AIRLINE_CODE                0
FLIGHT_NUMBER               0
TAIL_NUMBER                 0
ORIGIN_AIRPORT              0
DESTINATION_AIRPORT         0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME              0
DEPARTURE_DELAY             0
TAXI_OUT                    0
WHEELS_OFF                  0
SCHEDULED_TIME              0
ELAPSED_TIME                0
AIR_TIME                    0
DISTANCE                    0
WHEELS_ON                   0
TAXI_IN                     0
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME                0
ARRIVAL_DELAY               0
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON         0
AIR_SYSTEM_DELAY            0
SECURITY_DELAY              0
AIRLINE_DELAY               0
LATE_AIRCRAFT_DELAY         0
                        ...  
lat_ORIGIN             644248
tmpf_ORIGIN            644248
 dwpf_ORIG

In [184]:
from sklearn.model_selection import train_test_split


X_base = flights[['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE','SCHEDULED_TIME', 'DISTANCE','SCHEDULED_ARRIVAL', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'scheduled_departure', 'scheduled_time','scheduled_arrival']]

X = flights[['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK','SCHEDULED_TIME', 'DISTANCE', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'SCHEDULED_DEPARTURE_SINE', 'SCHEDULED_DEPARTURE_COSINE',
            'SCHEDULED_ARRIVAL_SINE', 'SCHEDULED_ARRIVAL_COSINE']]

X_weather = flights_weather[['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK','SCHEDULED_TIME', 'DISTANCE', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'SCHEDULED_DEPARTURE_SINE', 'SCHEDULED_DEPARTURE_COSINE',
            'SCHEDULED_ARRIVAL_SINE', 'SCHEDULED_ARRIVAL_COSINE','tmpf_ORIGIN',' dwpf_ORIGIN',' relh_ORIGIN',' drct_ORIGIN',
                            ' sknt_ORIGIN',' p01i_ORIGIN',' alti_ORIGIN',' vsby_ORIGIN','tmpf_DESTINATION',' dwpf_DESTINATION',
                            ' relh_DESTINATION',' drct_DESTINATION',' sknt_DESTINATION',' p01i_DESTINATION',' alti_DESTINATION',' vsby_DESTINATION']]

Y_weather = flights_weather['CANCELLED']

Y = flights['CANCELLED']

X = X.fillna(0)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=10)

X_weather = X_weather.fillna(0)
X_w_train,X_w_test,Y_w_train,Y_w_test = train_test_split(X_weather,Y_weather,
                                                         test_size = 0.25, random_state = 10)

In [185]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics

#Logistic Regression
model_LR = LogisticRegression()
model_LR.fit(X_train, Y_train)

model_weather = LogisticRegression()
model_weather.fit(X_w_train, Y_w_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [186]:

print("Area under the LR ROC curve on the test data = %.3f"
      % metrics.roc_auc_score(y_score = model_LR.predict_proba(X_test)[:,-1:], y_true = Y_test))

print("Test Data Score = %.3f" % model_LR.score(X_test ,Y_test ))

print("Weather Data:")
print("Area under the LR ROC curve on the test data = %.3f"
      % metrics.roc_auc_score(y_score = model_weather.predict_proba(X_w_test)[:,-1:], y_true = Y_w_test))
print("Test Data Score = %.3f" % model_weather.score(X_w_test ,Y_w_test ))



Area under the LR ROC curve on the test data = 0.687
Test Data Score = 0.984
Weather Data:
Area under the LR ROC curve on the test data = 0.979
Test Data Score = 0.995
