In [221]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import math
import copy
import datetime
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn import tree
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')


In [222]:
############################################
# LOAD RAW DATA
############################################

all_df = pd.read_csv("flights_train.csv")
d = {'ARR_DELAY': all_df['ARR_DELAY']}
all_y = pd.DataFrame(data=d)


In [223]:
############################################
# PREP DATA - DROP CATEGORICAL COLS
############################################

day_of_year = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_yday for date_str in all_df['FL_DATE'] ]
month = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_mon for date_str in all_df['FL_DATE'] ]
all_x = all_df.drop(['UID','FL_NUM','AIRLINE_ID','ORIGIN_CITY_NAME','DISTANCE', 'FL_DATE', 'DEST_CITY_NAME','ORIGIN_STATE_ABR','DEST_STATE_ABR', 'FIRST_DEP_TIME'], 
                 axis=1)  
all_x['month'] = month
all_x['day_of_year'] = day_of_year

all_x = all_x.drop(['UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID'], axis=1)


In [224]:
############################################
# DECIDE WHETHER TO RE-ADD CATEGORICAL COLS USING OHE
############################################

ohe = True
if ohe:
    carrier_ohe = pd.get_dummies(all_df['UNIQUE_CARRIER'])
    carrier_ohe = carrier_ohe.astype('int64')
    origin_ohe = pd.get_dummies(all_df['ORIGIN'])
    origin_ohe = origin_ohe.astype('int64')
    dest_ohe = pd.get_dummies(all_df['DEST'])
    dest_ohe.rename(columns=lambda x: x+'_DEST', inplace=True)
    dest_ohe = dest_ohe.astype('int64')
    all_x = pd.concat([all_x, carrier_ohe, origin_ohe, dest_ohe], axis=1) 
    

In [225]:
############################################
# SHOW SAMPLE DATA
############################################
all_x.info()
all_x[0:2]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4911 entries, 0 to 4910
Columns: 488 entries, DAY_OF_WEEK to YUM_DEST
dtypes: int64(488)
memory usage: 18.3 MB


Unnamed: 0,DAY_OF_WEEK,CRS_DEP_TIME,TAXI_OUT,TAXI_IN,ACTUAL_ELAPSED_TIME,DISTANCE_GROUP,ARR_DELAY,month,day_of_year,AA,...,TTN_DEST,TUL_DEST,TUS_DEST,TVC_DEST,TWF_DEST,TYS_DEST,VPS_DEST,WRG_DEST,XNA_DEST,YUM_DEST
0,2,1020,11,8,373,11,-11,1,3,0,...,0,0,0,0,0,0,0,0,0,0
1,6,1220,13,9,183,5,1,1,28,1,...,0,0,0,0,0,0,0,0,0,0


In [226]:
def clean_data(all_df, train_x_unclean, s=1):
    
    s = s
    
    y_mean = all_df['ARR_DELAY'].mean()
    y_std = all_df['ARR_DELAY'].std()
    y_max = y_mean + (s * y_std)
    y_min = y_mean - (s * y_std)
    
    a = train_x_unclean['ARR_DELAY'] < y_max
    b = train_x_unclean['ARR_DELAY'] > y_min
    c = a & b
    
    train_x_clean = train_x_unclean[c]
    d = {'ARR_DELAY': train_x_clean['ARR_DELAY']}
    train_y = pd.DataFrame(data=d)
    train_x = train_x_clean.drop(['ARR_DELAY'], axis=1)
    
    return train_x, train_y


def dont_clean_data(all_df, train_x_unclean):
    
    d = {'ARR_DELAY': train_x_unclean['ARR_DELAY']}
    train_y = pd.DataFrame(data=d)
    train_x = train_x_unclean.drop(['ARR_DELAY'], axis=1)
    
    return train_x, train_y

In [230]:
################################################
# DECISION TREE
################################################

max_depth = 10
folds = 10

for depth in range(1,max_depth+1):

    kf = KFold(n_splits=folds, random_state=42)
    i = 0
    mses = []
    for train_index, valid_index in kf.split(all_x):
        i += 1
        train_x_unclean, valid_x = all_x.iloc[train_index], all_x.iloc[valid_index]
        valid_x = valid_x.drop(['ARR_DELAY'], axis=1)
        garbage, valid_y = all_y.iloc[train_index], all_y.iloc[valid_index]
        #train_x, train_y = clean_data(all_df, train_x_unclean, 1)
        train_x, train_y = dont_clean_data(all_df, train_x_unclean)
        clf = tree.DecisionTreeClassifier(max_depth=depth)
        clf = clf.fit(train_x, train_y)
        y_hats = clf.predict(valid_x)
        mse = mean_squared_error(y_hats, valid_y)
        mses.append(mse)
        print('MSE, fold '+str(i)+': '+ str(mse))
    print('Avg MSE, depth '+str(depth)+': '+ str(np.mean(mses)))


MSE, fold 1: 2022.821138211382
MSE, fold 2: 2054.7169042769856
MSE, fold 3: 1787.7087576374745
MSE, fold 4: 3248.28716904277
MSE, fold 5: 1655.8350305498982
MSE, fold 6: 3366.3503054989815
MSE, fold 7: 3772.1181262729124
MSE, fold 8: 1228.8248472505093
MSE, fold 9: 2688.600814663951
MSE, fold 10: 1476.6191446028513
Avg MSE, depth 1: 2330.1882238007715
MSE, fold 1: 2022.9837398373984
MSE, fold 2: 2054.773930753564
MSE, fold 3: 1787.81466395112
MSE, fold 4: 3248.401221995927
MSE, fold 5: 1655.8839103869655
MSE, fold 6: 3366.5458248472505
MSE, fold 7: 3772.3788187372706
MSE, fold 8: 1228.8248472505093
MSE, fold 9: 2688.7881873727088
MSE, fold 10: 1476.6109979633402
Avg MSE, depth 2: 2330.3006143096054
MSE, fold 1: 2023.6158536585365
MSE, fold 2: 2056.7596741344196
MSE, fold 3: 1792.8431771894093
MSE, fold 4: 3253.6395112016294
MSE, fold 5: 1661.1283095723015
MSE, fold 6: 3367.0570264765784
MSE, fold 7: 3783.3116089613036
MSE, fold 8: 1229.6476578411405
MSE, fold 9: 2690.7006109979634
MSE,

In [229]:
################################################
# LINEAR REGRESSION
################################################

folds = 10

kf = KFold(n_splits=folds, random_state=42)
i = 0
mses = []
for train_index, valid_index in kf.split(all_x):
    i += 1
    train_x_unclean, valid_x = all_x.iloc[train_index], all_x.iloc[valid_index]
    valid_x = valid_x.drop(['ARR_DELAY'], axis=1)
    garbage, valid_y = all_y.iloc[train_index], all_y.iloc[valid_index]
    train_x, train_y = clean_data(all_df, train_x_unclean, 1)
    #train_x, train_y = dont_clean_data(all_df, train_x_unclean)
    clf = LinearRegression()
    clf = clf.fit(train_x, train_y)
    y_hats = clf.predict(valid_x)
    mse = mean_squared_error(y_hats, valid_y)
    mses.append(mse)
    print('MSE, fold '+str(i)+': '+ str(mse))
print('Avg MSE: '+ str(np.mean(mses)))

MSE, fold 1: 1664.8933499724542
MSE, fold 2: 1871.8611017975697
MSE, fold 3: 1423.2541198946358
MSE, fold 4: 2.090346152519809e+18
MSE, fold 5: 5.604630129032821e+17
MSE, fold 6: 2.1763762597645683e+18
MSE, fold 7: 1.6959420635169472e+16
MSE, fold 8: 1021.5267952584369
MSE, fold 9: 8.86533759992918e+18
MSE, fold 10: 5.971175337694081e+19
Avg MSE: 7.342123582269282e+18


In [228]:
################################################
# MLP - NN
################################################

hidden_nodes = 100
learning_rate = 0.00001
print_epoch_info = False
folds = 10

kf = KFold(n_splits=folds, random_state=42)
i = 0
mses = []
for train_index, valid_index in kf.split(all_x):
    i += 1
    train_x_unclean, valid_x = all_x.iloc[train_index], all_x.iloc[valid_index]
    valid_x = valid_x.drop(['ARR_DELAY'], axis=1)
    garbage, valid_y = all_y.iloc[train_index], all_y.iloc[valid_index]
    train_x, train_y = clean_data(all_df, train_x_unclean, 1)
    #train_x, train_y = dont_clean_data(all_df, train_x_unclean)
    clf = MLPRegressor(hidden_layer_sizes=(hidden_nodes,), solver='sgd', alpha=learning_rate,
                    activation='logistic', verbose=print_epoch_info)
    clf = clf.fit(train_x, train_y)
    y_hats = clf.predict(valid_x)
    mse = mean_squared_error(y_hats, valid_y)
    mses.append(mse)
    print('MSE, fold '+str(i)+': '+ str(mse))
print('Avg MSE: '+ str(np.mean(mses)))


MSE, fold 1: 1878.2932395403113
MSE, fold 2: 1939.9792491879782
MSE, fold 3: 1597.9394558088934
MSE, fold 4: 2980.814120978744
MSE, fold 5: 1485.46412686411
MSE, fold 6: 3087.2847885856786
MSE, fold 7: 3515.40217627663
MSE, fold 8: 1086.2800782729435
MSE, fold 9: 2481.6833915924594
MSE, fold 10: 1375.6224981872297
Avg MSE: 2142.876312529498
