In [333]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import math
import copy
import datetime
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn import tree
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')


In [334]:
############################################
# LOAD RAW DATA
############################################

all_df = pd.read_csv("flights_train.csv")
d = {'ARR_DELAY': all_df['ARR_DELAY']}
all_y = pd.DataFrame(data=d)


In [335]:
############################################
# PREP DATA - DROP CATEGORICAL COLS
############################################

day_of_year = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_yday for date_str in validation_df['FL_DATE'] ]
month = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_mon for date_str in validation_df['FL_DATE'] ]
all_x = all_df.drop(['UID','FL_NUM','AIRLINE_ID','ORIGIN_CITY_NAME','DISTANCE', 'FL_DATE', 'DEST_CITY_NAME','ORIGIN_STATE_ABR','DEST_STATE_ABR', 'FIRST_DEP_TIME'], 
                 axis=1)  
all_x['month'] = month
all_x['day_of_year'] = day_of_year

all_x = all_x.drop(['UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID'], axis=1)


In [336]:
############################################
# DECIDE WHETHER TO RE-ADD CATEGORICAL COLS USING OHE
############################################

ohe = False
if ohe:
    carrier_ohe = pd.get_dummies(all_df['UNIQUE_CARRIER'])
    carrier_ohe = carrier_ohe.astype('int64')
    origin_ohe = pd.get_dummies(all_df['ORIGIN'])
    origin_ohe = origin_ohe.astype('int64')
    dest_ohe = pd.get_dummies(all_df['DEST'])
    dest_ohe.rename(columns=lambda x: x+'_DEST', inplace=True)
    dest_ohe = dest_ohe.astype('int64')
    all_x = pd.concat([all_x, carrier_ohe, origin_ohe, dest_ohe], axis=1) 
    

In [337]:
############################################
# SHOW SAMPLE DATA
############################################
all_x.info()
all_x[0:2]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4911 entries, 0 to 4910
Data columns (total 9 columns):
DAY_OF_WEEK            4911 non-null int64
CRS_DEP_TIME           4911 non-null int64
TAXI_OUT               4911 non-null int64
TAXI_IN                4911 non-null int64
ACTUAL_ELAPSED_TIME    4911 non-null int64
DISTANCE_GROUP         4911 non-null int64
ARR_DELAY              4911 non-null int64
month                  4911 non-null int64
day_of_year            4911 non-null int64
dtypes: int64(9)
memory usage: 345.4 KB


Unnamed: 0,DAY_OF_WEEK,CRS_DEP_TIME,TAXI_OUT,TAXI_IN,ACTUAL_ELAPSED_TIME,DISTANCE_GROUP,ARR_DELAY,month,day_of_year
0,2,1020,11,8,373,11,-11,1,3
1,6,1220,13,9,183,5,1,1,28


In [338]:
############################################
# CLEAN DATA
############################################

# choose method 1 or 2
method = 1

# METHOD 1
# remove rows with label more than 's' standard deviations from mean
# note: don't forget to set s!
if method == 1:
    
    s = 1  # getting best results with s = 1
    
    y_mean = all_df['ARR_DELAY'].mean()
    y_std = all_df['ARR_DELAY'].std()
    y_max = y_mean + (s * y_std)
    y_min = y_mean - (s * y_std)
    
    a = all_x['ARR_DELAY'] < y_max
    b = all_x['ARR_DELAY'] > y_min
    c = a & b
    
    all_x_clean = all_x[c]
    d = {'ARR_DELAY': all_x_clean['ARR_DELAY']}
    all_y = pd.DataFrame(data=d)
    all_x_clean = all_x_clean.drop(['ARR_DELAY'], axis=1)


# METHOD 2
# select the k best features 
# note: currently, can choose ohe features, unless you don't include ohe
# note: don't forget to set k!
elif method == 2:

    k = 5

    all_x_temp = all_x.drop(['ARR_DELAY'], axis=1)
    b = SelectKBest(mutual_info_regression, k=k).fit(all_x_temp, all_y)
    b.get_support()
    all_x_clean = b.transform(all_x_temp)


# note: consider combining both methods


In [339]:
############################################
# SPLIT INTO TRAINING/VALID/TEST
############################################

num_examples = all_x_clean.shape[0]

# 80% train - 20% valid - 0% test
percent_train = 0.8
half = 0.5

cutoff1 = int(num_examples * percent_train)
cutoff2 = cutoff1 + int(num_examples * percent_valid)

train_x, valid_x, train_y, valid_y = train_test_split(all_x_clean, all_y, test_size=percent_valid_test, random_state=42)
#valid_x, test_x, valid_y, test_y = train_test_split(valid_x, valid_y, test_size=half, random_state=42)

print(type(train_x))
print(type(valid_y))

# use .iloc[] to index
#print(train_x.iloc[0])
#print(valid_y.iloc[0])


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [340]:
################################################
# DECISION TREE
################################################

max_depth = 10
folds = 10

cross_fold = True

for depth in range(1,max_depth+1):
    print('Results with depth of '+str(depth)+':')
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(train_x, train_y)

    y_hats = clf.predict(valid_x)
    errs = []
    for i in range(len(y_hats)):
        errs.append((valid_y.iloc[i]['ARR_DELAY'] - y_hats[i])**2)
    err = np.mean(errs)
    print('Mean-squared error WITHOUT cross-fold validation: '+ str(err))

    scores = cross_val_score(clf, valid_x, valid_y, scoring='mean_squared_error', cv=folds)
    print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))


Results with depth of 1:
Mean-squared error WITHOUT cross-fold validation: 236.6361631753032
Mean-squared error WITH cross-fold validation: 238.18388425580898
Results with depth of 2:
Mean-squared error WITHOUT cross-fold validation: 268.44487320837925
Mean-squared error WITH cross-fold validation: 244.10974366101055
Results with depth of 3:
Mean-squared error WITHOUT cross-fold validation: 254.45093715545755
Mean-squared error WITH cross-fold validation: 254.32534075618855
Results with depth of 4:
Mean-squared error WITHOUT cross-fold validation: 249.20231532524807
Mean-squared error WITH cross-fold validation: 259.6478659791404
Results with depth of 5:
Mean-squared error WITHOUT cross-fold validation: 275.48235942668134
Mean-squared error WITH cross-fold validation: 265.10980880560095
Results with depth of 6:
Mean-squared error WITHOUT cross-fold validation: 266.03583241455345
Mean-squared error WITH cross-fold validation: 282.3355726295514
Results with depth of 7:
Mean-squared error

In [341]:
################################################
# LINEAR REGRESSION
################################################

clf = LinearRegression()

clf.fit(train_x, train_y)
y_hats = clf.predict(valid_x)
errs = []
for i in range(len(y_hats)):
    errs.append((valid_y.iloc[i]['ARR_DELAY'] - y_hats[i])**2)
err = np.mean(errs)
print('Mean-squared error WITHOUT cross-fold validation: '+ str(err))

# cross-fold validation
folds = 10
scores = cross_val_score(clf, train_x, train_y, scoring='mean_squared_error', cv=folds)
print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

Mean-squared error WITHOUT cross-fold validation: 201.96184891295042
Mean-squared error WITH cross-fold validation: 218.18961199467913


In [342]:
################################################
# LOGISTIC REGRESSION
################################################

clf = LogisticRegression()

clf.fit(train_x, train_y)
y_hats = clf.predict(valid_x)
errs = []
for i in range(len(y_hats)):
    errs.append((valid_y.iloc[i]['ARR_DELAY'] - y_hats[i])**2)
err = np.mean(errs)
print('Mean-squared error WITHOUT cross-fold validation: '+ str(err))

# cross-fold validation
folds = 10
scores = cross_val_score(clf, train_x, train_y, scoring='mean_squared_error', cv=folds)
print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

Mean-squared error WITHOUT cross-fold validation: 263.90628445424477
Mean-squared error WITH cross-fold validation: 277.4820022400422


In [343]:
################################################
# MLP - NN
################################################

hidden_nodes = 500
learning_rate = 0.00001
print_epoch_info = False
clf = MLPRegressor(hidden_layer_sizes=(hidden_nodes,), solver='sgd', alpha=learning_rate,
                    activation='logistic', verbose=print_epoch_info)

clf.fit(train_x, train_y)
y_hats = clf.predict(valid_x)
errs = []
for i in range(len(y_hats)):
    errs.append((valid_y.iloc[i]['ARR_DELAY'] - y_hats[i])**2)
err = np.mean(errs)
print('Mean-squared error WITHOUT cross-fold validation: '+ str(err))

# cross-fold validation
folds = 10
scores = cross_val_score(clf, train_x, train_y, scoring='mean_squared_error', cv=folds)
print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))


Mean-squared error WITHOUT cross-fold validation: 244.75447286091108
Mean-squared error WITH cross-fold validation: 260.02702517789135
