In [146]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import math
import copy
import datetime
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.linear_model import LogisticRegression, LinearRegression,BayesianRidge
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn import tree
from sklearn import metrics
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
import warnings
warnings.filterwarnings('ignore')

In [120]:
validation_df = pd.read_csv("flights_train.csv")
validation_y = validation_df['ARR_DELAY']

In [121]:
day_of_year = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_yday for date_str in validation_df['FL_DATE'] ]
month = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timetuple().tm_mon for date_str in validation_df['FL_DATE'] ]
validation_x = validation_df.drop(['UID','FL_NUM','AIRLINE_ID','ORIGIN_CITY_NAME','DISTANCE', 'FL_DATE', 'DEST_CITY_NAME','ORIGIN_STATE_ABR','DEST_STATE_ABR', 'FIRST_DEP_TIME'], 
                 axis=1)  
validation_x['month'] = month
validation_x['day_of_year'] = day_of_year

validation_x = validation_x.drop(['UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID'], axis=1)

In [122]:
carrier_ohe = pd.get_dummies(validation_df['UNIQUE_CARRIER'])
carrier_ohe = carrier_ohe.astype('int64')
origin_ohe = pd.get_dummies(validation_df['ORIGIN'])
origin_ohe = origin_ohe.astype('int64')
dest_ohe = pd.get_dummies(validation_df['DEST'])
dest_ohe.rename(columns=lambda x: x+'_DEST', inplace=True)
dest_ohe = dest_ohe.astype('int64')
nn_validation_x = pd.concat([validation_x, carrier_ohe, origin_ohe, dest_ohe], axis=1) 

In [123]:
nn_validation_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4911 entries, 0 to 4910
Columns: 488 entries, DAY_OF_WEEK to YUM_DEST
dtypes: int64(488)
memory usage: 18.3 MB


In [124]:
validation_x[0:2]

Unnamed: 0,DAY_OF_WEEK,CRS_DEP_TIME,TAXI_OUT,TAXI_IN,ACTUAL_ELAPSED_TIME,DISTANCE_GROUP,ARR_DELAY,month,day_of_year
0,2,1020,11,8,373,11,-11,1,3
1,6,1220,13,9,183,5,1,1,28


In [150]:
y_mean = validation_df['ARR_DELAY'].mean()
y_std = validation_df['ARR_DELAY'].std()
print(y_mean)
print(y_std)
print(validation_df['ARR_DELAY'].median())
y_max = y_mean + (2 * y_std)
y_min = y_mean - (2 * y_std)

training_x = validation_x[validation_x['ARR_DELAY'] < y_max]
nn_training_x = nn_validation_x[validation_x['ARR_DELAY'] < y_max]
training_y = training_x['ARR_DELAY']

4.316228873956424
45.3866572365006
-6.0


KeyError: 'ARR_DELAY'

In [126]:

training_x = training_x.drop(['ARR_DELAY'], axis=1)
validation_x = validation_x.drop(['ARR_DELAY'], axis=1)
nn_training_x = nn_training_x.drop(['ARR_DELAY'], axis=1)

In [148]:
svr = SVR(C=1.0, epsilon=0.2)
scores = cross_val_score(svr, training_x, training_y, scoring='mean_squared_error', cv=folds)
print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

Mean-squared error WITH cross-fold validation: 484.457334016


In [111]:
#decision tree
max_depth = 10
folds = 10

for depth in range(1,max_depth+1):
    print('Results with depth of '+str(depth)+':')
    clf = tree.DecisionTreeClassifier(max_depth=depth)

    scores = cross_val_score(clf, training_x, training_y, scoring='mean_squared_error', cv=folds)
    print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))
    

Results with depth of 1:
Mean-squared error WITH cross-fold validation: 475.951209417
Results with depth of 2:
Mean-squared error WITH cross-fold validation: 495.603300788
Results with depth of 3:
Mean-squared error WITH cross-fold validation: 509.383257354
Results with depth of 4:
Mean-squared error WITH cross-fold validation: 514.078514427
Results with depth of 5:
Mean-squared error WITH cross-fold validation: 513.265278998
Results with depth of 6:
Mean-squared error WITH cross-fold validation: 529.223897087
Results with depth of 7:
Mean-squared error WITH cross-fold validation: 521.52536527
Results with depth of 8:
Mean-squared error WITH cross-fold validation: 538.543417148
Results with depth of 9:
Mean-squared error WITH cross-fold validation: 560.415887159
Results with depth of 10:
Mean-squared error WITH cross-fold validation: 576.084975374


In [113]:
for feature_count in range(1, len(training_x) ):
    kbest_x = SelectKBest(mutual_info_regression, k=feature_count).fit_transform(training_x, training_y)
    lin_clf = LinearRegression()
    folds = 10
    scores = cross_val_score(lin_clf, kbest_x, training_y, scoring='mean_squared_error', cv=folds)
    print('For ' + str(feature_count) + ' features, Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

For 1 features, Mean-squared error WITH cross-fold validation: 422.376579699
For 2 features, Mean-squared error WITH cross-fold validation: 410.077363402
For 3 features, Mean-squared error WITH cross-fold validation: 401.255155097
For 4 features, Mean-squared error WITH cross-fold validation: 397.862287515
For 5 features, Mean-squared error WITH cross-fold validation: 393.362148747
For 6 features, Mean-squared error WITH cross-fold validation: 399.390995935
For 7 features, Mean-squared error WITH cross-fold validation: 393.325705455
For 8 features, Mean-squared error WITH cross-fold validation: 393.620514214


ValueError: k should be >=0, <= n_features; got 9.Use k='all' to return all features.

In [145]:
bayes_clf = BayesianRidge(compute_score=True)
scores = cross_val_score(bayes_clf, training_x, training_y, scoring='mean_squared_error', cv=folds)
print('For ' + str(feature_count) + ' features, Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

For 9 features, Mean-squared error WITH cross-fold validation: 393.62297921


In [141]:
hidden_nodes = 5
learning_rate = 0.0001
print_epoch_info = False
clf = MLPRegressor(hidden_layer_sizes=(hidden_nodes,), solver='adam', alpha=learning_rate,
                    activation='logistic', verbose=print_epoch_info)

#clf.fit(training_x, training_y)
#y_hats = clf.predict(validation_x)
#errs = []
#for i in range(len(y_hats)):
#    errs.append((validation_y.iloc[i]['ARR_DELAY'] - y_hats[i])**2)
#err = np.mean(errs)
#print('Mean-squared error WITHOUT cross-fold validation: '+ str(err))

# cross-fold validation
folds = 10
scores = cross_val_score(clf, nn_training_x, training_y, scoring='mean_squared_error', cv=folds)
print('Mean-squared error WITH cross-fold validation: '+ str(np.mean(-scores)))

Mean-squared error WITH cross-fold validation: 449.472508021
