In [2]:
# Python library imports: numpy, random, sklearn, pandas, etc

import warnings
warnings.filterwarnings('ignore')

import sys
import random
import numpy as np

# cross_validation is deprecated since version 0.18. This module will be removed in 0.20.
# Use sklearn.model_selection.train_test_split instead.
# from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [47]:
# Configuration: file name of the pickled training result that is loaded
# from ./training_results/ further down in this notebook.
TRAINING_MATRIX_RESULT_FILE_NAME = 'training_matrix_result.pickle'

### Load training matrix result

In [48]:
# Load the trained model from file.
result_location_path = f"./training_results/{TRAINING_MATRIX_RESULT_FILE_NAME}"
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles produced by a trusted training run.
# The context manager guarantees the file handle is closed even if unpickling fails.
with open(result_location_path, "rb") as model_file:
    clf_rf = pickle.load(model_file)


### Import data and make performance test

In [49]:
# Load the per-year flight CSV files.
cols = ['dep_delay', 'month', 'day_of_month', 'day_of_week', 'dest',  'distance']
# Intended dtype per column: the numeric fields are int, the destination code is str.
# NOTE(review): neither `cols` nor `col_types` is passed to read_csv in this cell.
col_types = {
    **dict.fromkeys(['dep_delay', 'month', 'day_of_month', 'day_of_week', 'distance'], int),
    'dest': str,
}

# Read the three yearly files in chronological order.
df_1994, df_1995, df_1996 = map(
    pd.read_csv,
    ('result/1994.csv', 'result/1995.csv', 'result/1996.csv'),
)

df_years = [df_1995, df_1996]

# Only the 1994 frame is evaluated here; df_years is kept so the run can be
# switched to pd.concat(df_years) instead.
flight_df = df_1994
flight_df


Unnamed: 0,year,month,day_of_month,day_of_week,dep_time,crs_dep_time,arr_time,crs_arr_time,unique_carrier,flight_num,actual_elapsed_time,crs_elapsed_time,arr_delay,dep_delay,origin,dest,distance,cancelled,diverted
0,1994,1,7,5,858,900,954,1003,US,227,56.0,63,-9.0,-2.0,CLT,ORF,290.0,0,0
1,1994,1,8,6,859,900,952,1003,US,227,53.0,63,-11.0,-1.0,CLT,ORF,290.0,0,0
2,1994,1,10,1,935,900,1023,1003,US,227,48.0,63,20.0,35.0,CLT,ORF,290.0,0,0
3,1994,1,11,2,903,900,1131,1003,US,227,148.0,63,88.0,3.0,CLT,ORF,290.0,0,0
4,1994,1,12,3,933,900,1024,1003,US,227,51.0,63,21.0,33.0,CLT,ORF,290.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5078406,1994,12,25,7,1716,1715,1923,1945,DL,149,127.0,150,-22.0,1.0,JFK,ATL,760.0,0,0
5078407,1994,12,26,1,1716,1715,1910,1945,DL,149,114.0,150,-35.0,1.0,JFK,ATL,760.0,0,0
5078408,1994,12,27,2,1721,1715,1930,1945,DL,149,129.0,150,-15.0,6.0,JFK,ATL,760.0,0,0
5078409,1994,12,28,3,1715,1715,1934,1945,DL,149,139.0,150,-11.0,0.0,JFK,ATL,760.0,0,0


In [50]:
# Keep only the columns needed downstream: the candidate feature columns plus
# dep_delay, which the binary label is derived from.
cols = ['unique_carrier', 'month', 'flight_num', 'day_of_month', 'day_of_week', 'dest', 'origin', 'dep_time', 'distance', 'dep_delay']
# Take an explicit copy so that later column assignments on flight_df_selc act on
# an independent frame instead of relying on pandas' view-vs-copy behaviour
# (the resulting SettingWithCopyWarning would be hidden by the global warnings filter).
flight_df_selc = flight_df[cols].copy()


In [51]:
# Rebuild the working frame from the raw data so this cell is idempotent:
# thresholding an already-binarised dep_delay on a second run would turn every
# label into 0.
flight_df_selc = flight_df[cols].copy()

# Binary label: 1 when dep_delay is greater than 10, else 0.
# Missing delays compare as False and therefore become 0.
flight_df_selc['dep_delay'] = (flight_df_selc['dep_delay'] > 10) * 1
categ_cols = ['unique_carrier', 'flight_num', 'dest', 'origin']

# Replace each categorical column by its 1-based category code
# (missing values get code -1 from pandas and so end up as 0).
# NOTE(review): the codes are derived from this data set alone — confirm they
# match the encoding used when the model was trained.
for item in categ_cols:
    flight_df_selc[item] = flight_df_selc[item].astype('category').cat.codes + 1


In [52]:
# Split features and label into train/test parts.
# test_size=0.99 leaves 99% of the rows in the test part; random_state fixes the shuffle.
train, test, y_train, y_test = train_test_split(
    flight_df_selc.drop(columns=['dep_delay']),
    flight_df_selc['dep_delay'],
    test_size=0.99,
    random_state=10,
)

In [53]:
from sklearn.preprocessing import OneHotEncoder

# Fit the one-hot encoder on the carrier codes of the train part only.
enc = OneHotEncoder()
enc.fit(train["unique_carrier"].to_numpy().reshape(-1, 1))

# Encode both parts with the fitted encoder and densify the sparse result.
# The train part is transformed first, then the test part.
airline_onehot, airline_onehot_test = (
    enc.transform(part["unique_carrier"].to_numpy().reshape(-1, 1)).toarray()
    for part in (train, test)
)

In [54]:
# Column labels A1..A10 for the ten one-hot carrier indicator columns.
airline_cols = [f"A{i}" for i in range(1, 11)]

# Wrap the dense one-hot arrays in DataFrames so they can be concatenated
# with the remaining feature columns.
air_oh_df = pd.DataFrame(airline_onehot, columns=airline_cols)
air_oh_test_df = pd.DataFrame(airline_onehot_test, columns=airline_cols)

In [55]:
# Columns that are not fed to the model (the carrier is represented by A1..A10 instead).
dropped_cols = ["month", "day_of_month", "day_of_week", "unique_carrier", "flight_num", "dest", "origin"]

# Renumber the shuffled rows 0..n-1 so they line up positionally with the
# one-hot frames, append the one-hot columns, then remove the unused columns.
train2 = pd.concat([train.reset_index(drop=True), air_oh_df], axis=1).drop(columns=dropped_cols)
test2 = pd.concat([test.reset_index(drop=True), air_oh_test_df], axis=1).drop(columns=dropped_cols)

test2

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10
0,1345,256.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1607,328.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1410,611.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,955,1379.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1450,213.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5027622,1143,946.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5027623,1309,647.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5027624,659,92.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5027625,956,180.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# ROC-AUC helper for a binary classifier
def auc(m, train, test):
    """Return the ROC AUC of model ``m`` as a ``(train_auc, test_auc)`` tuple.

    ``m`` must provide ``predict_proba``; the second probability column is
    used as the score. The true labels are read from the notebook-level
    ``y_train`` and ``y_test`` variables, not from the arguments.
    """
    train_auc = metrics.roc_auc_score(y_train, m.predict_proba(train)[:, 1])
    test_auc = metrics.roc_auc_score(y_test, m.predict_proba(test)[:, 1])
    return (train_auc, test_auc)


# Hyper-parameter grid for tuning.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
param_dist = dict(
    max_depth=[10, 30, 50],
    min_child_weight=[1, 3, 6],
    n_estimators=[200],
    learning_rate=[0.05, 0.1, 0.16],
)

### Evaluate on test set

In [None]:
# Evaluate the loaded model on the test set
pr = clf_rf.predict(test2)

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))

# average='binary' reports precision/recall/F1 for the positive class (label 1).
# With average='micro' on single-label data all three values are identical to
# the accuracy, so the line below would print the same number four times.
# The label arrays are passed directly; converting millions of entries to
# Python lists is unnecessary.
report_svm = precision_recall_fscore_support(y_test, pr, average='binary')
print("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n"
      % (report_svm[0], report_svm[1], report_svm[2], accuracy_score(y_test, pr)))

In [14]:
# True labels, renumbered 0..n-1 so they align with the rows of test2.
y_test2 = y_test.reset_index(drop=True).to_frame(name="y")
# Model predictions as a single-column frame.
predict2 = pd.DataFrame(pr, columns=["yhat"])
x_test2 = pd.DataFrame(test2)

# Index-aligned joins: features + true label, then + predicted label.
result = x_test2.join(y_test2)
predicted_result = result.join(predict2)
predicted_result

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,y,yhat
0,1345,256.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
1,1607,328.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
2,1410,611.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
3,955,1379.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,1450,213.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031360,1857,224.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2031361,1432,268.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2031362,1200,223.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2031363,1013,978.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [15]:
# Rows where the prediction disagrees with the true label.
predicted_result.loc[predicted_result['y'].ne(predicted_result['yhat'])]

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,y,yhat
18,1855,569.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
34,1839,541.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0
69,2039,503.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1
105,2123,714.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
110,2020,369.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031289,717,717.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0
2031334,1849,1438.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2031337,2029,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0
2031348,940,1449.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
