In [2]:
# Python library imports: numpy, random, sklearn, pandas, etc

import warnings
warnings.filterwarnings('ignore')

import sys
import random
import numpy as np

# cross_validation is deprecated since version 0.18. This module will be removed in 0.20.
# Use sklearn.model_selection.train_test_split instead.
# from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [58]:
# Config size
TRAINING_MATRIX_RESULT_FILE_NAME = 'training_matrix_result.pickle'

### Load training matrix result

In [59]:
# Load trained model from file
result_location_path = f"./training_results/{TRAINING_MATRIX_RESULT_FILE_NAME}"
clf_rf = pickle.load(open(result_location_path, "rb"))


### Import data and make performance test

In [60]:
# read files
cols = ['dep_delay', 'month', 'day_of_month', 'day_of_week', 'dest',  'distance']
col_types = {'dep_delay': int, 'month': int, 'day_of_month': int, 'day_of_week': int, 'distance': int, 
            'dest': str}

df_1996 = pd.read_csv('result/1996.csv')
flight_df = df_1996
flight_df


Unnamed: 0,year,month,day_of_month,day_of_week,dep_time,crs_dep_time,arr_time,crs_arr_time,unique_carrier,flight_num,...,air_time,arr_delay,dep_delay,origin,dest,distance,taxi_in,taxi_out,cancelled,diverted
0,1996,1,29,1,2039,1930,2245,2139,DL,345,...,230.0,66.0,69.0,ATL,PHX,1587,6,10,0,0
1,1996,1,30,2,1931,1930,2142,2139,DL,345,...,224.0,3.0,1.0,ATL,PHX,1587,5,22,0,0
2,1996,1,31,3,1956,1930,2231,2139,DL,345,...,241.0,52.0,26.0,ATL,PHX,1587,7,27,0,0
3,1996,1,1,1,1730,1550,1909,1745,DL,411,...,201.0,84.0,100.0,ATL,PHX,1587,4,14,0,0
4,1996,1,2,2,1714,1550,1841,1745,DL,411,...,195.0,56.0,84.0,ATL,PHX,1587,4,8,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5209321,1996,12,6,5,1512,0,1624,0,AA,1885,...,106.0,-12.0,-2.0,ATL,DFW,732,12,14,0,0
5209322,1996,12,7,6,1514,0,1633,0,AA,1885,...,115.0,-3.0,0.0,ATL,DFW,732,11,13,0,0
5209323,1996,12,8,7,1510,0,1650,0,AA,1885,...,129.0,14.0,-4.0,ATL,DFW,732,10,21,0,0
5209324,1996,12,9,1,1513,0,1616,0,AA,1885,...,105.0,-20.0,-1.0,ATL,DFW,732,9,9,0,0


In [61]:
# Select columns to reduce the training matrix side and define dimension for prediction feature
cols = ['unique_carrier', 'month', 'flight_num', 'day_of_month', 'day_of_week', 'dest', 'origin', 'dep_time', 'distance', 'dep_delay']
flight_df_selc = flight_df[cols]


In [62]:
flight_df_selc['dep_delay'] = (flight_df_selc['dep_delay'] > 10) *1
categ_cols = ['unique_carrier', 'flight_num', 'dest', 'origin']

for item  in categ_cols:
    flight_df_selc[item] = flight_df_selc[item].astype('category').cat.codes + 1


In [63]:
# Divide test and train matrix
train, test, y_train, y_test = train_test_split(flight_df_selc.drop(['dep_delay'], axis=1), flight_df_selc["dep_delay"],
                                                random_state=10, test_size=0.99)

In [64]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(train["unique_carrier"].values.reshape(-1, 1))  

# enc.feature_indices_
airline_onehot = enc.transform(train["unique_carrier"].values.reshape(-1, 1)).toarray()
airline_onehot_test = enc.transform(test["unique_carrier"].values.reshape(-1, 1)).toarray()

In [65]:
air_oh_df = pd.DataFrame(airline_onehot, columns = ["A1","A2","A3","A4","A5","A6","A7",
                                                    "A8","A9","A10"])
air_oh_test_df = pd.DataFrame(airline_onehot_test, columns = ["A1","A2","A3","A4","A5","A6","A7",
                                                         "A8","A9","A10"])

In [66]:
train2 = pd.concat([train.reset_index(),air_oh_df.reset_index()],axis=1).drop(["index","month","day_of_month","day_of_week","unique_carrier","flight_num","dest","origin"],axis=1)
test2  = pd.concat([test.reset_index(),air_oh_test_df.reset_index()],axis=1).drop(["index","month","day_of_month","day_of_week","unique_carrier","flight_num","dest","origin"],axis=1)

test2

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10
0,1600,287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1257,1431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,758,1037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1747,1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,600,228,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5157228,1706,1431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5157229,1600,1372,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5157230,1212,2475,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5157231,1000,233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [67]:
# showing the performance of a classification model for binary
def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))
# Parameter Tuning
param_dist = {"max_depth": [10,30,50],
              "min_child_weight" : [1,3,6],
             "n_estimators": [200],
              "learning_rate": [0.05,0.1,0.16]}

### Evaluate on test set

In [68]:
# Evaluate on test set
pr = clf_rf.predict(test2)

# print results
cm = confusion_matrix(y_test, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_svm = precision_recall_fscore_support(list(y_test), list(pr), average='micro')
print("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_svm[0], report_svm[1], report_svm[2], accuracy_score(list(y_test), list(pr))))

Confusion matrix
         0       1
0  3313549  674616
1   472627  696441

precision = 0.78, recall = 0.78, F1 = 0.78, accuracy = 0.78



In [69]:
y_test2 = pd.DataFrame({"y":y_test.reset_index(drop=True)})
predict2 = pd.DataFrame({"yhat":pr})
x_test2 = pd.DataFrame(test2)
result = x_test2.join(y_test2)
predicted_result = result.join(predict2)
predicted_result

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,y,yhat
0,1600,287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
1,1257,1431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
2,758,1037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
3,1747,1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
4,600,228,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5157228,1706,1431,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
5157229,1600,1372,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
5157230,1212,2475,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
5157231,1000,233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0


In [70]:
predicted_result[predicted_result['y'] != predicted_result['yhat']]

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,y,yhat
3,1747,1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
7,1235,446,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
8,1430,867,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
10,746,326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
19,1205,950,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5157215,1757,948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
5157220,1544,445,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
5157224,1418,205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
5157227,2014,602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
