In [1]:
# Python library imports: numpy, random, sklearn, pandas, etc

import warnings
warnings.filterwarnings('ignore')

import sys
import random
import numpy as np

# cross_validation is deprecated since version 0.18. This module will be removed in 0.20.
# Use sklearn.model_selection.train_test_split instead.
# from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
# Config size
TRAINING_MATRIX_RESULT_FILE_NAME = 'training_matrix_result.pickle'

### Load training matrix result

In [3]:
# Load trained model from file
result_location_path = f"./training_results/{TRAINING_MATRIX_RESULT_FILE_NAME}"
clf_rf = pickle.load(open(result_location_path, "rb"))


### Import data and make performance test

In [4]:
# read files
cols = ['dep_delay', 'month', 'day_of_month', 'day_of_week', 'dest',  'distance']
col_types = {'dep_delay': int, 'month': int, 'day_of_month': int, 'day_of_week': int, 'distance': int, 
            'dest': str}

df_1990 = pd.read_csv('result/1990.csv')
df_1991 = pd.read_csv('result/1991.csv')
df_1992 = pd.read_csv('result/1992.csv')
df_1993 = pd.read_csv('result/1993.csv')
df_1994 = pd.read_csv('result/1994.csv')
df_1995 = pd.read_csv('result/1995.csv')
df_1996 = pd.read_csv('result/1996.csv')

# df_1996 = pd.read_csv('result/1996.csv')
# df_1996 = df_1996[cols]

df_years = [df_1990, df_1991, df_1992, df_1993, df_1994, df_1995, df_1996]

flight_df = pd.concat(df_years)
flight_df


Unnamed: 0,year,month,day_of_month,day_of_week,dep_time,crs_dep_time,arr_time,crs_arr_time,unique_carrier,flight_num,...,dep_delay,origin,dest,distance,cancelled,diverted,tail_num,air_time,taxi_in,taxi_out
0,1990,1,3,3,1707,1630,1755,1723,US,29,...,37.0,CMH,IND,182.0,0,0,,,,
1,1990,1,4,4,1706,1630,1807,1723,US,29,...,36.0,CMH,IND,182.0,0,0,,,,
2,1990,1,5,5,1629,1630,1715,1723,US,29,...,-1.0,CMH,IND,182.0,0,0,,,,
3,1990,1,6,6,1633,1630,1718,1723,US,29,...,3.0,CMH,IND,182.0,0,0,,,,
4,1990,1,8,1,1630,1630,1726,1723,US,29,...,0.0,CMH,IND,182.0,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5209321,1996,12,6,5,1512,0,1624,0,AA,1885,...,-2.0,ATL,DFW,732.0,0,0,N867AA,106.0,12.0,14.0
5209322,1996,12,7,6,1514,0,1633,0,AA,1885,...,0.0,ATL,DFW,732.0,0,0,N861AA,115.0,11.0,13.0
5209323,1996,12,8,7,1510,0,1650,0,AA,1885,...,-4.0,ATL,DFW,732.0,0,0,N846AA,129.0,10.0,21.0
5209324,1996,12,9,1,1513,0,1616,0,AA,1885,...,-1.0,ATL,DFW,732.0,0,0,N709AA,105.0,9.0,9.0


In [5]:
# Select columns to reduce the training matrix side and define dimension for prediction feature
cols = ['unique_carrier', 'month', 'flight_num', 'day_of_month', 'day_of_week', 'dest', 'origin', 'dep_time', 'distance', 'dep_delay']
flight_df_selc = flight_df[cols]
# flight_df_selc['unique_carrier'].unique()
flight_df_selc["unique_carrier"].values.reshape(-1, 1)


array([['US'],
       ['US'],
       ['US'],
       ...,
       ['AA'],
       ['AA'],
       ['AA']], dtype=object)

In [6]:
flight_df_selc['dep_delay'] = (flight_df_selc['dep_delay'] > 10) *1
categ_cols = ['unique_carrier', 'flight_num', 'dest', 'origin']

for item  in categ_cols:
    flight_df_selc[item] = flight_df_selc[item].astype('category').cat.codes + 1


In [7]:
# Divide test and train matrix
train, test, y_train, y_test = train_test_split(flight_df_selc.drop(['dep_delay'], axis=1), flight_df_selc["dep_delay"],
                                                random_state=10, test_size=0.2)

In [13]:
test["unique_carrier"].unique()

array([13,  3,  8, 12,  1, 11,  4, 10,  5,  2,  6,  9,  7], dtype=int8)

In [16]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(train["unique_carrier"].values.reshape(-1, 1))  

# enc.feature_indices_
airline_onehot = enc.transform(train["unique_carrier"].values.reshape(-1, 1)).toarray()
airline_onehot_test = enc.transform(test["unique_carrier"].values.reshape(-1, 1)).toarray()

In [17]:
air_oh_df = pd.DataFrame(airline_onehot, columns = ["A1","A2","A3","A4","A5","A6","A7",
                                                    "A8","A9","A10", "A11", "A12", "A13"])
air_oh_test_df = pd.DataFrame(airline_onehot_test, columns = ["A1","A2","A3","A4","A5","A6","A7",
                                                         "A8","A9","A10", "A11", "A12", "A13"])

In [18]:
train2 = pd.concat([train.reset_index(),air_oh_df.reset_index()],axis=1).drop(["index","month","day_of_month","day_of_week","unique_carrier","flight_num","dest","origin"],axis=1)
test2  = pd.concat([test.reset_index(),air_oh_test_df.reset_index()],axis=1).drop(["index","month","day_of_month","day_of_week","unique_carrier","flight_num","dest","origin"],axis=1)

test2

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13
0,740,718.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1635,1754.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2002,305.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,725,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1401,956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7125325,1734,733.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7125326,1837,650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7125327,1415,1754.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7125328,1753,2106.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# showing the performance of a classification model for binary
def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))
# Parameter Tuning
param_dist = {"max_depth": [10,30,50],
              "min_child_weight" : [1,3,6],
             "n_estimators": [200],
              "learning_rate": [0.05,0.1,0.16]}

### Evaluate on test set

In [20]:
# Evaluate on test set
pr = clf_rf.predict(test2)

# print results
cm = confusion_matrix(y_test, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_svm = precision_recall_fscore_support(list(y_test), list(pr), average='micro')
print("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_svm[0], report_svm[1], report_svm[2], accuracy_score(list(y_test), list(pr))))

Confusion matrix
         0       1
0  5724598  167307
1   576506  656919

precision = 0.90, recall = 0.90, F1 = 0.90, accuracy = 0.90



In [21]:
y_test2 = pd.DataFrame({"y":y_test.reset_index(drop=True)})
predict2 = pd.DataFrame({"yhat":pr})
x_test2 = pd.DataFrame(test2)
result = x_test2.join(y_test2)
predicted_result = result.join(predict2)
predicted_result

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,y,yhat
0,740,718.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
1,1635,1754.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2002,305.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,725,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
4,1401,956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7125325,1734,733.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
7125326,1837,650.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
7125327,1415,1754.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
7125328,1753,2106.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0


In [22]:
predicted_result[predicted_result['y'] != predicted_result['yhat']]

Unnamed: 0,dep_time,distance,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,y,yhat
13,1420,1011.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0
22,2135,239.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0
77,1818,440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0
79,1751,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0
90,2044,576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7125295,1217,1589.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
7125308,1726,1619.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
7125314,1551,1208.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
7125317,1717,732.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
