In [16]:
from FlightMLP import FlightMLP
import torch
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from FlightDataset import FlightDataset

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


Get data:

In [17]:
data = np.load('../data/airline_final.npy')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

test_dataset = FlightDataset(torch.tensor(test_data))
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
test_data = None

X = test_dataset.data[:, :-1]
y = test_dataset.data[:,-1]

In [18]:

def parse_params(file_name):
    '''
    parse out layers, dropout, and learning rate
    '''
    
    # remove file extension
    idx = file_name[::-1].find('.')
    file_name = file_name[:-idx - 1]
    
    # remove prefix
    file_name = file_name[file_name.find('_')+1:]

    # get lr
    idx = file_name[::-1].find('_')
    lr = float(file_name[-idx:])
    file_name = file_name[:-idx]
    file_name = file_name.strip('_lr_')

    if 'drop' in file_name:
        dropout = True
        file_name = file_name.strip('_drop')
    else:
        dropout = False

    layers = file_name.split('_')
    layers = [int(l) for l in layers]

    return layers, lr, dropout

In [19]:
weights = os.listdir('weights/')
models = []
params = []

columns=[
    'hidden layers', 
    'hidden nodes', 
    'lr', 
    'dropout',
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc'
]


results = pd.DataFrame(columns=columns)
roc_list = []

for i, file in enumerate(weights):
    layers, lr, dropout = parse_params(file)
    params.append([layers, lr, dropout])
    model = FlightMLP(layer_sizes=layers, dropout=dropout)
    model.load_state_dict(torch.load('weights/' + file))
    models.append(model)
    
    model.eval()
    X = test_dataset.data[:,:-1].float()
    y = test_dataset.data[:,-1].float()
    y_pred_score = torch.sigmoid(model(X)).detach().numpy()
    y_pred = y_pred_score >= 0.5
    
    accuracy = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_pred)
    roc = roc_curve(y, y_pred_score)
    roc_list.append(roc)
    
    precision, recall, f, support = precision_recall_fscore_support(y, y_pred)
    precision = precision[1]
    recall = recall[1]
    f1 = f[1]
    
    data = {
        'hidden layers' : len(layers),
        'hidden nodes' : layers[0],
        'lr' : lr,
        'dropout' : dropout,
        'accuracy' : accuracy,
        'precision' : precision,
        'recall' : recall,
        'f1' : f1,
        'auc' : auc
    }
    
    results.loc[i] = data
    

In [20]:
results

Unnamed: 0,hidden layers,hidden nodes,lr,dropout,accuracy,precision,recall,f1,auc
0,3,20,1e-05,True,0.691247,0.430233,0.718542,0.538209,0.700336
1,4,300,1e-05,True,0.743805,0.490962,0.628625,0.55133,0.705452
2,2,100,0.0001,True,0.700673,0.439751,0.713089,0.544015,0.704807
3,3,50,1e-05,True,0.707834,0.446693,0.698856,0.545021,0.704844
4,3,400,1e-05,True,0.747702,0.496974,0.622506,0.552702,0.706014
5,3,20,0.0001,True,0.707834,0.445846,0.686619,0.540637,0.70077
6,2,50,1e-05,True,0.692346,0.432147,0.728119,0.542383,0.704258
7,3,400,1e-06,True,0.71303,0.452144,0.689944,0.546288,0.705343
8,4,300,1e-06,True,0.715594,0.454137,0.672386,0.54212,0.701207
9,3,200,0.0001,True,0.725653,0.465496,0.645118,0.540782,0.698837


In [21]:
results.to_csv('../results/metrics.csv', index=False)
results_dropout = results[ results['dropout'] == True]

Best AUC:

In [22]:
cols=[
    'hidden layers', 
    'hidden nodes', 
    'lr', 
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc'
]
results_filtered = results[ (results['dropout'] == True) & (results['hidden layers'] >= 2) & (results['lr'] <= 0.0001)].sort_values(['hidden layers', 'hidden nodes', 'lr'])
print(results_filtered.to_latex(index=False, columns=cols))

\begin{tabular}{rrrrrrrr}
\toprule
 hidden layers &  hidden nodes &       lr &  accuracy &  precision &   recall &       f1 &      auc \\
\midrule
             2 &            20 & 0.000010 &  0.700806 &   0.439378 & 0.706172 & 0.541707 & 0.702593 \\
             2 &            20 & 0.000100 &  0.682088 &   0.422687 & 0.737031 & 0.537257 & 0.700383 \\
             2 &            50 & 0.000010 &  0.692346 &   0.432147 & 0.728119 & 0.542383 & 0.704258 \\
             2 &            50 & 0.000100 &  0.691480 &   0.431391 & 0.729715 & 0.542229 & 0.704212 \\
             2 &           100 & 0.000010 &  0.700340 &   0.439271 & 0.711492 & 0.543184 & 0.704053 \\
             2 &           100 & 0.000100 &  0.700673 &   0.439751 & 0.713089 & 0.544015 & 0.704807 \\
             3 &            20 & 0.000010 &  0.691247 &   0.430233 & 0.718542 & 0.538209 & 0.700336 \\
             3 &            20 & 0.000100 &  0.707834 &   0.445846 & 0.686619 & 0.540637 & 0.700770 \\
             3 &            5

  print(results_filtered.to_latex(index=False, columns=cols))


In [23]:
print('best AUC:')
results_filtered.sort_values('auc').tail(1)

best AUC:


Unnamed: 0,hidden layers,hidden nodes,lr,dropout,accuracy,precision,recall,f1,auc
14,3,100,0.0001,True,0.712563,0.452373,0.702447,0.550333,0.709195


Best F1:

In [24]:
results_filtered.sort_values('f1').tail(1)

Unnamed: 0,hidden layers,hidden nodes,lr,dropout,accuracy,precision,recall,f1,auc
4,3,400,1e-05,True,0.747702,0.496974,0.622506,0.552702,0.706014


Best precision:

In [25]:
results_filtered.sort_values('precision').tail(1)

Unnamed: 0,hidden layers,hidden nodes,lr,dropout,accuracy,precision,recall,f1,auc
4,3,400,1e-05,True,0.747702,0.496974,0.622506,0.552702,0.706014


Best recall:

In [26]:
results_filtered.sort_values('recall').tail(1)

Unnamed: 0,hidden layers,hidden nodes,lr,dropout,accuracy,precision,recall,f1,auc
10,2,20,0.0001,True,0.682088,0.422687,0.737031,0.537257,0.700383


In [27]:
models

[FlightMLP(
   (mlp): Sequential(
     (0): Linear(in_features=31, out_features=20, bias=True)
     (1): ReLU()
     (2): Linear(in_features=20, out_features=20, bias=True)
     (3): ReLU()
     (4): Dropout(p=0.1, inplace=False)
     (5): Linear(in_features=20, out_features=20, bias=True)
     (6): ReLU()
     (7): Dropout(p=0.1, inplace=False)
     (8): Linear(in_features=20, out_features=1, bias=True)
   )
 ),
 FlightMLP(
   (mlp): Sequential(
     (0): Linear(in_features=31, out_features=300, bias=True)
     (1): ReLU()
     (2): Linear(in_features=300, out_features=300, bias=True)
     (3): ReLU()
     (4): Dropout(p=0.1, inplace=False)
     (5): Linear(in_features=300, out_features=300, bias=True)
     (6): ReLU()
     (7): Dropout(p=0.1, inplace=False)
     (8): Linear(in_features=300, out_features=300, bias=True)
     (9): ReLU()
     (10): Dropout(p=0.1, inplace=False)
     (11): Linear(in_features=300, out_features=1, bias=True)
   )
 ),
 FlightMLP(
   (mlp): Sequential(
    

# Baseline models

In [29]:
# get data
#######################
data = np.load('../data/airline_final.npy')
X = data[:,:-1]
y = data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
data = None # free up memory

xgb = XGBClassifier(
    n_estimators=500,
    object='binary:logistic',
    random_state=42,
    n_jobs=-1,
)

rf = RandomForestClassifier(
    n_estimators=400,
    # class_weight='balanced',
    max_depth=10,
    min_samples_leaf=100,
    random_state=42,
    n_jobs=12,
)

lr = LogisticRegression(
    random_state=42, 
    n_jobs=-1, 
    # class_weight={0 : 1., 1 : 3.}, 
)

models = {
    'XGBoost' : xgb,
    'Logistic Regression' : lr,
    'Random Forest' : rf
}

columns=[
    'hidden layers', 
    'hidden nodes', 
    'lr', 
    'dropout',
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc'
]


# 1 for 0 class, 3 for 1 class
sample_weight = np.where(y_train == 0, 1, 3)

columns=[
    'hidden layers', 
    'hidden nodes', 
    'lr', 
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc'
]


results = pd.DataFrame(columns=columns)

for i, model in enumerate(models.keys()):
    
    models[model].fit(X_train, y_train, sample_weight=sample_weight)
    
    y_pred = models[model].predict(X_test)
    
    precision, recall, f1, support  = precision_recall_fscore_support(y_test, y_pred)
    precision = precision[1]
    recall = recall[1]
    f1 = f1[1]
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    d = {
        'accuracy' : accuracy,
        'precision' : precision,
        'recall' : recall,
        'f1' : f1,
        'accuracy' : accuracy, 
        'auc' : auc,
    }
    
    results.loc[i] = d

print(results.to_latex(index=False))

FileNotFoundError: [Errno 2] No such file or directory: 'data/airline_final.npy'

In [None]:
roc = roc_curve(y, y_pred)
plt.plot(roc_list[5][0], roc_list[5][1])
plt.xlabel('FPR')
plt.ylabel('TPR')