In [39]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [40]:
def calc_prevalence(y):
    return (sum(y)/len(y))

In [41]:
df = pd.read_csv('cleaned_airline_data/clean_merge_apr.csv')


In [42]:
# shuffle the samples
df = df.sample(n = len(df), random_state = 42)
df = df.reset_index(drop = True)

df_test = df.sample(frac = 0.3, random_state = 42)
df_train = df.drop(df_test.index)

In [43]:
print('Test prevalence(n = %d):%.3f'%(len(df_test),calc_prevalence(df_test.label.values)))
print('Train prevalence(n = %d):%.3f'%(len(df_train), calc_prevalence(df_train.label.values)))

Test prevalence(n = 21874):0.355
Train prevalence(n = 51039):0.358


In [44]:
y_train = df_train['label'].values
y_test = df_test['label'].values
df_train = df_train.drop(['label'], axis=1)
df_test = df_test.drop(['label'], axis=1)

X_train = df_train.values
X_test = df_test.values

print('Training shapes:',X_train.shape, y_train.shape)
print('Testing shapes:',X_test.shape, y_test.shape)

Training shapes: (51039, 316) (51039,)
Testing shapes: (21874, 316) (21874,)


In [45]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

In [51]:
from sklearn.model_selection import GridSearchCV
# Create a based model
rf = RandomForestClassifier(max_depth=20)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

In [52]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=20)

## Do not run grid search unless you want to wait for a very long time

In [37]:
# Fit the grid search to the data

grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 3 folds for each of 1980 candidates, totalling 5940 fits


KeyboardInterrupt: 

In [53]:
y_train_preds = rf.predict_proba(X_train)[:,1]
y_test_preds = rf.predict_proba(X_test)[:,1]

In [54]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    return auc, accuracy, recall, precision, specificity

In [55]:
thresh = 0.357
print('Random Forest')
print('Training: ')
print_report(y_train, y_train_preds, thresh)
print('Testing: ')
print_report(y_test, y_test_preds, thresh)


Random Forest
Training: 
AUC:0.973
accuracy:0.899
recall:0.941
precision:0.809
specificity:0.876
prevalence:0.358
Testing: 
AUC:0.717
accuracy:0.662
recall:0.642
precision:0.518
specificity:0.672
prevalence:0.355


(0.7174899059093192,
 0.6615159550150864,
 0.6417064054646218,
 0.518483807143601,
 0.6724052426496635)