# Models: Decision Tree, Random Forest

In [1]:
# Import relevant packages
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve, plot_roc_curve, plot_precision_recall_curve
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from utils import process_data

In [2]:
# Load the batched data through the project helper; in 'normal' mode it returns a dict
# holding the train/test split and the scaled feature matrices used below
data = process_data(type_='normal')

Beginning data processing ...
Reading in batched data ...


100%|███████████████████████████████████████████| 19/19 [00:06<00:00,  2.97it/s]


Splitting data into train and test ...
Scaling data ...
Completed normal data processing.


In [3]:
# Unpack the scaled features and the labels for each split; the helper
# functions defined below read these notebook-level names directly
X_train, y_train = data['X_train_scaled'], data['y_train']
X_test, y_test = data['X_test_scaled'], data['y_test']

## Useful Functions

In [4]:
def plot_roc_auc(clf):
    """Draw the train and test ROC curves of a fitted classifier side by side.

    Scores are taken from column 1 of ``clf.predict_proba`` and compared with the
    notebook-level ``y_train`` / ``y_test`` labels. Each panel also shows the
    diagonal reference line and reports the area under its curve in the legend.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    panels = (
        ("Train ROC AUC", X_train, y_train),
        ("Test ROC AUC", X_test, y_test),
    )

    for ax, (title, features, labels) in zip(axes, panels):
        scores = clf.predict_proba(features)[:, 1]
        false_pos, true_pos, _ = roc_curve(labels, scores)
        area = auc(false_pos, true_pos)

        ax.plot([0, 1], [0, 1], '--')  # diagonal reference line
        ax.plot(false_pos, true_pos, label='area = {:.3f}'.format(area))
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        ax.set_title(title)
        ax.legend(loc='best')

    plt.show()

In [5]:
def plot_importances(clf):
    """Plot the ten largest impurity-based and permutation feature importances.

    The left panel shows ``clf.feature_importances_``; the right panel shows the
    mean permutation importance measured on the notebook-level ``X_test`` /
    ``y_test``. Bars are labelled with the positional index of each feature,
    because the importance arrays are wrapped in a ``pd.Series`` without names.

    Parameters
    ----------
    clf : fitted estimator exposing ``feature_importances_``.
    """
    fig, (ax_mdi, ax_perm) = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: the 10 features with the largest impurity-based importance
    top_mdi = pd.Series(clf.feature_importances_).nlargest(10)
    top_mdi.plot.bar(ax=ax_mdi)
    ax_mdi.set_title("Feature importances using MDI (mean decrease in impurity)")
    ax_mdi.set_ylabel("Mean decrease in impurity")

    # Right panel: the 10 features whose shuffling hurts the test score the most,
    # averaged over 10 seeded repeats
    perm_result = permutation_importance(
        clf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
    )
    top_perm = pd.Series(perm_result.importances_mean).nlargest(10)
    top_perm.plot.bar(ax=ax_perm)
    ax_perm.set_title("Feature importances using permutation importances")
    ax_perm.set_ylabel("Mean decrease in accuracy")

    fig.tight_layout()
    # Flush the figure now, as plot_roc_auc does. Otherwise, when called in a loop,
    # it is only rendered by a later plt.show() and lands under the next model's label.
    plt.show()

In [11]:
def grid_search(clf, criterion, param_grid, k=5):
    """Run an exhaustive grid search with stratified k-fold cross-validation.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and uses
    every available core (``n_jobs=-1``).

    Parameters
    ----------
    clf : unfitted estimator to tune.
    criterion : scorer (or scoring name) used to rank the candidates.
    param_grid : dict, or list of dicts, mapping parameter names to candidate values.
    k : number of stratified folds. Defaults to 5 so that callers which omit
        the argument still work.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        scoring=criterion,
                        verbose=2,
                        cv=StratifiedKFold(n_splits=k),
                        n_jobs=-1)

    return grid.fit(X_train, y_train)

In [12]:
def get_gridsearch_results(grid_search_result):
    """Tabulate every grid-search candidate alongside its mean test score.

    Parameters
    ----------
    grid_search_result : fitted search object exposing ``cv_results_``.

    Returns
    -------
    pd.DataFrame with one row per candidate, one column per searched parameter
    (named without the ``param_`` prefix) and a ``criterion_result`` column holding
    ``mean_test_score``.
    """
    cv_results = grid_search_result.cv_results_
    prefix = 'param_'

    # Take the columns from the 'param_*' keys rather than from the first candidate:
    # with a list-of-dicts grid, candidates need not all share the same parameters.
    res = {
        key[len(prefix):]: values
        for key, values in cv_results.items()
        if key.startswith(prefix)
    }

    res['criterion_result'] = cv_results['mean_test_score']
    return pd.DataFrame(res)

In [13]:
def compare_models(clfs, clf_names):
    """Compare final models on the train and test splits.

    For each classifier, accuracy, recall and precision are computed from
    ``predict`` and ROC AUC from column 1 of ``predict_proba``, against the
    notebook-level ``X_train`` / ``y_train`` and ``X_test`` / ``y_test``.

    Parameters
    ----------
    clfs : sequence of fitted classifiers.
    clf_names : labels for the classifiers, in the same order as ``clfs``.

    Returns
    -------
    pd.DataFrame with one column per model and one row per metric.
    """
    summary = {}
    for idx, model in enumerate(clfs):
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        summary[clf_names[idx]] = {
            'accuracy_train': accuracy_score(y_train, train_pred),
            'recall_train': recall_score(y_train, train_pred),
            'precision_train': precision_score(y_train, train_pred),
            'roc_auc_train': roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
            'accuracy_test': accuracy_score(y_test, test_pred),
            'recall_test': recall_score(y_test, test_pred),
            'precision_test': precision_score(y_test, test_pred),
            'roc_auc_test': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
        }

    return pd.DataFrame(summary)

## Decision Tree Model

In [None]:
# Rank candidates by ROC AUC computed from predicted probabilities
criterion = make_scorer(roc_auc_score, needs_proba=True)

# Seed the tree: ties between equally good splits are otherwise broken at random,
# so an unseeded search is not repeatable across notebook runs
clf_tree = DecisionTreeClassifier(random_state=42)

# 2 split criteria x 3 depths x 3 pruning strengths = 18 candidates
param_grid = dict(criterion=['gini', 'entropy'],
                  max_depth=[2, 5, 10],
                  ccp_alpha=[0, 0.00001, 0.0001])

grid_result_tree = grid_search(clf_tree, criterion, param_grid, k=4)
get_gridsearch_results(grid_result_tree)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


In [None]:
# Refit a tree on the full training set with the best hyperparameters from the search.
# best_params_ only holds the searched settings, so the seed is passed explicitly.
clf_tree = DecisionTreeClassifier(random_state=42, **grid_result_tree.best_params_).fit(X_train, y_train)

# Save the best decision tree model; the context manager closes the file handle
# even if pickling fails (the 'models' directory must already exist)
with open('models/model_tree.sav', 'wb') as model_file:
    pickle.dump(clf_tree, model_file)

In [None]:
# Read the saved decision tree model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('models/model_tree.sav', 'rb') as model_file:
    clf_tree = pickle.load(model_file)

## Random Forest Model

In [19]:
# Rank candidates by ROC AUC computed from predicted probabilities
criterion = make_scorer(roc_auc_score, needs_proba=True)

# Seed the forest so the bootstrap samples and feature subsets are repeatable across runs
clf_rf = RandomForestClassifier(random_state=42)

# 2 split criteria x 2 depths x 2 pruning strengths = 8 candidates
param_grid = dict(criterion=['gini', 'entropy'],
                  max_depth=[2, 5],
                  ccp_alpha=[0, 0.0001])

# Pass the fold count explicitly: grid_search requires k, and the recorded run used 5 folds
grid_result_rf = grid_search(clf_rf, criterion, param_grid, k=5)
get_gridsearch_results(grid_result_rf)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 

In [None]:
# Refit a forest on the full training set with the best hyperparameters from the search.
# best_params_ only holds the searched settings, so the seed is passed explicitly.
clf_rf = RandomForestClassifier(random_state=42, **grid_result_rf.best_params_).fit(X_train, y_train)

# Save the best random forest model; the context manager closes the file handle
# even if pickling fails (the 'models' directory must already exist)
with open('models/model_rf.sav', 'wb') as model_file:
    pickle.dump(clf_rf, model_file)

In [None]:
# Read the saved random forest model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('models/model_rf.sav', 'rb') as model_file:
    clf_rf = pickle.load(model_file)

## Compare Models

In [None]:
# Labels and fitted models, kept in matching order for the comparison table and plots
clf_names = ['decision_tree', 'random_forest']
clfs = [clf_tree, clf_rf]

In [None]:
# Train/test metrics for every model; left as the last expression so the table is displayed
compare_models(clfs=clfs, clf_names=clf_names)

In [None]:
# Draw the ROC curves and the feature-importance charts for each fitted model
for i, clf in enumerate(clfs):
    model_label = clf_names[i]
    print(f"Getting results for {model_label} model")
    plot_roc_auc(clf)
    plot_importances(clf)