# Models: Decision Tree, Random Forest

In [26]:
# Import relevant packages
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve, plot_roc_curve, plot_precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from utils import process_data

In [4]:
data = process_data(type_ = 'normal')

Beginning data processing ...
Reading in batched data ...


100%|███████████████████████████████████████████| 19/19 [00:07<00:00,  2.67it/s]


Splitting data into train and test ...
Scaling data ...
Completed normal data processing.


In [5]:
X_train = data['X_train_scaled']
y_train = data['y_train']

X_test = data['X_test_scaled']
y_test = data['y_test']

## Useful Functions

In [6]:
def plot_roc(clf, X, y, title):
    fpr, tpr, thresholds = roc_curve(y, clf.predict_proba(X)[:, 1])
    auc_ = auc(fpr, tpr)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], '--')
    plt.plot(fpr, tpr, label='area = {:.3f}'.format(auc_))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(title)
    plt.legend(loc='best')
    plt.show()

In [11]:
def plot_impurities(clf):
    importances_impurity = clf.feature_importances_
    impurity_importances = pd.Series(importances_impurity).nlargest(10) 
    # select the 10 X variables with largest feature importance values
    fig, ax = plt.subplots()
    impurity_importances.plot.bar(ax=ax)
    ax.set_title("Feature importances using MDI (mean decrease in impurity)")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()

In [12]:
def plot_permutations(clf):
    importances_permutation = permutation_importance(
        clf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
    )
    permutation_importances = pd.Series(importances_permutation.importances_mean).nlargest(10)

    fig, ax = plt.subplots()
    permutation_importances.plot.bar(ax=ax)
    ax.set_title("Feature importances using permutation importances")
    ax.set_ylabel("Mean decrease in accuracy")
    fig.tight_layout()

In [19]:
def grid_search(clf, X_train, y_train, criterion, param_grid):
    # Testing through a 5-fold CV and finding the combination that yields the highest criterion
    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        scoring=criterion,
                        verbose=2,
                        cv=5,
                        n_jobs=-1)

    grid_result = grid.fit(X_train, y_train)
    return grid_result

## Decision Tree Model

In [20]:
# Specify scoring criterion
criterion = make_scorer(roc_auc_score, needs_proba=True)

In [22]:
clf_tree = DecisionTreeClassifier()

# param_grid = dict(criterion=['gini', 'entropy'],
#                   min_samples_split=[2, 5, 10],
#                   max_depth=[2, 5, 10],
#                   ccp_alpha=[0, 0.00001, 0.0001])

param_grid = dict(max_depth = [2])

grid_result_tree = grid_search(clf_tree, X_train, y_train, criterion, param_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [25]:
# Choose best parameters from hyperparameter tuning
clf_tree = DecisionTreeClassifier(**grid_result_tree.best_params_).fit(X_train, y_train)

In [28]:
y_pred_train = clf_tree.predict(X_train)
y_pred_test = clf_tree.predict(X_test)

results_tree = pd.DataFrame(dict(model = ['decision tree'],
                                 accuracy_train = [accuracy_score(y_train, y_pred_train)],
                                 recall_train = [recall_score(y_train, y_pred_train)],
                                 precision_train = [precision_score(y_train, y_pred_train)],
                                 roc_auc_train = [roc_auc_score(y_train, y_pred_train)],
                                 accuracy_test = [accuracy_score(y_test, y_pred_test)],
                                 recall_test = [recall_score(y_test, y_pred_test)],
                                 precision_test = [precision_score(y_test, y_pred_test)],
                                 roc_auc_test = [roc_auc_score(y_test, y_pred_test)],
                                ))
results_tree

Unnamed: 0,model,accuracy_train,recall_train,precision_train,roc_auc_train,accuracy_test,recall_test,precision_test,roc_auc_test
0,decision tree,0.999426,0.31379,1.0,0.656895,0.999436,0.326733,1.0,0.663366
