# Successful Discharge Ensemble Algorithms

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initial imports
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Loading data
file_path = Path("Resources/master_data_encoded_rev3.csv")
df_master = pd.read_csv(file_path)
df_master.head()

Unnamed: 0,length_of_stay,y,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,...,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder recurrent episode moderate,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,0,1,0,0,1,1,2,1,0,...,0,1,0,0,0,0,0,0,0,0
1,307,1,0,1,0,1,1,1,2,0,...,0,0,0,0,0,1,0,0,0,0
2,313,0,0,1,0,1,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0
3,601,1,0,2,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,1,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


In [5]:
#Drop some predictor features and male gender dummy variable to remove issue of perfect collinearity
#Define the features set
X = df_master.copy()
X = X.drop(['y','patient_sex_code_M','ethnic_origin_value_Not Hispanic','ethnic_origin_value_Unk','diagnosis_value_Major depressive disorder  recurrent episode  moderate'], axis=1)
X.head()

Unnamed: 0,length_of_stay,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,cans_yb_depression,...,diagnosis_value_Disruptive mood dysregulation disorder,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,1,0,0,1,1,2,1,0,2,...,0,0,0,0,0,0,0,0,0,0
1,307,0,1,0,1,1,1,2,0,2,...,0,0,0,0,0,1,0,0,0,0
2,313,0,1,0,1,0,1,1,1,2,...,0,0,0,0,0,1,0,0,0,0
3,601,0,2,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,0,0,0,0,0,0,1,1,2,...,0,0,0,0,1,0,0,0,0,0


In [6]:
X.shape

(1237, 54)

In [7]:
#Define the target set
y = df_master["y"].ravel()
y[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=1)

### Ensemble Learners
In this section, I compare two ensemble algorithms to determine which algorithm results in the best performance. I trained a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, I have completed the following steps:

1. Trained the model using the training data.
2. Calculated the balanced accuracy score from sklearn.metrics.
3. Printed the confusion matrix from sklearn.metrics.
4. Generated a classication report using the imbalanced_classification_report from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, I printed the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: I used a random state of 1 for each algorithm to ensure consistency between tests

In [9]:
#resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier
brf= BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                               criterion='gini', max_depth=None,
                               max_features='auto', max_leaf_nodes=None,
                               max_samples=None, min_impurity_decrease=0.0,
                               min_samples_leaf=2, min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100,
                               n_jobs=None, oob_score=False, random_state=1,
                               replacement=False, sampling_strategy='auto',
                               verbose=0, warm_start=False)

In [10]:
#calculate the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.668010752688172

In [11]:
#Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
#Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,85,39
Actual 1,65,121


In [13]:
#Displaying results
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.57      0.69      0.65      0.62      0.67      0.45       124
          1       0.76      0.65      0.69      0.70      0.67      0.44       186

avg / total       0.68      0.66      0.67      0.67      0.67      0.45       310



In [14]:
#list the features sorted in descending order by feature import
importances = brf.feature_importances_ 
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    print(f"{X.columns.values[f]}: ({importances[indices[f]]})")

length_of_stay: (0.19201159358389347)
cans_ldf_develop: (0.08966900627471862)
cans_ldf_family: (0.034333258544650136)
cans_ldf_legal: (0.03288923459424876)
cans_ldf_rec: (0.03101959599157039)
cans_ldf_sleep: (0.030463741363834628)
cans_ldf_social_func: (0.029915362690272104)
cans_yb_anger: (0.029156883566783285)
cans_yb_conduct: (0.029029258139398554)
cans_yb_depression: (0.026654413217292963)
cans_yb_opposition: (0.026155667080816368)
cans_yb_substance: (0.025417294283958017)
cans_yb_trauma: (0.02526534131544475)
cans_ys_community: (0.024283041365348028)
cans_ys_family: (0.02404610275801669)
cans_ys_interpersonal: (0.022444063961721394)
cans_ys_optimism: (0.022314483269881433)
cans_ys_relationship: (0.021358527489001515)
cans_ys_resiliency: (0.02098048445800804)
cans_ys_spirit: (0.020212500825391305)
cans_ys_talent: (0.020017635623271256)
cans_ys_resource: (0.019391214112200166)
EPISODE_NUMBER: (0.01920759363252568)
MAXEpisode: (0.014588960202181912)
age2: (0.013272582731714767)
loc_n

### Easy Ensemble AdaBoost Classifier

In [15]:
# Train the classifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=None,
                       random_state=1, replacement=False,
                       sampling_strategy='auto', verbose=0, warm_start=False)

In [16]:
# Calculate the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6532258064516129

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)
cm2 = confusion_matrix(y_test, y_pred)
#Create a DataFrame from the confusion matrix
cm2_df = pd.DataFrame(
    cm2, index=["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])
cm2_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,78,46
Actual 1,60,126
