In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data
# Read the CSV at Resources/master_data_encoded_rev3.csv into df_master and
# preview its first rows.
# NOTE(review): `Path` here is imported from the third-party `path` package,
# not the standard-library `pathlib` -- confirm that dependency is intended.
file_path = Path("Resources/master_data_encoded_rev3.csv")
df_master = pd.read_csv(file_path)
df_master.head()

Unnamed: 0,length_of_stay,y,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,...,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder recurrent episode moderate,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,0,1,0,0,1,1,2,1,0,...,0,1,0,0,0,0,0,0,0,0
1,307,1,0,1,0,1,1,1,2,0,...,0,0,0,0,0,1,0,0,0,0
2,313,0,0,1,0,1,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0
3,601,1,0,2,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,1,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


In [3]:
#Define the features set
#Everything in df_master is kept as a predictor except the target 'y' and the
#dummy/predictor columns listed below, which are left out to avoid perfect
#collinearity among the encoded groups (e.g. the male gender dummy).
excluded_columns = [
    'y',
    'patient_sex_code_M',
    'ethnic_origin_value_Not Hispanic',
    'ethnic_origin_value_Unk',
    'diagnosis_value_Major depressive disorder  recurrent episode  moderate',
]
#drop() returns a new DataFrame, so df_master itself is left untouched
X = df_master.drop(columns=excluded_columns)
X.head()

Unnamed: 0,length_of_stay,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,cans_yb_depression,...,diagnosis_value_Disruptive mood dysregulation disorder,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,1,0,0,1,1,2,1,0,2,...,0,0,0,0,0,0,0,0,0,0
1,307,0,1,0,1,1,1,2,0,2,...,0,0,0,0,0,1,0,0,0,0
2,313,0,1,0,1,0,1,1,1,2,...,0,0,0,0,0,1,0,0,0,0
3,601,0,2,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,0,0,0,0,0,0,1,1,2,...,0,0,0,0,1,0,0,0,0,0


In [4]:
#Check the dimensions of the feature matrix as (rows, columns)
X.shape

(1237, 54)

In [5]:
#Define the target set
#Extract the 'y' column as a plain 1-D NumPy array. Series.to_numpy() is used
#rather than Series.ravel(), which is deprecated in recent pandas releases and
#emits a FutureWarning; the resulting array is the same.
y = df_master["y"].to_numpy()
#Peek at the first five labels
y[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [6]:
#Split features and target into training (75%) and testing (remaining 25%) sets.
#random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=78)

In [7]:
#Creating a StandardScaler instance
scaler = StandardScaler()
#Fitting the Standard Scaler with the training data
#Only X_train is used here, so the scaling parameters are learned without
#looking at the test rows.
X_scaler = scaler.fit(X_train)

In [8]:
#Scaling the data
#The scaler fitted on the training split is applied to both splits, so train
#and test features go through the same transformation.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
#Set up the random forest: 500 trees, with a fixed seed so that repeated
#runs build the same forest
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [10]:
#Fitting the model
#Train on the scaled training features and their labels; the value returned
#by fit() is bound back to rf_model.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
#Making predictions on the scaled testing data and displaying them
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,

In [12]:
#Confusion matrix of true test labels against the model's predictions
cm = confusion_matrix(y_test, predictions)
#Wrap the matrix in a labelled DataFrame so rows (actual classes) and
#columns (predicted classes) are easy to read
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,64,72
Actual 1,25,149


In [13]:
#Calculating the accuracy score
#Compares the true test labels with the model's predictions
acc_score = accuracy_score(y_test, predictions)

In [14]:
#Displaying results
#Shows, in order: the labelled confusion matrix, the overall accuracy, and the
#per-class classification report for the test set.
print("Confusion Matrix")
#display() renders the DataFrame as a rich table instead of plain text
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,64,72
Actual 1,25,149


Accuracy Score : 0.6870967741935484
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.47      0.57       136
           1       0.67      0.86      0.75       174

    accuracy                           0.69       310
   macro avg       0.70      0.66      0.66       310
weighted avg       0.69      0.69      0.67       310



In [15]:
#Calculate feature importance in the Random Forest Model
#feature_importances_ holds one value per feature the model was fitted on
importances = rf_model.feature_importances_
importances

array([0.16758883, 0.01349335, 0.02520698, 0.01470404, 0.02162868,
       0.02122037, 0.02601467, 0.0225849 , 0.01967065, 0.02358139,
       0.02493158, 0.01801905, 0.0241135 , 0.026329  , 0.02938129,
       0.02356009, 0.0249156 , 0.02217285, 0.02273327, 0.02624167,
       0.0241329 , 0.02460503, 0.02965991, 0.03022061, 0.07989324,
       0.02768251, 0.01418683, 0.00912545, 0.00430649, 0.00321821,
       0.008775  , 0.00828544, 0.00814776, 0.        , 0.00826742,
       0.01022003, 0.00836746, 0.01133095, 0.0037268 , 0.00297789,
       0.00646632, 0.00344302, 0.00379713, 0.00777666, 0.0044059 ,
       0.00587633, 0.0040189 , 0.00617725, 0.00851188, 0.01348377,
       0.00397274, 0.00576516, 0.00617961, 0.00490364])

In [16]:
#We can sort the features by their importance
#Each importance is paired with the column name at the same position in X,
#and the (importance, name) tuples are listed from largest to smallest.
#Tuples with equal importance are ordered by column name (also descending).
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.16758882606558934, 'length_of_stay'),
 (0.07989323906430108, 'age2'),
 (0.030220614056471116, 'MAXEpisode'),
 (0.02965991227315902, 'EPISODE_NUMBER'),
 (0.029381294059245153, 'cans_ys_family'),
 (0.027682510177274727, 'loc_num'),
 (0.026328995331523726, 'cans_ys_community'),
 (0.026241668616575228, 'cans_ys_spirit'),
 (0.026014668651097386, 'cans_ldf_social_func'),
 (0.02520697762985964, 'cans_ldf_family'),
 (0.024931581645804626, 'cans_yb_opposition'),
 (0.02491559953473174, 'cans_ys_optimism'),
 (0.024605034104729816, 'cans_ys_resource'),
 (0.024132898798080765, 'cans_ys_talent'),
 (0.024113504312257734, 'cans_yb_trauma'),
 (0.023581392603250012, 'cans_yb_depression'),
 (0.0235600907228781, 'cans_ys_interpersonal'),
 (0.022733270666944316, 'cans_ys_resiliency'),
 (0.02258490337353637, 'cans_yb_anger'),
 (0.022172847678090243, 'cans_ys_relationship'),
 (0.02162868178380332, 'cans_ldf_rec'),
 (0.021220372068789413, 'cans_ldf_sleep'),
 (0.019670646301190062, 'cans_yb_conduct'),
 (0.