In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the encoded master dataset from the Resources folder into a DataFrame,
# then preview its first five rows as the cell output.
file_path = Path("Resources/master_data_encoded_rev2.csv")
df_master = pd.read_csv(filepath_or_buffer=file_path)
df_master.head(n=5)

Unnamed: 0,length_of_stay,y,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,...,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder recurrent episode moderate,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,0,1,0,0,1,1,2,1,0,...,0,1,0,0,0,0,0,0,0,0
1,307,1,0,1,0,1,1,1,2,0,...,0,0,0,0,0,1,0,0,0,0
2,313,0,0,1,0,1,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0
3,601,1,0,2,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,1,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# Define the feature set: start from the master frame and remove the target
# column 'y' together with selected dummy columns (including the male sex
# dummy) so the encoded features are not perfectly collinear.
# DataFrame.drop returns a new frame, so df_master itself is left untouched.
# NOTE(review): the double spaces inside the diagnosis column label are
# deliberate and presumably mirror the CSV header -- keep them verbatim.
X = df_master.drop(
    columns=[
        'y',
        'patient_sex_code_M',
        'ethnic_origin_value_Not Hispanic',
        'ethnic_origin_value_Unk',
        'diagnosis_value_Major depressive disorder  recurrent episode  moderate',
    ]
)
X.head()

Unnamed: 0,length_of_stay,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,cans_yb_depression,...,diagnosis_value_Disruptive mood dysregulation disorder,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,246,1,0,0,1,1,2,1,0,2,...,0,0,0,0,0,0,0,0,0,0
1,307,0,1,0,1,1,1,2,0,2,...,0,0,0,0,0,1,0,0,0,0
2,313,0,1,0,1,0,1,1,1,2,...,0,0,0,0,0,1,0,0,0,0
3,601,0,2,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,176,0,0,0,0,0,0,1,1,2,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Sanity check: show the (rows, columns) dimensions of the feature matrix
X.shape

(1237, 54)

In [5]:
# Define the target set: pull the 'y' column out of the master frame as a
# 1-D NumPy array of class labels.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated in
# pandas 2.2+ and emits a FutureWarning; both yield the same 1-D ndarray.
y = df_master["y"].to_numpy()
# Preview the first five labels
y[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [6]:
# Split features and target into training and testing subsets.
# random_state is fixed so the same partition is produced on every run; no
# test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=78)

In [7]:
# Create the StandardScaler and fit it on the training split only, so the
# test split plays no part in estimating the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
# fit() returns the estimator itself, so X_scaler is simply the fitted scaler
X_scaler = scaler

In [8]:
# Apply the fitted scaler to both splits (training first, then testing)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Configure a random forest of 500 trees; the fixed random_state keeps the
# fitted model repeatable between runs.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [10]:
# Train the random forest on the scaled training features and their labels.
# The value returned by fit() is bound back to the rf_model name.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Predict class labels for the scaled test features; the bare name on the
# last line displays the resulting array as the cell output.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,

In [12]:
# Tabulate true vs. predicted labels for the test split, then wrap the counts
# in a labelled DataFrame (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,62,74
Actual 1,30,144


In [13]:
# Compute the accuracy score of the test-set predictions against the true labels
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Report the evaluation summary in order: confusion matrix (rich display),
# accuracy score, then the per-class classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Heading and report are emitted on separate lines via the newline separator
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,62,74
Actual 1,30,144


Accuracy Score : 0.6645161290322581
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.46      0.54       136
           1       0.66      0.83      0.73       174

    accuracy                           0.66       310
   macro avg       0.67      0.64      0.64       310
weighted avg       0.67      0.66      0.65       310



In [15]:
# Read the fitted random forest's feature importance scores and display them.
# NOTE(review): presumably one score per column of X, in column order -- the
# later cell pairs them with X.columns on that assumption.
importances = rf_model.feature_importances_
importances

array([0.17463453, 0.01296736, 0.02582246, 0.01490564, 0.0212561 ,
       0.02106476, 0.02541608, 0.02528493, 0.02032914, 0.02516741,
       0.02560736, 0.0181071 , 0.02558424, 0.02728326, 0.0296664 ,
       0.0242469 , 0.02558776, 0.0220965 , 0.02376203, 0.02738204,
       0.02375218, 0.0243466 , 0.03030974, 0.03085514, 0.05745259,
       0.02759276, 0.01440389, 0.00957179, 0.00415925, 0.00341899,
       0.00941417, 0.00810076, 0.00832121, 0.        , 0.00806409,
       0.01042921, 0.00872364, 0.01173074, 0.00460008, 0.00281528,
       0.00640588, 0.00339657, 0.00396255, 0.00813116, 0.0043115 ,
       0.00582136, 0.00392279, 0.00697091, 0.00915535, 0.01312035,
       0.00380327, 0.00576937, 0.00618695, 0.00480791])

In [17]:
# Rank the features: pair every importance score with its column name and
# sort the (score, name) pairs from highest to lowest.
sorted(
    zip(rf_model.feature_importances_, X.columns.tolist()),
    reverse=True,
)

[(0.17463452638468982, 'length_of_stay'),
 (0.05745259027978526, 'age2'),
 (0.030855144369257344, 'MAXEpisode'),
 (0.030309735032642404, 'EPISODE_NUMBER'),
 (0.029666395658491374, 'cans_ys_family'),
 (0.027592756622842416, 'loc_num'),
 (0.027382036005055545, 'cans_ys_spirit'),
 (0.027283261542011376, 'cans_ys_community'),
 (0.025822458409661634, 'cans_ldf_family'),
 (0.02560735576611729, 'cans_yb_opposition'),
 (0.025587762748340067, 'cans_ys_optimism'),
 (0.025584241294678945, 'cans_yb_trauma'),
 (0.02541607626600431, 'cans_ldf_social_func'),
 (0.02528493190912224, 'cans_yb_anger'),
 (0.025167405487120296, 'cans_yb_depression'),
 (0.024346600060231434, 'cans_ys_resource'),
 (0.024246896560606895, 'cans_ys_interpersonal'),
 (0.023762027055435113, 'cans_ys_resiliency'),
 (0.023752180819152518, 'cans_ys_talent'),
 (0.022096504521486544, 'cans_ys_relationship'),
 (0.021256095230169497, 'cans_ldf_rec'),
 (0.02106475633148173, 'cans_ldf_sleep'),
 (0.020329139502519196, 'cans_yb_conduct'),
 