In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the encoded master dataset from the project's Resources folder
file_path = Path("Resources/master_data_encoded_rev.csv")
df_master = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_master.head(5)

Unnamed: 0,patient_sex_code,length_of_stay,y,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,...,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder recurrent episode moderate,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,1,246,0,1,0,0,1,1,2,1,...,0,1,0,0,0,0,0,0,0,0
1,1,307,1,0,1,0,1,1,1,2,...,0,0,0,0,0,1,0,0,0,0
2,1,313,0,0,1,0,1,0,1,1,...,0,0,0,0,0,1,0,0,0,0
3,0,601,1,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,1,176,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# Build the feature matrix: remove the target column 'y' together with the
# encoded columns that are excluded from the predictor set.
# DataFrame.drop returns a new frame, so df_master itself is left untouched.
X = df_master.drop(
    columns=[
        'y',
        'ethnic_origin_value_Not Hispanic',
        'ethnic_origin_value_Unk',
        'diagnosis_value_Major depressive disorder  recurrent episode  moderate',
    ]
)
X.head()

Unnamed: 0,patient_sex_code,length_of_stay,cans_ldf_develop,cans_ldf_family,cans_ldf_legal,cans_ldf_rec,cans_ldf_sleep,cans_ldf_social_func,cans_yb_anger,cans_yb_conduct,...,diagnosis_value_Disruptive mood dysregulation disorder,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder single episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Persistent depressive disorder dysthymia,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder,diagnosis_value_Unspecified depressive disorder
0,1,246,1,0,0,1,1,2,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,307,0,1,0,1,1,1,2,0,...,0,0,0,0,0,1,0,0,0,0
2,1,313,0,1,0,1,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0
3,0,601,0,2,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,1,176,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Dimensions of the feature matrix as (rows, columns)
X.shape

(1237, 54)

In [5]:
# Define the target set as a 1-D NumPy array taken from the 'y' column.
# Series.ravel() is deprecated in pandas 2.2+ (it emits a FutureWarning);
# Series.to_numpy() yields the same 1-D array without the warning.
y = df_master["y"].to_numpy()
# Peek at the first five labels
y[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [6]:
# Partition features and target into train/test subsets using the default
# split proportions; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Instantiate the scaler, then fit it on the training split only so that no
# information from the test split influences the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X=X_train)

In [8]:
# Apply the fitted scaler to both splits (train first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Configure a 500-tree random forest classifier with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [10]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predict class labels for the scaled test features and display them
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,

In [12]:
# Tabulate actual versus predicted classes on the test split
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a labelled DataFrame: rows are actual classes,
# columns are predicted classes
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,63,73
Actual 1,31,143


In [13]:
# Fraction of test samples whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [14]:
# Summarise model performance: confusion matrix, accuracy, per-class metrics
print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,63,73
Actual 1,31,143


Accuracy Score : 0.6645161290322581
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.46      0.55       136
           1       0.66      0.82      0.73       174

    accuracy                           0.66       310
   macro avg       0.67      0.64      0.64       310
weighted avg       0.67      0.66      0.65       310



In [15]:
#Calculate feature importance in the Random Forest Model
# The fitted model exposes one score per input feature; the next cell pairs
# these scores with X.columns, so the order is assumed to follow the columns of X.
importances = rf_model.feature_importances_
importances

array([0.01455029, 0.16910613, 0.0139517 , 0.02578674, 0.01561401,
       0.02234382, 0.02254133, 0.02593708, 0.02529569, 0.02072878,
       0.02412405, 0.02577484, 0.01820378, 0.02560165, 0.02785181,
       0.02841834, 0.02325277, 0.02551622, 0.02199705, 0.02306973,
       0.02686486, 0.02351409, 0.02631058, 0.03109352, 0.03061922,
       0.0575895 , 0.02737037, 0.00889697, 0.00462285, 0.00313853,
       0.01000932, 0.00834294, 0.00831413, 0.        , 0.00797498,
       0.010947  , 0.00840543, 0.01173407, 0.00464424, 0.00284481,
       0.00625853, 0.00347044, 0.00356567, 0.00867477, 0.00392095,
       0.00608323, 0.00397772, 0.00661479, 0.00926735, 0.01343434,
       0.00418533, 0.00617476, 0.00653594, 0.00493292])

In [16]:
# Pair each importance score with its column name and list the pairs from
# highest to lowest score
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.16910613368905275, 'length_of_stay'),
 (0.057589498243740024, 'age2'),
 (0.031093523526235536, 'EPISODE_NUMBER'),
 (0.03061922195295575, 'MAXEpisode'),
 (0.028418339040979235, 'cans_ys_family'),
 (0.02785181425082267, 'cans_ys_community'),
 (0.027370370347069364, 'loc_num'),
 (0.026864858310666352, 'cans_ys_spirit'),
 (0.02631058139197955, 'cans_ys_resource'),
 (0.02593707677558346, 'cans_ldf_social_func'),
 (0.025786744511491638, 'cans_ldf_family'),
 (0.025774839977878806, 'cans_yb_opposition'),
 (0.025601651459708778, 'cans_yb_trauma'),
 (0.02551622305795781, 'cans_ys_optimism'),
 (0.025295689679623538, 'cans_yb_anger'),
 (0.024124052418674078, 'cans_yb_depression'),
 (0.023514087588880565, 'cans_ys_talent'),
 (0.023252767695893982, 'cans_ys_interpersonal'),
 (0.023069729701120752, 'cans_ys_resiliency'),
 (0.022541326356497908, 'cans_ldf_sleep'),
 (0.02234382400710603, 'cans_ldf_rec'),
 (0.02199705093704055, 'cans_ys_relationship'),
 (0.020728781263829748, 'cans_yb_conduct'),
 (0