In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the encoded master dataset from disk and preview its first five rows
file_path = Path("Resources/master_data_encoded.csv")
df_master = pd.read_csv(filepath_or_buffer=file_path)
df_master.head(5)

Unnamed: 0,patient_sex_code,length_of_stay,episode_number,y,cans_yb_anger,cans_yb_conduct,cans_yb_depression,cans_yb_impluse,cans_yb_opposition,cans_yb_pschosis,...,diagnosis_value_Adjustment disorder with depressed mood,diagnosis_value_Adjustment disorder with mixed anxiety and depressed mood,diagnosis_value_Attention-deficit hyperactivity disorder combined presentation,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder recurrent episode moderate,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder
0,1,246,2,0,1,0,2,1,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,307,3,1,2,0,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,1,313,3,0,1,1,2,0,1,1,...,0,0,0,0,0,0,0,1,0,0
3,0,601,10,1,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,176,5,1,2,0,1,0,2,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
# Build the feature matrix: every column of the master frame except the
# target ('y') and the predictor columns that are excluded from the model.
# DataFrame.drop returns a new frame, so df_master itself is left unmodified.
X = df_master.drop(
    columns=[
        'y',
        'ethnic_origin_value_Not Hispanic',
        'ethnic_origin_value_Unk',
        'diagnosis_value_Major depressive disorder  recurrent episode  moderate',
    ]
)
X.head()

Unnamed: 0,patient_sex_code,length_of_stay,episode_number,cans_yb_anger,cans_yb_conduct,cans_yb_depression,cans_yb_impluse,cans_yb_opposition,cans_yb_pschosis,cans_yb_substance,...,trauma_csi_value_Yes,diagnosis_value_Adjustment disorder with depressed mood,diagnosis_value_Adjustment disorder with mixed anxiety and depressed mood,diagnosis_value_Attention-deficit hyperactivity disorder combined presentation,diagnosis_value_Generalized anxiety disorder,diagnosis_value_Major depressive disorder single episode unspecified,diagnosis_value_Oppositional defiant disorder,diagnosis_value_Other,diagnosis_value_Posttraumatic stress disorder,diagnosis_value_Unspecified anxiety disorder
0,1,246,2,1,0,2,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,307,3,2,0,2,1,1,1,2,...,1,0,0,0,0,0,0,1,0,0
2,1,313,3,1,1,2,0,1,1,0,...,1,0,0,0,0,0,0,1,0,0
3,0,601,10,2,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,176,5,2,0,1,0,2,0,0,...,1,0,0,0,0,0,1,0,0,0


In [4]:
# Sanity check: dimensions of the feature matrix as (n_rows, n_columns)
X.shape

(1264, 39)

In [5]:
# Define the target vector as a 1-D NumPy array taken from the 'y' column.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# (pandas >= 2.2 emits a FutureWarning); both yield the same 1-D ndarray.
y = df_master["y"].to_numpy()
# Preview the first five labels
y[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [6]:
# Partition features and target into train/test subsets; the fixed seed keeps
# the split reproducible across runs (test size is left at the library default).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Instantiate the standardizer and learn its statistics from the training
# split only. fit() returns the estimator itself, so X_scaler and scaler
# refer to the same fitted object.
scaler = StandardScaler()
scaler.fit(X_train)
X_scaler = scaler

In [8]:
# Apply the scaler (fitted on the training split) to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [9]:
# Configure the random forest: 500 estimators, seeded for reproducibility
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [10]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predict class labels for the scaled held-out test features and display them
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [12]:
# Compare true test labels with the model's predictions
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
# Wrap the raw matrix in a labelled DataFrame: rows are the actual classes,
# columns are the predicted classes
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,67,67
Actual 1,30,152


In [13]:
# Fraction of test samples whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [14]:
# Report model performance on the test split: the labelled confusion matrix,
# the overall accuracy, and the per-class precision/recall/F1 report
print("Confusion Matrix")
display(cm_df)
print("Accuracy Score : {}".format(acc_score))
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,67,67
Actual 1,30,152


Accuracy Score : 0.6930379746835443
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.50      0.58       134
           1       0.69      0.84      0.76       182

    accuracy                           0.69       316
   macro avg       0.69      0.67      0.67       316
weighted avg       0.69      0.69      0.68       316



In [15]:
# Pull the per-feature importance scores from the fitted forest; the array
# has one entry per column of the feature matrix, in column order
importances = rf_model.feature_importances_
importances

array([2.21767816e-02, 2.26478432e-01, 4.21574722e-02, 4.04243674e-02,
       2.64510361e-02, 3.94089365e-02, 3.55818333e-02, 4.17731983e-02,
       1.52258781e-02, 2.66618275e-02, 3.92511873e-02, 4.43429360e-02,
       8.91456348e-02, 4.25325980e-02, 1.27502914e-02, 3.70720535e-03,
       5.45226576e-03, 1.19580342e-02, 1.11647072e-02, 1.23973118e-02,
       7.07109249e-05, 1.10030422e-02, 6.45862074e-03, 6.16431137e-03,
       1.87047562e-02, 1.44187093e-02, 1.74468206e-02, 1.41264881e-02,
       1.30943442e-02, 1.73524585e-02, 7.15369161e-03, 6.21381959e-03,
       9.16791939e-03, 8.67054302e-03, 9.86234525e-03, 1.00348277e-02,
       2.13005143e-02, 9.75447344e-03, 9.95966835e-03])

In [16]:
# Rank features from most to least important; each item in the resulting
# list is an (importance, column name) pair
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.2264784323960909, 'length_of_stay'),
 (0.08914563475176686, 'age2'),
 (0.04434293599023404, 'maxepisode'),
 (0.04253259798161312, 'loc_num'),
 (0.042157472165773285, 'episode_number'),
 (0.041773198274549436, 'cans_yb_opposition'),
 (0.040424367380554195, 'cans_yb_anger'),
 (0.03940893647697648, 'cans_yb_depression'),
 (0.03925118728771311, 'cans_yb_trauma'),
 (0.03558183328600752, 'cans_yb_impluse'),
 (0.02666182751223457, 'cans_yb_substance'),
 (0.02645103614597457, 'cans_yb_conduct'),
 (0.02217678158958788, 'patient_sex_code'),
 (0.021300514335238153, 'diagnosis_value_Other'),
 (0.01870475624336647, 'gen_med_csi_value_2_No General Medical Condition'),
 (0.01744682063407667, 'gen_med_csi_value_2_Unknown'),
 (0.017352458539715083, 'trauma_csi_value_Yes'),
 (0.015225878141424757, 'cans_yb_pschosis'),
 (0.014418709332036995, 'gen_med_csi_value_2_Other'),
 (0.014126488096160716, 'trauma_csi_value_No'),
 (0.013094344237782926, 'trauma_csi_value_Unknown'),
 (0.012750291362116457, 'ethn