# Load Dataset

Load the dataset (a .csv file) into a pandas DataFrame and display the first 5 rows.

In [None]:
import pandas as pd
import numpy as np

# Read the study dataset from disk and preview its first five rows.
csv_path = "placehoder_csv"
acl1 = pd.read_csv(csv_path)
acl1.head(5)

# 1. Data pre-processing

### 1.1 Test-Train Split
This step separates the features used in model development from the outcome of interest (overnight stay). The dataset is then divided into an 85:15 train-test split in which 85% of patients (training cohort) were used in model development while the remaining 15% (testing cohort) were used to test model performance.

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Name of the outcome column (overnight stay).
target = 'PLOS'

# Predictor columns used for model development. The race columns
# ('RACE_NEW', 'Black_AA', 'Asian') are kept here so the test cohort can be
# stratified by race later; they are dropped before the model is fitted.
feature_columns = [
    'SEX', 'RACE_NEW', 'CPT', 'INOUT', 'AGE', 'ANESTHES', 'ELECTSURG',
    'HEIGHT', 'WEIGHT', 'SMOKE', 'HXCHF', 'HYPERMED', 'EMERGNCY', 'WNDCLAS', 'ASACLAS', 'OPTIME',
    'ADMQTR', 'HTOODAY', '4th_Quartile', '2nd_Quartile', '3rd_Quartile', 'Elective',
    'Male', 'Non_Binary', 'Black_AA', 'Asian',
    'ANESTHES_Epidural', 'ANESTHES_Local', 'ANESTHES_IV_Sedation', 'ANESTHES_None', 'ANESTHES_Other', 'ANESTHES_Regional', 'ANESTHES_Spinal', 'ANESTHES_Unknown',
    'ASACLAS_Mild_Disturb', 'ASACLAS_Severe_Disturb', 'ASACLAS_Life_Threat', 'ASACLAS_Moribund', 'ASACLAS_None_Assigned', 'WNDCLAS_Clean_Contaminated', 'WNDCLAS_Contaminated', 'WNDCLAS_Dirty_Infected',
]

# NOTE(review): this cell previously indexed `acl3`, which is never defined in
# this notebook; `acl1` is the only DataFrame that is loaded. Confirm that the
# CSV already contains the encoded/dummy columns listed above.
X = acl1[feature_columns]

# Keep y as a one-column DataFrame: later cells index it with y_train[target].
y = acl1[[target]]

# Split the data 85:15 (train:test), stratified on the outcome so both cohorts
# keep the same proportion of each outcome class; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=231,
    test_size=0.15,
    shuffle=True,
    stratify=y[target],
)

### 1.2 Over-Sampling and Under Sampling
Due to an overrepresentation of patients who self-identified as White or had same-day discharge, a combination of Synthetic Minority Oversampling Technique (SMOTE) and undersampling was employed to synthetically balance the training cohort. SMOTE synthetically increased all minority classes including overnight stay and race to re-balance the racial disparities that exist within this dataset. Undersampling randomly decreased the number of patients with characteristics overrepresented in the dataset.

A counter was created to show the number of patients in the same-day discharge group before and after employing the aforementioned resampling techniques.

In [None]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Outcome class counts in the training cohort before resampling.
counter = Counter(y_train[target])
print('Before', counter)
# NOTE(review): iterating a DataFrame yields its column labels, so this counts
# each column name once rather than counting patients -- confirm what was
# meant to be tallied here.
counter = Counter(X_train)
print('Before', counter)

# Oversample the minority outcome class with SMOTE, then randomly undersample
# the majority class. With a float `sampling_strategy`, imbalanced-learn
# targets that minority/majority ratio after each step (0.1, then 0.5).
over = SMOTE(random_state=423, sampling_strategy=0.1)
# Seed the undersampler as well; without it the rows that are kept differ on
# every run even though SMOTE and the train/test split are seeded.
under = RandomUnderSampler(random_state=423, sampling_strategy=.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Only the training cohort is resampled; the test cohort is left untouched and
# its outcome is simply extracted as a Series.
X_comp_train, y_comp_train = pipeline.fit_resample(X_train, y_train[target])
y_comp_test = y_test[target]

# Outcome class counts in the training cohort after resampling.
counter = Counter(y_comp_train)
print('After', counter)
# NOTE(review): as above, this counts column labels, not patients.
counter = Counter(X_comp_train)
print('After', counter)

### 1.3 Stratify Test Data by Race
In order to assess if the model performs equitably irrespective of patient race, the dataset was stratified by whether a patient self-identified as White, Black or African American, or Asian. First, we dropped the race variables from the test and train datasets. Second, we created new test subsets that each included only patients of a specific race. Third, we dropped all race variables from each of the race-specific datasets.


In [None]:
# From here on, train on the resampled cohort and evaluate against the
# (un-resampled) test outcome Series.
X_train, y_train, y_test = X_comp_train, y_comp_train, y_comp_test

# Race-related columns that must not be visible to the model.
race_columns = ['RACE_NEW', 'Black_AA', 'Asian']

X_train_no_re = X_train.drop(columns=race_columns)
X_test_no_re = X_test.drop(columns=race_columns)

# Row masks over the test cohort, one per racial group.
is_white = X_test['RACE_NEW'] == 0
is_black_aa = X_test['Black_AA'] == 1
is_asian = X_test['Asian'] == 1

# Race-specific test subsets (race columns still present).
X_test_White = X_test.loc[is_white]
X_test_B_or_AA = X_test.loc[is_black_aa]
X_test_Asian = X_test.loc[is_asian]

# The same subsets with the race columns removed, ready for prediction.
X_test_White1 = X_test_White.drop(columns=race_columns)
X_test_B_or_AA1 = X_test_B_or_AA.drop(columns=race_columns)
X_test_Asian1 = X_test_Asian.drop(columns=race_columns)

# Matching outcome subsets, selected with the same row masks.
y_read_test_race = y_test
y_White = y_read_test_race.loc[is_white]
y_Black_AA = y_read_test_race.loc[is_black_aa]
y_Asian = y_read_test_race.loc[is_asian]

# 2. Build and Train a Random Forest Classifier

### 2.1 Hyperparameter Tuning
Randomized Search Cross Validation (RSCV), a resampling method that reduces bias and improves model generalizability by training decision trees on subsets of data, was performed to identify hyperparameters that optimized the Random Forest model.


In [None]:
# Basic randomized search over candidate hyperparameters (number of trees and
# maximum tree depth) to find values that suit the training data.

# RandomForestClassifier was used below without ever being imported.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV as RSCV

# Search space: 5, 10, ..., 295 trees and a handful of odd depths.
param_grid = {'n_estimators': np.arange(5, 300, 5),
              'max_depth': [3, 5, 7, 9, 11, 13, 15]}

# Try 15 randomly drawn settings, fitting on the resampled training cohort
# with the race columns removed. `fit` returns the fitted search object, so
# `rfm1` can be used directly for prediction.
rfm1 = RSCV(RandomForestClassifier(), param_grid, n_iter=15).fit(X_train_no_re, y_train)

# 3. Plot Confusion Matrix and ROC Curve

### 3.1 Confusion Matrix and ROC Curve on Training Data
Model performance was assessed using the following measurements: area under the curve (AUC), accuracy, and F1 score for overnight stay. An AUC of 0.8 to 0.9 was considered good and > 0.9 considered excellent. Accuracy represents the number of correctly predicted true positives and true negatives divided by the total number of cases. F1 score represents the harmonic mean between the model’s precision and recall with a score >= 0.7 being considered good.

In [None]:
from sklearn.metrics import RocCurveDisplay as plot_roc_curve

# ROC curve of the tuned model on the (resampled) training cohort.
plot_roc_curve.from_estimator(estimator=rfm1, X=X_train_no_re, y=y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate predictions from the tuned random forest on the training features
# (race columns removed).
rfm1_predictions = rfm1.predict(X_train_no_re)

# Tabulate true vs. predicted labels, ordered by the model's own class labels.
cm_rfm1 = confusion_matrix(y_true=y_train, y_pred=rfm1_predictions,
                           labels=rfm1.classes_)

disp = ConfusionMatrixDisplay(cm_rfm1, display_labels=rfm1.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 on the training cohort.
train_report = classification_report(y_true=y_train, y_pred=rfm1_predictions)
print(train_report)

### 3.2 Confusion Matrix and ROC Curve on Test Data, Stratified by Race
Model performance was assessed stratified by race, using the data subsets created in **step 1.3**.

In [None]:
# ROC curve of the tuned model on the full test cohort (all races).
plot_roc_curve.from_estimator(estimator=rfm1, X=X_test_no_re, y=y_test)

In [None]:
# Predict on the full test cohort (race columns removed).
rfm1_predictions = rfm1.predict(X_test_no_re)

# Tabulate true vs. predicted labels, ordered by the model's own class labels.
cm_rfm1 = confusion_matrix(y_true=y_test, y_pred=rfm1_predictions,
                           labels=rfm1.classes_)

disp1 = ConfusionMatrixDisplay(cm_rfm1, display_labels=rfm1.classes_)
disp1.plot()

In [None]:
# Per-class precision, recall and F1 on the full test cohort.
test_report = classification_report(y_true=y_test, y_pred=rfm1_predictions)
print(test_report)

In [None]:
# Test cohort, White patients only: ROC curve.
plot_roc_curve.from_estimator(estimator=rfm1, X=X_test_White1, y=y_White)

In [None]:
# Predict for White patients in the test cohort (race columns removed).
rfm1_predictions = rfm1.predict(X_test_White1)

# Tabulate true vs. predicted labels, ordered by the model's own class labels.
cm_rfm1 = confusion_matrix(y_true=y_White, y_pred=rfm1_predictions,
                           labels=rfm1.classes_)

disp2 = ConfusionMatrixDisplay(cm_rfm1, display_labels=rfm1.classes_)
disp2.plot()

In [None]:
# Per-class precision, recall and F1 for White patients in the test cohort.
white_report = classification_report(y_true=y_White, y_pred=rfm1_predictions)
print(white_report)

In [None]:
# Test cohort, Black or African American patients only: ROC curve.
plot_roc_curve.from_estimator(estimator=rfm1, X=X_test_B_or_AA1, y=y_Black_AA)

In [None]:
# Predict for Black or African American patients in the test cohort
# (race columns removed).
rfm1_predictions = rfm1.predict(X_test_B_or_AA1)

# Tabulate true vs. predicted labels, ordered by the model's own class labels.
cm_rfm1 = confusion_matrix(y_true=y_Black_AA, y_pred=rfm1_predictions,
                           labels=rfm1.classes_)

disp3 = ConfusionMatrixDisplay(cm_rfm1, display_labels=rfm1.classes_)
disp3.plot()

In [None]:
# Per-class precision, recall and F1 for Black or African American patients
# in the test cohort.
black_aa_report = classification_report(y_true=y_Black_AA, y_pred=rfm1_predictions)
print(black_aa_report)

In [None]:
# Test cohort, Asian patients only: ROC curve.
plot_roc_curve.from_estimator(estimator=rfm1, X=X_test_Asian1, y=y_Asian)

In [None]:
# Predict for Asian patients in the test cohort (race columns removed).
rfm1_predictions = rfm1.predict(X_test_Asian1)

# Tabulate true vs. predicted labels, ordered by the model's own class labels.
cm_rfm1 = confusion_matrix(y_true=y_Asian, y_pred=rfm1_predictions,
                           labels=rfm1.classes_)

disp4 = ConfusionMatrixDisplay(cm_rfm1, display_labels=rfm1.classes_)
disp4.plot()

In [None]:
# Per-class precision, recall and F1 for Asian patients in the test cohort.
asian_report = classification_report(y_true=y_Asian, y_pred=rfm1_predictions)
print(asian_report)

### 3.3 Plot ROC Curve for all Patients
The following provides a visualization of how the model performed via ROC Curve stratified by race.

In [None]:
# Combined ROC plot: one curve for the whole test cohort and one per racial
# group, each labelled with its own area under the curve (AUC).
from sklearn.metrics import auc, roc_curve

# Column 1 of predict_proba is the probability of the second class listed in
# rfm1.classes_, which is used here as the positive-class score.
y_pred_prob0 = rfm1.predict_proba(X_test_no_re)[:, 1]
fpr0, tpr0, thresholds0 = roc_curve(y_test, y_pred_prob0)

y_pred_prob1 = rfm1.predict_proba(X_test_White1)[:, 1]
fpr1, tpr1, thresholds1 = roc_curve(y_White, y_pred_prob1)

y_pred_prob2 = rfm1.predict_proba(X_test_B_or_AA1)[:, 1]
fpr2, tpr2, thresholds2 = roc_curve(y_Black_AA, y_pred_prob2)

y_pred_prob3 = rfm1.predict_proba(X_test_Asian1)[:, 1]
fpr3, tpr3, thresholds3 = roc_curve(y_Asian, y_pred_prob3)

# (legend name, false-positive rates, true-positive rates) for each curve.
roc_series = [
    ("All Patients", fpr0, tpr0),
    ("White", fpr1, tpr1),
    ("Black or African American", fpr2, tpr2),
    ("Asian", fpr3, tpr3),
]

# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k-.')

# The AUC is computed from each curve instead of being typed in by hand: the
# previous hard-coded "White (AUC=0.93" label was missing its closing
# parenthesis and would go stale whenever the model is re-fitted.
for group_name, fpr, tpr in roc_series:
    plt.plot(fpr, tpr, label=f"{group_name} (AUC={auc(fpr, tpr):.2f})")

plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title('Receiver Operating Characteristic')

plt.show()