# Random Forest and XGBoost

In [10]:
# importing relevant packages

# generic
import numpy as np
import pandas as pd

# for training models 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# pickling models
import joblib

# plots
import plotly.express as px
import plotly.graph_objects as go

# packages for model evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

In [4]:
# Load the cleaned dataset; only the target column is taken from it here.
df = pd.read_parquet('../data/cleaned/dataCleanWMedicalUrgency.parquet')
y = df['medical_urgency']

# The feature matrix comes from a separate, already-preprocessed file.
# NOTE(review): this assumes its rows are in the same order as `df` — confirm.
X = pd.read_parquet('../data/cleaned/featuresPreprocessed.parquet')

# One seed shared by every split below so the partitions are reproducible.
SPLIT_SEED = 1234

# Hold out 25% of the data as the test set (stratified on the target).
# Every resulting piece gets a fresh 0..n-1 index.
X_rem, X_test, y_rem, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.25, random_state=SPLIT_SEED, stratify=y)
)

# Carve a validation set (25% of the remainder) out of the non-test data;
# it is used to guide hyperparameter choices.
X_train, X_val, y_train, y_val = (
    part.reset_index(drop=True)
    for part in train_test_split(X_rem, y_rem, test_size=0.25, random_state=SPLIT_SEED, stratify=y_rem)
)

# Small experimental subset of X_rem for quick trials.
# test_size=0.999 sends 99.9% of the rows to X_bin/y_bin, so X_exp/y_exp keep
# roughly 0.1% of X_rem.
# NOTE(review): the previous comment said 10%; that would need test_size=0.9 —
# confirm which is intended.
X_exp, X_bin, y_exp, y_bin = train_test_split(
    X_rem, y_rem, test_size=0.999, random_state=SPLIT_SEED, stratify=y_rem
)
X_exp = X_exp.reset_index(drop=True)
y_exp = y_exp.reset_index(drop=True)

## Random Forest

In [13]:
# Sweep max_depth for the random forest and record train/validation accuracy
# at each depth, to see where extra depth stops helping on the validation set.
depths = np.arange(25, 76, 5)
train_acc_scores_RF = list()
val_acc_scores_RF = list()

for depth in depths:
    # A fixed random_state makes the sweep reproducible on a fresh kernel, so
    # differences between depths are not confounded by a different bootstrap
    # and feature-sampling seed on every fit. n_jobs=-1 builds the trees in
    # parallel on all cores and does not change the fitted model.
    random_forest_model = RandomForestClassifier(max_depth=depth, random_state=1234, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    train_acc_scores_RF.append(random_forest_model.score(X_train, y_train))
    val_acc_scores_RF.append(random_forest_model.score(X_val, y_val))

# plotting the train and validation accuracy scores against max_depth
fig1 = go.Figure()
train_scores_RF = go.Scatter(x=depths, y=train_acc_scores_RF, name='Train accuracy')
val_scores_RF = go.Scatter(x=depths, y=val_acc_scores_RF, name='Validation accuracy')
fig1.add_trace(train_scores_RF)
fig1.add_trace(val_scores_RF)
fig1.update_layout(title_text='Accuracy Score Against Max Depth', title_x=0.5, xaxis_title='Max Depth', yaxis_title='Accuracy')
fig1.show()

## XGBoost

Now let's go for maximum accuracy with XGBoost. 

In [11]:
# fitting a scaler
# NOTE: the previous version called `scaler.fit_transform(X_rem)` and
# `scaler.transform(X_test)` but threw the returned arrays away. Those calls
# return new arrays rather than modifying their input, so the model below was
# always trained and scored on the unscaled features. The discarded transforms
# are removed here so the cell no longer looks as if scaling is applied; the
# scaler is still fitted on X_rem so the `scaler` name stays available.
# Tree-based boosters split on per-feature thresholds, so standardising the
# inputs is not expected to be needed for this model.
scaler = StandardScaler()
scaler.fit(X_rem)

# making the model (trained on the train + validation rows, unscaled)
xgboost_model = XGBClassifier()
xgboost_model.fit(X_rem, y_rem)

# finding the accuracy score on the held-out test set
xgboost_model.score(X_test, y_test)

0.7385978178675597

In [14]:
# Persist the fitted XGBoost model to disk; joblib.dump returns the list of
# file names it wrote, which the notebook displays as the cell output.
model_path = '../model/xgboost_model'
joblib.dump(xgboost_model, model_path)

['../model/xgboost_model']

In [12]:
# Predict classes for the held-out test set and print per-class
# precision / recall / F1 alongside the overall accuracy.
y_pred = xgboost_model.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.76      0.69      0.73     42006
           1       0.71      0.72      0.71     58779
           2       0.76      0.82      0.79     38070

    accuracy                           0.74    138855
   macro avg       0.74      0.74      0.74    138855
weighted avg       0.74      0.74      0.74    138855

