# Random Forest Classification Model - Medical No Shows

### Import Dependencies

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

### Load Dataset for modelling

In [2]:
data = pd.read_csv('../data/cleanData/appointment_dataset.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (110521, 29)


Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,gender_yn,time_between_sch_appt,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,...,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received,no_show_yn
0,5698125,678814354693913,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,5698246,54593736353128,0,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,5699393,4369164743113,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,5694371,54523365344664,0,1,3,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5698279,62917816238835,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


In [3]:
#check for nulls and verify that attributes for modelling are the correct datatypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110521 entries, 0 to 110520
Data columns (total 29 columns):
appointment_id                   110521 non-null int64
patient_id                       110521 non-null int64
repeat_patient_yn                110521 non-null int64
gender_yn                        110521 non-null int64
time_between_sch_appt            110521 non-null int64
same_day_appt_yn                 110521 non-null int64
within_week_appt_yn              110521 non-null int64
advanced_appt_yn                 110521 non-null int64
monday_yn                        110521 non-null int64
tuesday_yn                       110521 non-null int64
wednesday_yn                     110521 non-null int64
thursday_yn                      110521 non-null int64
friday_yn                        110521 non-null int64
saturday_yn                      110521 non-null int64
neighborhood_income_lower_yn     110521 non-null int64
neighborhood_income_middle_yn    110521 non-null int64
neigborho

### Split dataset for training vs testing

In [4]:
# store the target variable in y and everything else goes in X
y = data['no_show_yn']

# drop irrelevent columns 
X = data.drop(['appointment_id', 'patient_id', 'time_between_sch_appt', 'no_show_yn'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

X_train.shape

(82890, 25)

### Create the classifier model and the parameter grid for GridSearch

In [18]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

param_grid = {"n_estimators" : [10, 20, 50, 100],
             "max_depth" : [None, 6, 8, 10],
             "max_leaf_nodes": [None, 5, 10, 20]}
             #"min_impurity_split": [0.1, 0.2, 0.3, 0.4]}

### Initialize the GridSearch to tune hyperparameters

In [19]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

### Execute model tuning

In [20]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.3s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.3s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.4s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.6s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.6s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.6s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=50 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=50, total=   1.5s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=50 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=50, total=   1.5s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=5

[CV]  max_depth=6, max_leaf_nodes=None, n_estimators=100, total=   1.6s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.1s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.3s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.3s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=50 ..................
[CV] 

[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   1.0s
[CV] max_depth=8, max_leaf_nodes=5, n_estimators=100 .................
[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   1.0s
[CV] max_depth=8, max_leaf_nodes=5, n_estimators=100 .................
[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   1.0s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=20 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=20, total=   0.3s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=20 .................
[CV] .

[CV] . max_depth=10, max_leaf_nodes=10, n_estimators=50, total=   0.6s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=50 ................
[CV] . max_depth=10, max_leaf_nodes=10, n_estimators=50, total=   0.6s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   1.2s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   1.2s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   1.2s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] . max_depth=10, max_leaf_nodes=20, n_estimators=10, total=   0.2s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] . max_depth=10, max_leaf_nodes=20, n_estimators=10, total=   0.2s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] .

[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### View model results

In [21]:
print(f"Best parameters: {grid.best_params_}")
print(f"Train set score: {grid.score(X_train, y_train)}")
print(f"Test set score: {grid.score(X_test, y_test)}")

Best parameters: {'max_depth': 8, 'max_leaf_nodes': None, 'n_estimators': 100}
Train set score: 0.7983833996863313
Test set score: 0.7979805291158482


## Save the model for hosting

In [12]:
import pickle

with open('RF_model.pkl','wb') as f:
    pickle.dump(grid, f)

### Load saved model

In [13]:
with open('RF_model.pkl', 'rb') as f:
    model = pickle.load(f)

### Test saved model

In [14]:
print(f"Test set score: {model.score(X_test, y_test)}")

Test set score: 0.7978719554123991


In [15]:
model

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### Check out model coefficients

In [16]:
model.cv_results_

{'mean_fit_time': array([0.25135446, 0.44609737, 0.93140141, 1.83373499, 0.19823162,
        0.38460732, 0.93178463, 1.76930118, 0.14245407, 0.27375746,
        0.66328359, 1.36807934, 0.05706541, 0.11060365, 0.2316939 ,
        0.46664373, 0.11663636, 0.19531997, 0.42388463, 0.78667672,
        0.09348798, 0.17963998, 0.44452262, 0.87382865, 0.08717799,
        0.17395155, 0.41794745, 0.7731297 , 0.04969025, 0.09617106,
        0.22323434, 0.42494289, 0.10198418, 0.21165164, 0.52952663,
        1.07125839, 0.12346927, 0.24678938, 0.53830274, 1.01954333,
        0.1207246 , 0.284326  , 0.5342439 , 0.95034361, 0.05835263,
        0.11426822, 0.25363   , 0.47528537, 0.12762435, 0.24841936,
        0.59491467, 1.12935249, 0.13227177, 0.24080499, 0.57792703,
        1.13873641, 0.11375793, 0.20929567, 0.50495593, 1.05790742,
        0.06141329, 0.11127377, 0.28118499, 0.5338947 , 0.17684984,
        0.30959916, 0.6508348 , 1.27534056, 0.12686189, 0.23850274,
        0.58540853, 1.21663324,