# Random Forest Classification Model - Medical No Shows

### Import Dependencies

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

### Load Dataset for modelling

In [2]:
# Load the cleaned appointment dataset from the cleanData directory.
data = pd.read_csv(filepath_or_buffer='../data/cleanData/appointment_dataset2.csv')
print(f"Shape of the data is: {data.shape}")
# Preview the first five rows as the cell's displayed result.
data.head(n=5)

Shape of the data is: (110521, 29)


Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,male_yn,female_yn,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,...,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received,no_show_yn
0,5698125,678814000000000.0,1,0,1,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,5698246,54593700000000.0,0,0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,5699393,4369160000000.0,1,0,1,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,5694371,54523400000000.0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5698279,62917800000000.0,1,0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


In [3]:
# Summarize the frame: per-column non-null counts (to spot missing values) and
# dtypes (to verify the attributes used for modelling have the expected datatypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110521 entries, 0 to 110520
Data columns (total 29 columns):
appointment_id                   110521 non-null int64
patient_id                       110521 non-null float64
repeat_patient_yn                110521 non-null int64
male_yn                          110521 non-null int64
female_yn                        110521 non-null int64
same_day_appt_yn                 110521 non-null int64
within_week_appt_yn              110521 non-null int64
advanced_appt_yn                 110521 non-null int64
monday_yn                        110521 non-null int64
tuesday_yn                       110521 non-null int64
wednesday_yn                     110521 non-null int64
thursday_yn                      110521 non-null int64
friday_yn                        110521 non-null int64
saturday_yn                      110521 non-null int64
neighborhood_income_lower_yn     110521 non-null int64
neighborhood_income_middle_yn    110521 non-null int64
neigbor

### Split dataset for training vs testing

In [4]:
# Features: every column except the two identifier columns and the target itself.
X = data.drop(columns=['appointment_id', 'patient_id', 'no_show_yn'])

# Target vector: the no_show_yn column.
y = data['no_show_yn']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=3,
)

X_train.shape

(82890, 26)

### Create the classifier model and the parameter grid for GridSearch

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its bootstrap sampling and feature sub-sampling are
# repeatable; without a seed each run of the grid search can report different
# scores and a different set of "best" parameters.
# The seed value matches the one used for the train/test split.
model = RandomForestClassifier(random_state=3)

# Hyperparameter grid for the search: 4 x 4 x 4 = 64 candidate combinations.
# None leaves max_depth / max_leaf_nodes unrestricted.
param_grid = {
    "n_estimators": [10, 20, 50, 100],
    "max_depth": [None, 6, 8, 10],
    "max_leaf_nodes": [None, 5, 10, 20],
}

### Initialize the GridSearch to tune hyperparameters

In [6]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every combination in param_grid, evaluated with
# 3-fold cross-validation; verbose=2 prints a log line for each individual fit.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    verbose=2,
)

### Execute model tuning

In [7]:
# Run the grid search on the training split only, so the test split stays unseen.
# 64 parameter combinations x 3 folds = 192 fits, each logged because verbose=2.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.3s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.3s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=10 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=10, total=   0.3s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.5s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.5s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=20 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=20, total=   0.5s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=50 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=50, total=   1.1s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=50 ............
[CV]  max_depth=None, max_leaf_nodes=None, n_estimators=50, total=   1.1s
[CV] max_depth=None, max_leaf_nodes=None, n_estimators=5

[CV]  max_depth=6, max_leaf_nodes=None, n_estimators=100, total=   1.1s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.1s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.1s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=10 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=10, total=   0.1s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=20 ..................
[CV] ... max_depth=6, max_leaf_nodes=5, n_estimators=20, total=   0.2s
[CV] max_depth=6, max_leaf_nodes=5, n_estimators=50 ..................
[CV] 

[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   0.8s
[CV] max_depth=8, max_leaf_nodes=5, n_estimators=100 .................
[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   0.8s
[CV] max_depth=8, max_leaf_nodes=5, n_estimators=100 .................
[CV] .. max_depth=8, max_leaf_nodes=5, n_estimators=100, total=   0.8s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=10 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=10, total=   0.1s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=20 .................
[CV] .. max_depth=8, max_leaf_nodes=10, n_estimators=20, total=   0.2s
[CV] max_depth=8, max_leaf_nodes=10, n_estimators=20 .................
[CV] .

[CV] . max_depth=10, max_leaf_nodes=10, n_estimators=50, total=   0.5s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=50 ................
[CV] . max_depth=10, max_leaf_nodes=10, n_estimators=50, total=   0.4s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   0.9s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   0.9s
[CV] max_depth=10, max_leaf_nodes=10, n_estimators=100 ...............
[CV]  max_depth=10, max_leaf_nodes=10, n_estimators=100, total=   0.9s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] . max_depth=10, max_leaf_nodes=20, n_estimators=10, total=   0.1s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] . max_depth=10, max_leaf_nodes=20, n_estimators=10, total=   0.1s
[CV] max_depth=10, max_leaf_nodes=20, n_estimators=10 ................
[CV] .

[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:  1.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### View model results

In [8]:
# Report the winning hyperparameters and the score of the refit model on both
# splits; similar train and test scores suggest the model is not overfitting.
print(
    f"Best parameters: {grid.best_params_}",
    f"Train set score: {grid.score(X_train, y_train)}",
    f"Test set score: {grid.score(X_test, y_test)}",
    sep="\n",
)

Best parameters: {'max_depth': 8, 'max_leaf_nodes': None, 'n_estimators': 20}
Train set score: 0.7983954638677766
Test set score: 0.7979081466468821


### Save the model for hosting

In [9]:
import pickle

# Serialize the whole fitted GridSearchCV object (search results included),
# not only the winning estimator, to a file in the working directory.
with open('RF_model.pkl', mode='wb') as model_file:
    pickle.dump(grid, model_file)

### Load saved model

In [10]:
# Read the pickled grid-search object back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
# NOTE(review): this rebinds `model`, which previously held the unfitted
# RandomForestClassifier - re-running earlier cells afterwards will see this object.
with open('RF_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

### Test saved model

In [11]:
# Re-score the unpickled model on the held-out test set; the value should match
# the test score reported before saving, confirming the pickle round-trip.
print(f"Test set score: {model.score(X_test, y_test)}")

Test set score: 0.7979081466468821


In [12]:
# Display the reloaded object's repr to confirm it is the full GridSearchCV wrapper.
model

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### Inspect cross-validation results

In [13]:
# cv_results_ is a dict of per-candidate arrays recorded by the grid search
# (e.g. mean/std fit times); a random forest has no coefficients to inspect.
model.cv_results_

{'mean_fit_time': array([0.234833  , 0.43265375, 0.94742099, 1.82001567, 0.08517941,
        0.15204668, 0.36908825, 0.68033298, 0.09050409, 0.17028999,
        0.40310232, 0.77824759, 0.10242025, 0.19110703, 0.44750309,
        0.87100299, 0.11021638, 0.21293155, 0.48872105, 0.94914865,
        0.08043392, 0.14639433, 0.349739  , 0.68269126, 0.089269  ,
        0.16552949, 0.4136281 , 0.76870529, 0.09495322, 0.18108455,
        0.43757129, 0.84862924, 0.1222473 , 0.23340408, 0.55490406,
        1.08202275, 0.08104396, 0.14652626, 0.34389424, 0.68016283,
        0.09181182, 0.16847714, 0.3902932 , 0.77401694, 0.09633867,
        0.17905362, 0.4465371 , 1.02866244, 0.15200297, 0.31561526,
        0.80390509, 1.44104139, 0.08781044, 0.17259129, 0.40938663,
        0.77518654, 0.10107239, 0.17188597, 0.40044856, 0.788112  ,
        0.09440509, 0.18473522, 0.43913802, 0.86175625]),
 'std_fit_time': array([0.01714722, 0.01844728, 0.01335119, 0.00054674, 0.00662808,
        0.00471152, 0.015