# Random Forest Classification Model - Medical No Shows

### Import Dependencies

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

### Load Dataset for modelling

In [2]:
data = pd.read_csv('../data/cleanData/appointment_dataset_postconsult.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (93899, 29)


Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,gender_yn,time_between_sch_appt,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,...,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received,no_show_yn
0,5698125,679000000000000.0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,5698246,54600000000000.0,0,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,5699393,4370000000000.0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,5694371,54500000000000.0,0,1,3,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5698279,62900000000000.0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


In [3]:
#check for nulls and verify that attributes for modelling are the correct datatypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93899 entries, 0 to 93898
Data columns (total 29 columns):
appointment_id                   93899 non-null int64
patient_id                       93899 non-null float64
repeat_patient_yn                93899 non-null int64
gender_yn                        93899 non-null int64
time_between_sch_appt            93899 non-null int64
same_day_appt_yn                 93899 non-null int64
within_week_appt_yn              93899 non-null int64
advanced_appt_yn                 93899 non-null int64
monday_yn                        93899 non-null int64
tuesday_yn                       93899 non-null int64
wednesday_yn                     93899 non-null int64
thursday_yn                      93899 non-null int64
friday_yn                        93899 non-null int64
saturday_yn                      93899 non-null int64
neighborhood_income_lower_yn     93899 non-null int64
neighborhood_income_middle_yn    93899 non-null int64
neigborhood_income_higher

### Split dataset for training vs testing

In [4]:
# store the target variable in y and everything else goes in X
y = data['no_show_yn']

# drop irrelevent columns 
X = data.drop(['appointment_id', 'patient_id', 'time_between_sch_appt', 'no_show_yn'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

X_train.shape

(70424, 25)

### Create the classifier model and the parameter grid for GridSearch

In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

param_grid = {'penalty': ['l1','l2'], 
              'C': [0.001, 0.01,0.1,1,10,100, 1000], 
             'random_state': [0, 1]}

### Initialize the GridSearch to tune hyperparameters

In [6]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

### Execute model tuning

In [7]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.001, penalty=l1, random_state=0 .............................
[CV] .............. C=0.001, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.001, penalty=l1, random_state=0 .............................
[CV] .............. C=0.001, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.001, penalty=l1, random_state=0 .............................
[CV] .............. C=0.001, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.001, penalty=l1, random_state=1 .............................
[CV] .............. C=0.001, penalty=l1, random_state=1, total=   0.0s
[CV] C=0.001, penalty=l1, random_state=1 .............................
[CV] .............. C=0.001, penalty=l1, random_state=1, total=   0.0s
[CV] C=0.001, penalty=l1, random_state=1 .............................
[CV] .............. C=0.001, penalty=l1, random_state=1, total=   0.0s
[CV] C=0.001, penalty=l2, random_state=0 .............................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] .............. C=0.001, penalty=l2, random_state=0, total=   0.1s
[CV] C=0.001, penalty=l2, random_state=1 .............................
[CV] .............. C=0.001, penalty=l2, random_state=1, total=   0.1s
[CV] C=0.001, penalty=l2, random_state=1 .............................
[CV] .............. C=0.001, penalty=l2, random_state=1, total=   0.1s
[CV] C=0.001, penalty=l2, random_state=1 .............................
[CV] .............. C=0.001, penalty=l2, random_state=1, total=   0.1s
[CV] C=0.01, penalty=l1, random_state=0 ..............................
[CV] ............... C=0.01, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.01, penalty=l1, random_state=0 ..............................
[CV] ............... C=0.01, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.01, penalty=l1, random_state=0 ..............................
[CV] ............... C=0.01, penalty=l1, random_state=0, total=   0.0s
[CV] C=0.01, penalty=l1, random_state=1 ..............................
[CV] .

[CV] ................ C=100, penalty=l2, random_state=0, total=   0.1s
[CV] C=100, penalty=l2, random_state=0 ...............................
[CV] ................ C=100, penalty=l2, random_state=0, total=   0.1s
[CV] C=100, penalty=l2, random_state=0 ...............................
[CV] ................ C=100, penalty=l2, random_state=0, total=   0.1s
[CV] C=100, penalty=l2, random_state=1 ...............................
[CV] ................ C=100, penalty=l2, random_state=1, total=   0.1s
[CV] C=100, penalty=l2, random_state=1 ...............................
[CV] ................ C=100, penalty=l2, random_state=1, total=   0.1s
[CV] C=100, penalty=l2, random_state=1 ...............................
[CV] ................ C=100, penalty=l2, random_state=1, total=   0.1s
[CV] C=1000, penalty=l1, random_state=0 ..............................
[CV] ............... C=1000, penalty=l1, random_state=0, total=   0.0s
[CV] C=1000, penalty=l1, random_state=0 ..............................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    4.3s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2'], 'random_state': [0, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

### View model results

In [8]:
print(f"Best parameters: {grid.best_params_}")
print(f"Train set score: {grid.score(X_train, y_train)}")
print(f"Test set score: {grid.score(X_test, y_test)}")

Best parameters: {'C': 0.01, 'penalty': 'l2', 'random_state': 0}
Train set score: 0.8138276723844144
Test set score: 0.8175079872204473


## Save the model for hosting

In [9]:
import pickle

with open('LR_model_postconsult.pkl','wb') as f:
    pickle.dump(grid, f)

### Load saved model

In [10]:
with open('LR_model_postconsult.pkl', 'rb') as f:
    model = pickle.load(f)

### Test saved model

In [11]:
print(f"Test set score: {model.score(X_test, y_test)}")

Test set score: 0.8175079872204473


In [12]:
model

GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2'], 'random_state': [0, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

### Check out model coefficients

In [13]:
model.cv_results_

{'mean_fit_time': array([0.00657272, 0.00447297, 0.05543407, 0.0540386 , 0.00445143,
        0.00388463, 0.08248568, 0.07816776, 0.00416994, 0.00327094,
        0.11290534, 0.10788457, 0.00403476, 0.00345508, 0.15891012,
        0.15366594, 0.00351866, 0.00332443, 0.09044949, 0.08421787,
        0.00347296, 0.00323574, 0.08438977, 0.08435766, 0.00371011,
        0.00319314, 0.08264764, 0.08297563]),
 'std_fit_time': array([2.67099172e-03, 8.20236586e-04, 7.30691149e-04, 9.33001110e-04,
        4.39478088e-04, 3.58229308e-04, 2.27781303e-03, 4.88049097e-03,
        5.84977592e-04, 2.39151253e-05, 8.83460175e-03, 1.20371887e-02,
        4.68492124e-04, 5.55797750e-05, 1.32477719e-02, 8.21424814e-03,
        2.83265376e-04, 1.17871460e-04, 1.71921808e-02, 1.52770961e-02,
        2.41170760e-04, 2.45098732e-05, 1.49371051e-02, 1.53574455e-02,
        2.97426219e-04, 8.55556248e-05, 1.49280754e-02, 1.49256698e-02]),
 'mean_score_time': array([0.        , 0.        , 0.00292516, 0.00274364, 