# Random Forest Classification Model - Medical No Shows

### Import Dependencies

In [1]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

### Load Dataset for modelling

In [2]:
data = pd.read_csv('../data/cleanData/appointment_dataset.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (110521, 29)


Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,gender_yn,time_between_sch_appt,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,...,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received,no_show_yn
0,5698125,678814354693913,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,5698246,54593736353128,0,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,5699393,4369164743113,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,5694371,54523365344664,0,1,3,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5698279,62917816238835,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


In [3]:
#check for nulls and verify that attributes for modelling are the correct datatypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110521 entries, 0 to 110520
Data columns (total 29 columns):
appointment_id                   110521 non-null int64
patient_id                       110521 non-null int64
repeat_patient_yn                110521 non-null int64
gender_yn                        110521 non-null int64
time_between_sch_appt            110521 non-null int64
same_day_appt_yn                 110521 non-null int64
within_week_appt_yn              110521 non-null int64
advanced_appt_yn                 110521 non-null int64
monday_yn                        110521 non-null int64
tuesday_yn                       110521 non-null int64
wednesday_yn                     110521 non-null int64
thursday_yn                      110521 non-null int64
friday_yn                        110521 non-null int64
saturday_yn                      110521 non-null int64
neighborhood_income_lower_yn     110521 non-null int64
neighborhood_income_middle_yn    110521 non-null int64
neigborho

### Drop irrelevent columns

In [4]:
clean_data = data.drop(['appointment_id', 'patient_id', 'time_between_sch_appt'], axis=1)

print(f"Shape of the data is: {clean_data.shape}")

Shape of the data is: (110521, 26)


### Pull our target column from the data and create a list of our outcome values

In [5]:
target = clean_data['no_show_yn']

### Drop the target column from our data 

In [6]:
data = clean_data.drop('no_show_yn', axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,repeat_patient_yn,gender_yn,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,wednesday_yn,thursday_yn,friday_yn,...,child_yn,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received
0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,1,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0,1,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


### Split the data into training and test sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=3)

### Create a Random Forest Classifier and fit the training data and score with the test data

In [8]:
rf = RandomForestClassifier(n_estimators=50)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7913575332054577

### View features by importance

In [10]:
results = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
results

[(0.20195459329848134, 'same_day_appt_yn'),
 (0.1311171011358645, 'advanced_appt_yn'),
 (0.07056535006071728, 'gender_yn'),
 (0.06769128646841704, 'sms_received'),
 (0.061944649011758106, 'repeat_patient_yn'),
 (0.05630286212839865, 'within_week_appt_yn'),
 (0.04268243065627403, 'hypertension'),
 (0.03842180518975638, 'welfare_assistance'),
 (0.03759428881558282, 'diabetes'),
 (0.02989153428383116, 'alcoholism'),
 (0.029113008798507523, 'handicap_yn'),
 (0.023752407316737934, 'senior_yn'),
 (0.020494967164569604, 'wednesday_yn'),
 (0.02034501892733706, 'tuesday_yn'),
 (0.019922505959960746, 'monday_yn'),
 (0.018854774385365064, 'friday_yn'),
 (0.018379758263525562, 'neighborhood_income_lower_yn'),
 (0.018213621815009504, 'day_after_holiday_yn'),
 (0.01819619063086221, 'thursday_yn'),
 (0.017054464687709478, 'neigborhood_income_higher_yn'),
 (0.01477867508253331, 'young_adult_yn'),
 (0.014334796352542579, 'neighborhood_income_middle_yn'),
 (0.014329759443920054, 'adult_yn'),
 (0.0133044

In [12]:
RFresults_df = pd.DataFrame(results, columns=['feature_importances','feature_names'])
RFresults_df = RFresults_df.sort_values(by='feature_importances', ascending=False)
RFresults_df

Unnamed: 0,feature_importances,feature_names
0,0.201955,same_day_appt_yn
1,0.131117,advanced_appt_yn
2,0.070565,gender_yn
3,0.067691,sms_received
4,0.061945,repeat_patient_yn
5,0.056303,within_week_appt_yn
6,0.042682,hypertension
7,0.038422,welfare_assistance
8,0.037594,diabetes
9,0.029892,alcoholism


### Export model results to csv

In [13]:
RFresults_df.to_csv('../data/cleanData/RFresults_df.csv', index = False, header=True)