In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("white")

pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

In [2]:
# Load the appointments DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
med = pd.read_pickle(
    'Data_Sets/Medical_Appointments_3.pkl'
)

In [3]:
# Preview the first five rows of the loaded frame.
med.head(n=5)

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Count the rows in each No_Show class to check the class balance.
med['No_Show'].value_counts()

0    88207
1    22314
Name: No_Show, dtype: int64

### 1. Define features X and target y

In [5]:
# Target (dependent variable): the No_Show column, kept as a single-column
# DataFrame.
y = med.loc[:, ['No_Show']]

# Predictors (independent variables): every column of `med` except the
# identifier, date, neighbourhood, and target columns dropped here.
X = med.drop(
    columns=[
        'Patient_ID',
        'Appointment_ID',
        'Scheduled_Day',
        'Appointment_Day',
        'Neighbourhood',
        'No_Show',
    ]
)

In [6]:
# Convert the target to a NumPy array before splitting and fitting.
# np.asarray yields the same array as DataFrame.to_numpy() on the first run,
# and hands an existing ndarray back unchanged, so re-running this cell is a
# harmless no-op instead of raising (the unbound pd.DataFrame.to_numpy call
# failed when `y` was already an array).
y = np.asarray(y)

### 2. Apply train-test-split

In [7]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# shuffle reproducible across runs.
# NOTE(review): No_Show is imbalanced (88207 vs 22314 rows in the value counts
# above); consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,
    test_size=0.20,
)

### 8. Fit Random Forest Model

In [8]:
# Build a 100-tree random forest with a fixed seed so results are repeatable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=666,
)

# Fit on the training split; ravel() flattens the target array to 1-D before
# it is handed to fit().
rf_model_fitted = rf_model.fit(X_train, y_train.ravel())

### 9. Show Feature Importance of Random Forest Model

In [9]:
# Tabulate the fitted forest's importance score for each predictor column,
# sorted from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf_model_fitted.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

feature_importances

Unnamed: 0,importance
Age_Normalized,0.460231
Waiting_Days_Normalized,0.430119
SMS_Received_1,0.018629
Gender,0.017962
Hypertension,0.009724
Day_Of_Week_Tuesday,0.009225
Day_Of_Week_Wednesday,0.008617
Day_Of_Week_Monday,0.008227
Day_Of_Week_Thursday,0.008015
Scholarship,0.007783


### 10. Apply Trained Random Forest Model on Test
Evaluate the model's accuracy on the test set.

In [10]:
# Accuracy of the fitted forest on the held-out test split.
print(
    "Random Forest Accuracy Score: \n",
    rf_model_fitted.score(X_test, y_test),
)

Random Forest Accuracy Score: 
 0.7594209454874463


In [11]:
# Confusion matrix on the test split: true labels against the forest's
# predictions for the same rows.
print(
    "Random Forest Confusion Matrix: \n",
    confusion_matrix(y_test, rf_model_fitted.predict(X_test)),
)

Random Forest Confusion Matrix: 
 [[15733  1835]
 [ 3483  1054]]


### 11. Apply Trained Random Forest Model on Train

In [12]:
# Accuracy of the fitted forest on the same training split it was fitted on.
print(
    "Random Forest Accuracy Score: \n",
    rf_model_fitted.score(X_train, y_train),
)

Random Forest Accuracy Score: 
 0.9095525696706478


In [13]:
# Confusion matrix on the training split: true labels against the forest's
# predictions for the same rows.
print(
    "Random Forest Confusion Matrix: \n",
    confusion_matrix(y_train, rf_model_fitted.predict(X_train)),
)

Random Forest Confusion Matrix: 
 [[69025  1614]
 [ 6383 11394]]
