In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's plain "white" style for all subsequent plots.
sns.set_style("white")

# Raise pandas' display limits to 100 columns / 100 rows; larger frames are
# still shown in truncated form.
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

In [2]:
# Load the appointments table from a pickle file; the path is relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
med = pd.read_pickle('Data_Sets/Medical_Appointments_3.pkl')

In [3]:
# Peek at the first five rows to check the loaded columns.
med.head(n=5)

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


### next steps:

    Test set as new df = 20,000 observations, target split equally (manually).
    Rest = train set as new df.
    
 
    1. med: turn the index into a column using .reset_index() -> med_index
    2. create two subsets for a 50%/50% split of '0' and '1' -> med_index0, med_index1
        2.a convert med_index0 and med_index1 to lists of dicts, using
        df.to_dict(orient='records') -> med_index0_records, med_index1_records
        2.b create test sets -> using random.sample(records, k=20000//2) on each list
        -> test_set0 and test_set1
        2.c concat both test sets
        2.d convert test set to df = med_test
        2.e shuffle med_test
    3. med.drop(med_test["index"].to_list())
    4. rename med to train
        4.a drop index column of train
    5. apply SMOTE on train
    6. 5-fold cross-validation

In [4]:
# Class balance of the target in the full data set.
med["No_Show"].value_counts()

0    88207
1    22314
Name: No_Show, dtype: int64

In [5]:
# Keep the original row labels as an explicit 'index' column so the sampled
# test rows can later be dropped from `med` by label.
med_index = med.reset_index(drop=False)

In [6]:
# Confirm that the new 'index' column is present.
med_index.head(n=5)

Unnamed: 0,index,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Split the indexed frame by target value: 0 -> med_index0, 1 -> med_index1.
med_index0 = med_index.loc[med_index['No_Show'].eq(0)]

med_index1 = med_index.loc[med_index['No_Show'].eq(1)]

In [8]:
# Turn each per-class frame into a list of row dicts so that rows can be drawn
# with random.sample.
med_index0_records, med_index1_records = (
    frame.to_dict(orient='records') for frame in (med_index0, med_index1)
)

In [9]:
import random

# Draw 10,000 rows per class (without replacement) so the hold-out test set is
# exactly balanced. A dedicated, seeded generator makes the draw reproducible
# after a kernel restart; 666 matches the random_state used for the
# scikit-learn calls later in this notebook.
_test_sampler = random.Random(666)

test_set0 = _test_sampler.sample(med_index0_records, 10000)

test_set1 = _test_sampler.sample(med_index1_records, 10000)

In [10]:
# Rebuild a DataFrame from the sampled class-0 row dicts.
test0 = pd.DataFrame(test_set0)

In [11]:
# Rebuild a DataFrame from the sampled class-1 row dicts.
test1 = pd.DataFrame(test_set1)

In [12]:
# Stack the two per-class samples row-wise; each frame keeps its own row labels.
med_test = pd.concat([test0, test1], axis=0, ignore_index=False)

In [13]:
# Shuffle the stacked test rows so the two classes are interleaved.
# random_state pins the permutation so the cell gives the same order on every run.
test = med_test.sample(frac=1, random_state=666)

In [14]:
# Display the shuffled hold-out set (shown truncated because it exceeds the
# configured row limit).
test

Unnamed: 0,index,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
8757,266,11981726819522,5639237,0,2016-04-29,2016-04-29,Maria Ortiz,0,0,0,0,0,0.000000,0.0,0,0,0,0,0,0,0,0,0,0
5781,27428,316668471597,5490749,0,2016-03-18,2016-05-10,Santa Cecília,1,0,0,0,0,0.069565,53.0,0,0,0,0,1,0,0,0,1,0
9335,2640,967542775735683,5499133,0,2016-03-22,2016-04-29,Jesus De Nazareth,0,0,0,0,1,0.321739,38.0,0,0,0,0,0,0,0,0,0,0
5984,57643,88654928262999,5702208,0,2016-05-16,2016-05-20,Santo Antônio,0,1,1,0,1,0.608696,4.0,0,0,0,0,0,0,0,0,0,0
8658,94337,22281227833,5747022,1,2016-05-30,2016-06-03,Ilha Do Príncipe,0,0,0,0,1,0.295652,4.0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6121,2936,68652999496825,5586216,1,2016-04-15,2016-04-29,Andorinhas,1,0,0,0,0,0.069565,14.0,0,0,0,0,1,0,0,0,0,0
8931,15115,9362452265247,5635500,0,2016-04-28,2016-05-02,República,0,1,0,0,0,0.478261,4.0,0,0,0,0,0,1,0,0,0,0
4690,14151,19829615653388,5684702,0,2016-05-11,2016-05-31,Parque Moscoso,0,0,0,0,1,0.008696,20.0,0,0,0,0,1,0,0,0,1,0
5507,60913,99959985531137,5697607,0,2016-05-13,2016-05-16,Jucutuquara,0,0,0,0,1,0.086957,3.0,0,0,0,0,0,1,0,0,0,0


In [15]:
# Everything that was not sampled into the test set becomes training data:
# remove the test rows from `med` by their original row labels.
train = med.drop(index=test["index"].tolist())

In [16]:
# Renumber the remaining rows from 0; drop=True discards the old labels
# instead of keeping them as a column.
train = train.reset_index(drop=True)

In [17]:
# Display the training frame (shown truncated because it exceeds the
# configured row limit).
train

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.539130,0.0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.539130,0.0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90516,2572134369293,5651768,1,2016-05-03,2016-06-07,Maria Ortiz,0,0,0,0,0,0.486957,35.0,0,0,0,0,1,0,0,0,1,0
90517,3596266328735,5650093,1,2016-05-03,2016-06-07,Maria Ortiz,0,0,0,0,0,0.443478,35.0,0,0,0,0,1,0,0,0,1,0
90518,15576631729893,5630692,1,2016-04-27,2016-06-07,Maria Ortiz,0,0,0,0,0,0.182609,41.0,0,0,0,0,1,0,0,0,1,0
90519,92134931435557,5630323,1,2016-04-27,2016-06-07,Maria Ortiz,0,0,0,0,0,0.330435,41.0,0,0,0,0,1,0,0,0,1,0


In [18]:
# Class balance of the target in the training data after removing the test rows.
train["No_Show"].value_counts()

0    78207
1    12314
Name: No_Show, dtype: int64

In [19]:
# Training rows with No_Show == 1.
train1 = train.loc[train['No_Show'].eq(1)]

In [22]:
# Training rows with No_Show == 0.
train0 = train.loc[train['No_Show'].eq(0)]

In [24]:
# Undersample class 0 down to the size of class 1 so both classes are equally
# represented. The sample size is derived from train1 rather than hard-coded,
# and random_state makes the draw reproducible.
train0_reduced = train0.sample(n=len(train1), random_state=666)

In [25]:
# Stack the undersampled class-0 rows and all class-1 rows row-wise,
# keeping their existing row labels.
train_equally = pd.concat([train0_reduced, train1], axis=0, ignore_index=False)

In [26]:
# Shuffle the balanced training rows so the two classes are interleaved.
# random_state pins the permutation so the cell gives the same order on every run.
train_shuffled = train_equally.sample(frac=1, random_state=666)

In [27]:
# Display the balanced, shuffled training frame (shown truncated because it
# exceeds the configured row limit).
train_shuffled

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
46195,68354734136,5634112,0,2016-04-28,2016-05-05,Ilha Do Príncipe,0,0,0,0,1,0.208696,7.0,0,0,0,0,1,0,0,1,0,0
67337,69683178852498,5587283,0,2016-04-15,2016-05-05,Itararé,0,0,0,0,1,0.295652,20.0,0,0,0,0,1,0,0,1,0,0
74256,94518571429,5778419,1,2016-06-06,2016-06-06,Jardim Da Penha,0,0,0,0,0,0.382609,0.0,0,0,0,0,0,1,0,0,0,0
32090,99599829577539,5627346,0,2016-04-27,2016-05-09,Bento Ferreira,1,0,0,0,1,0.547826,12.0,0,0,0,0,0,1,0,0,0,0
74779,35879753119953,5613481,1,2016-04-25,2016-06-08,São Cristóvão,0,0,0,0,0,0.313043,44.0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46539,26334145388597,5645950,0,2016-05-02,2016-05-16,Santa Tereza,1,0,0,0,1,0.069565,14.0,0,0,0,0,0,1,0,0,0,0
83422,72481695298863,5778177,0,2016-06-06,2016-06-06,Resistência,0,1,0,0,0,0.756522,0.0,0,0,0,0,0,1,0,0,0,0
83971,9617345862543,5681537,1,2016-05-10,2016-06-07,República,0,0,0,0,1,0.226087,28.0,0,0,0,0,1,0,0,0,1,0
868,1278463322627,5641170,0,2016-04-29,2016-04-29,Parque Moscoso,0,0,0,0,0,0.217391,0.0,0,0,0,0,0,0,0,0,0,0


In [30]:
# Verify that both classes are now equally represented.
train_shuffled["No_Show"].value_counts()

1    12314
0    12314
Name: No_Show, dtype: int64

### 1. Define train X, y

In [28]:
# Target / response (dependent variable), kept as a one-column DataFrame.
y = train_shuffled.loc[:, ['No_Show']]

# Predictors (independent variables): every column except the identifiers,
# the two date columns, the neighbourhood name and the target itself.
X = train_shuffled.drop(
    columns=[
        'Patient_ID',
        'Appointment_ID',
        'Scheduled_Day',
        'Appointment_Day',
        'No_Show',
        'Neighbourhood',
    ]
)

In [29]:
# Convert the one-column target frame to a NumPy array of shape (n_rows, 1).
# np.asarray also accepts an existing ndarray, so re-running this cell is
# harmless (the unbound pd.DataFrame.to_numpy(y) call fails once y is already
# an array).
y = np.asarray(y)

### 2. Apply train-test-split

In [31]:
# Hold out 20% of the balanced training data as a validation split.
# Note: X_test / y_test are this split; the separately sampled hold-out set
# lives in `test`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

### 8. Fit Random Forest Model

In [32]:
# Random forest with 100 trees and a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=666)
# y_train has shape (n, 1); flatten it to 1-D before fitting.
# fit() returns the estimator itself, so both names refer to the same model.
rf_model_fitted = rf_model.fit(X_train, np.ravel(y_train))

### 9. Show Feature Importance of Random Forest Model

In [33]:
# One row per predictor with the forest's importance score, largest first.
feature_importances = (
    pd.DataFrame(
        {'importance': rf_model_fitted.feature_importances_},
        index=X_train.columns,
    )
    .sort_values(by='importance', ascending=False)
)

feature_importances

Unnamed: 0,importance
Age_Normalized,0.438623
Waiting_Days_Normalized,0.428046
Gender,0.022913
SMS_Received_1,0.020054
Day_Of_Week_Wednesday,0.0119
Hypertension,0.011643
Day_Of_Week_Tuesday,0.011047
Day_Of_Week_Monday,0.010506
Scholarship,0.010142
Day_Of_Week_Thursday,0.009867


### 10. Define test X, y

In [34]:
# Target / response (dependent variable) of the hold-out set, kept as a
# one-column DataFrame.
ytest = test.loc[:, ['No_Show']]

# Predictors (independent variables) of the hold-out set: the same columns are
# removed as for the training predictors, plus the 'index' column that
# reset_index added.
Xtest = test.drop(
    columns=[
        'Patient_ID',
        'Appointment_ID',
        'Scheduled_Day',
        'Appointment_Day',
        'No_Show',
        'Neighbourhood',
        'index',
    ]
)

### 11. Apply Trained Random Forest Model on Split Test
    to measure accuracy on the 20% split held out by train_test_split

In [39]:
# Mean accuracy on the 20% validation split.
split_accuracy = rf_model_fitted.score(X_test, y_test)
print("Random Forest Accuracy Score: \n", split_accuracy)

Random Forest Accuracy Score: 
 0.634997969955339


In [40]:
# Confusion matrix on the 20% validation split
# (rows: true class, columns: predicted class).
split_predictions = rf_model_fitted.predict(X_test)
split_confusion = confusion_matrix(y_test, split_predictions)
print("Random Forest Confusion Matrix: \n", split_confusion)

Random Forest Confusion Matrix: 
 [[1556  973]
 [ 825 1572]]


### 12. Apply Trained Random Forest Model on Final Test
    to measure accuracy on the manually sampled, class-balanced test set (20,000 rows)

In [35]:
# Mean accuracy on the manually sampled hold-out set.
final_accuracy = rf_model_fitted.score(Xtest, ytest)
print("Random Forest Accuracy Score: \n", final_accuracy)

Random Forest Accuracy Score: 
 0.63015


In [36]:
# Confusion matrix on the manually sampled hold-out set
# (rows: true class, columns: predicted class).
final_predictions = rf_model_fitted.predict(Xtest)
final_confusion = confusion_matrix(ytest, final_predictions)
print("Random Forest Confusion Matrix: \n", final_confusion)

Random Forest Confusion Matrix: 
 [[6168 3832]
 [3565 6435]]


### 13. Apply Trained Random Forest Model on Train

In [37]:
# Mean accuracy on the rows the model was fitted on; compare it with the
# validation and hold-out scores to judge overfitting.
train_accuracy = rf_model_fitted.score(X_train, y_train)
print("Random Forest Accuracy Score: \n", train_accuracy)

Random Forest Accuracy Score: 
 0.9164044259466044


In [38]:
# Confusion matrix on the rows the model was fitted on
# (rows: true class, columns: predicted class).
train_predictions = rf_model_fitted.predict(X_train)
train_confusion = confusion_matrix(y_train, train_predictions)
print("Random Forest Confusion Matrix: \n", train_confusion)

Random Forest Confusion Matrix: 
 [[9039  746]
 [ 901 9016]]
