# 1. Prepare Data for Machine Learning

### 1.1 Import packages for modelling

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("white")

pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

### 1.2 Load dataset

In [2]:
# Load the appointments DataFrame from the pickle at this relative path.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
med = pd.read_pickle('Data_Sets/Medical_Appointments_2.pkl')

In [3]:
# Preview the first rows of the loaded frame.
med.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_Received,No_Show,Day_Of_Week,Waiting_Days
0,29872499824296,5642903,1,2016-04-29,2016-04-29,62,Jardim Da Penha,0,1,0,0,0,0,0,Friday,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,56,Jardim Da Penha,0,0,0,0,0,0,0,Friday,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,62,Mata Da Praia,0,0,0,0,0,0,0,Friday,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,8,Pontal De Camburi,0,0,0,0,0,0,0,Friday,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,56,Jardim Da Penha,0,1,1,0,0,0,0,Friday,0


### 1.3 Apply min-max-scaler to columns where numeric values exceed 1

    Convert Age values to floats between 0 and 1:

In [4]:
# Min-max scaler used to map a column onto the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()

# Age as a 2-D float array; the double brackets keep the column axis the scaler expects.
Age_floats = med[['Age']].astype(float).to_numpy()

# Fit the scaler on Age and transform it in one step.
Age_scaled = min_max_scaler.fit_transform(Age_floats)

In [5]:
# Age_scaled has a single column; flatten it and store it as the normalized Age feature.
med['Age_Normalized'] = Age_scaled.ravel()

In [6]:
# The raw Age column is superseded by Age_Normalized, so remove it.
med = med.drop(columns=['Age'])

    Convert Waiting Days values to floats between 0 and 1:

In [7]:
# Waiting_Days as a 2-D float array for the scaler.
Waiting_days_floats = med[['Waiting_Days']].astype(float).to_numpy()

# Refit the shared min-max scaler on Waiting_Days and scale it to [0, 1].
# This replaces the parameters the scaler previously learned from Age.
Waiting_days_scaled = min_max_scaler.fit_transform(Waiting_days_floats)

In [8]:
# Store the min-max *scaled* values. The raw float copy (Waiting_days_floats) is not
# normalized, so assigning it here would leave this feature on its original scale.
med['Waiting_Days_Normalized'] = Waiting_days_scaled

In [9]:
# The raw Waiting_Days column is superseded by Waiting_Days_Normalized, so remove it.
med = med.drop(columns=['Waiting_Days'])

In [10]:
# Check that Age / Waiting_Days were replaced by their *_Normalized counterparts.
med.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_Received,No_Show,Day_Of_Week,Age_Normalized,Waiting_Days_Normalized
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0,0,Friday,0.53913,0.0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0,0,Friday,0.486957,0.0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0,0,Friday,0.53913,0.0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0,0,Friday,0.069565,0.0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0,0,Friday,0.486957,0.0


### 1.4 Get dummies for remaining categorical values

In [11]:
# One-hot encode the remaining categorical columns. drop_first=True omits the first
# level of each column. "Neighbourhood" is excluded from the encoding.
categorical_columns = ["Handicap", "SMS_Received", "Day_Of_Week"]
med_ml = pd.get_dummies(med, columns=categorical_columns, drop_first=True)

med_ml.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Summarise column dtypes and non-null counts of the encoded frame.
med_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110521 entries, 0 to 110526
Data columns (total 23 columns):
Patient_ID                 110521 non-null int64
Appointment_ID             110521 non-null int64
Gender                     110521 non-null int32
Scheduled_Day              110521 non-null datetime64[ns]
Appointment_Day            110521 non-null datetime64[ns]
Neighbourhood              110521 non-null object
Scholarship                110521 non-null int64
Hypertension               110521 non-null int64
Diabetes                   110521 non-null int64
Alcoholism                 110521 non-null int64
No_Show                    110521 non-null int32
Age_Normalized             110521 non-null float64
Waiting_Days_Normalized    110521 non-null float64
Handicap_1                 110521 non-null uint8
Handicap_2                 110521 non-null uint8
Handicap_3                 110521 non-null uint8
Handicap_4                 110521 non-null uint8
SMS_Received_1             110521

In [13]:
# Persist the encoded frame as a pickle at this relative path.
med_ml.to_pickle('Data_Sets/Medical_Appointments_3.pkl')

### 1.5 Balance dataset
    The target variable in the dataset is clearly imbalanced: there are only 22314 occurrences (20.2%)
    where patients did not show up to their appointments and 88207 occurrences (79.8%) where they did.
    Hence, the dataset needs to be balanced.
   
I use **SMOTE (Synthetic Minority Over-sampling Technique)** to balance the data:

In [None]:
# Target (dependent variable): the No_Show flag, kept as a single-column DataFrame.
y = med_ml[['No_Show']]

# Predictors (independent variables): everything except the identifiers, the raw
# date columns, the target itself and the unencoded Neighbourhood column.
non_feature_columns = [
    'Patient_ID',
    'Appointment_ID',
    'Scheduled_Day',
    'Appointment_Day',
    'No_Show',
    'Neighbourhood',
]
X = med_ml.drop(columns=non_feature_columns)

In [None]:
# Class distribution of the target before any resampling.
med_ml['No_Show'].value_counts()

In [None]:
# Convert the target to an (n, 1) NumPy array for the resampler.
# Derived from med_ml rather than from y itself so the cell can be re-run safely:
# converting y in place would fail on a second run, when y is already an ndarray.
y = med_ml[['No_Show']].to_numpy()

In [None]:
# Oversample the minority class with SMOTE so both classes are equally represented.
# - Named `smote` rather than `sm`, which would shadow the `statsmodels.api as sm` import.
# - random_state reuses the seed used elsewhere in this notebook so the synthetic
#   samples are reproducible across runs.
# - fit_resample is the current imbalanced-learn API; the older fit_sample alias was
#   deprecated and later removed.
# NOTE(review): resampling happens before the train/test split, so the test set will
# contain synthetic rows, and rows interpolated from test-set neighbours can end up in
# the training set. Consider splitting first and applying SMOTE to the training part only.
smote = SMOTE(random_state=666)
X_resampled, y_resampled = smote.fit_resample(X, y.ravel())

In [None]:
# Report the shapes of the resampled predictors and target.
for resampled_name, resampled in (("X_resampled", X_resampled), ("y_resampled", y_resampled)):
    print(f"Number transactions {resampled_name} dataset: ", resampled.shape)

In [None]:
# Class labels and their counts after resampling.
np.unique(y_resampled, return_counts=True)

### 1.6 Split dataset into train and test

In [None]:
# Hold out 20% of the resampled data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.20,
    random_state=666,
)

In [None]:
# Report the shapes of the train and test partitions.
partitions = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for partition_name, partition in partitions:
    print(f"Number transactions {partition_name} dataset: ", partition.shape)

In [None]:
# Class labels and their counts in the test partition.
np.unique(y_test, return_counts=True)

# 2. Build Models

### 2.1 Random Forest Model

In [None]:
# Random forest with 100 trees and a fixed seed, trained on the training partition.
rf_estimator = RandomForestClassifier(n_estimators=100, random_state=666)
rf_model = rf_estimator.fit(X_train, y_train.ravel())

In [None]:
# Rank the predictors by the forest's feature importances, highest first.
# Feature names come from X (the predictor frame built before resampling): X_train is
# only a DataFrame when the resampler preserves the input type, otherwise it is a plain
# ndarray with no .columns attribute. Resampling and splitting only add or select rows,
# so the column order still matches X.
feature_importances = pd.DataFrame(
    rf_model.feature_importances_,
    index=X.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

feature_importances

In [None]:
# Mean accuracy of the random forest on the test partition.
rf_accuracy = rf_model.score(X_test, y_test)
print("Random Forest Accuracy Score: \n", rf_accuracy)

In [None]:
# Confusion matrix of the random forest (rows: true labels, columns: predictions).
rf_predictions = rf_model.predict(X_test)
print("Random Forest Confusion Matrix: \n", confusion_matrix(y_test, rf_predictions))

### 2.2 Bernoulli Naive Bayes Model

In [None]:
# Bernoulli Naive Bayes with default settings, trained on the training partition.
# NOTE(review): BernoulliNB binarizes every feature at its default threshold
# (binarize=0.0), so the continuous Age_Normalized / Waiting_Days_Normalized columns
# are reduced to a "> 0" indicator - confirm this is intended.
bnb_estimator = BernoulliNB()
bnb_model = bnb_estimator.fit(X_train, y_train.ravel())

In [None]:
# Mean accuracy of the Bernoulli Naive Bayes model on the test partition.
bnb_accuracy = bnb_model.score(X_test, y_test)
print("Bernoulli Naive Bayes Accuracy Score: ", bnb_accuracy)

In [None]:
# Confusion matrix of the Bernoulli Naive Bayes model (rows: true labels, columns: predictions).
bnb_predictions = bnb_model.predict(X_test)
print("Bernoulli Naive Bayes Confusion Matrix: \n", confusion_matrix(y_test, bnb_predictions))

### 2.3 Support Vector Machines Model

In [None]:
# Support vector classifier (default kernel, gamma='auto') trained on the training partition.
# NOTE(review): SVC fit time grows at least quadratically with the number of samples,
# so this cell may run for a long time on the resampled data.
svm_estimator = svm.SVC(gamma='auto')
svm_model = svm_estimator.fit(X_train, y_train.ravel())

In [None]:
# Mean accuracy of the support vector classifier on the test partition.
svm_accuracy = svm_model.score(X_test, y_test)
print("Support Vector Machines Accuracy Score: ", svm_accuracy)

In [None]:
# Confusion matrix of the support vector classifier (rows: true labels, columns: predictions).
svm_predictions = svm_model.predict(X_test)
print("Support Vector Machines Confusion Matrix: \n", confusion_matrix(y_test, svm_predictions))

### 2.4 Logistic Regression Model

In [None]:
# Logistic regression (lbfgs solver, up to 2000 iterations, fixed seed) trained on the
# training partition.
lg_estimator = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=666)
lg_model = lg_estimator.fit(X_train, y_train.ravel())

In [None]:
# Mean accuracy of the logistic regression model on the test partition.
lg_accuracy = lg_model.score(X_test, y_test)
print("Logistic Regression Accuracy Score: ", lg_accuracy)

In [None]:
# Confusion matrix of the logistic regression model (rows: true labels, columns: predictions).
lg_predictions = lg_model.predict(X_test)
print("Logistic Regression Confusion Matrix: \n", confusion_matrix(y_test, lg_predictions))