# 1. Prepare Data for Machine Learning

### 1.1 Import packages for modelling

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

sns.set_style("white")

pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

### 1.2 Load dataset

In [2]:
med = pd.read_pickle("med_ml.pkl")

In [3]:
med.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_Received,No_Show,Day_Of_Week,Waiting_Days
0,29872499824296,5642903,1,2016-04-29,2016-04-29,62,Jardim Da Penha,0,1,0,0,0,0,0,Friday,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,56,Jardim Da Penha,0,0,0,0,0,0,0,Friday,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,62,Mata Da Praia,0,0,0,0,0,0,0,Friday,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,8,Pontal De Camburi,0,0,0,0,0,0,0,Friday,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,56,Jardim Da Penha,0,1,1,0,0,0,0,Friday,0


### 1.3 Apply min-max-scaler to columns where numeric values exceed 1

    Convert Age values to floats between 0 and 1:

In [4]:
# Create Age_floats to store age values as floats
Age_floats = med[['Age']].values.astype(float)

# Create min & max processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
Age_scaled = min_max_scaler.fit_transform(Age_floats)

In [5]:
med['Age_Normalized'] = Age_scaled

In [6]:
med.drop(['Age'], axis=1, inplace=True)

    Convert Waiting Days values to floats between 0 and 1:

In [7]:
Waiting_days_floats = med[['Waiting_Days']].values.astype(float)

# Create min & max processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
Waiting_days_scaled = min_max_scaler.fit_transform(Waiting_days_floats)

In [8]:
med['Waiting_Days_Normalized'] = Waiting_days_floats

In [9]:
med.drop(['Waiting_Days'], axis=1, inplace=True)

In [10]:
med.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_Received,No_Show,Day_Of_Week,Age_Normalized,Waiting_Days_Normalized
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0,0,Friday,0.53913,0.0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0,0,Friday,0.486957,0.0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0,0,Friday,0.53913,0.0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0,0,Friday,0.069565,0.0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0,0,Friday,0.486957,0.0


### 1.4 Get dummies for remaining categorical values

In [11]:
med_ml = pd.get_dummies(med,  columns=["Neighbourhood", "Handicap", "SMS_Received", "Day_Of_Week"], drop_first=True)
med_ml.head()

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Neighbourhood_Andorinhas,Neighbourhood_Antônio Honório,Neighbourhood_Ariovaldo Favalessa,Neighbourhood_Barro Vermelho,Neighbourhood_Bela Vista,Neighbourhood_Bento Ferreira,Neighbourhood_Boa Vista,Neighbourhood_Bonfim,Neighbourhood_Caratoíra,Neighbourhood_Centro,Neighbourhood_Comdusa,Neighbourhood_Conquista,Neighbourhood_Consolação,Neighbourhood_Cruzamento,Neighbourhood_Da Penha,Neighbourhood_De Lourdes,Neighbourhood_Do Cabral,Neighbourhood_Do Moscoso,Neighbourhood_Do Quadro,Neighbourhood_Enseada Do Suá,Neighbourhood_Estrelinha,Neighbourhood_Fonte Grande,Neighbourhood_Forte São João,Neighbourhood_Fradinhos,Neighbourhood_Goiabeiras,Neighbourhood_Grande Vitória,Neighbourhood_Gurigica,Neighbourhood_Horto,Neighbourhood_Ilha Das Caieiras,Neighbourhood_Ilha De Santa Maria,Neighbourhood_Ilha Do Boi,Neighbourhood_Ilha Do Frade,Neighbourhood_Ilha Do Príncipe,Neighbourhood_Ilhas Oceânicas De Trindade,Neighbourhood_Inhanguetá,Neighbourhood_Itararé,Neighbourhood_Jabour,Neighbourhood_Jardim Camburi,...,Neighbourhood_Joana D´Arc,Neighbourhood_Jucutuquara,Neighbourhood_Maria Ortiz,Neighbourhood_Maruípe,Neighbourhood_Mata Da Praia,Neighbourhood_Monte Belo,Neighbourhood_Morada De Camburi,Neighbourhood_Mário Cypreste,Neighbourhood_Nazareth,Neighbourhood_Nova Palestina,Neighbourhood_Parque Industrial,Neighbourhood_Parque Moscoso,Neighbourhood_Piedade,Neighbourhood_Pontal De Camburi,Neighbourhood_Praia Do Canto,Neighbourhood_Praia Do Suá,Neighbourhood_Redenção,Neighbourhood_República,Neighbourhood_Resistência,Neighbourhood_Romão,Neighbourhood_Santa Cecília,Neighbourhood_Santa Clara,Neighbourhood_Santa Helena,Neighbourhood_Santa Luíza,Neighbourhood_Santa Lúcia,Neighbourhood_Santa Martha,Neighbourhood_Santa Tereza,Neighbourhood_Santo André,Neighbourhood_Santo Antônio,Neighbourhood_Santos Dumont,Neighbourhood_Santos Reis,Neighbourhood_Segurança Do Lar,Neighbourhood_Solon Borges,Neighbourhood_São Benedito,Neighbourhood_São Cristóvão,Neighbourhood_São José,Neighbourhood_São Pedro,Neighbourhood_Tabuazeiro,Neighbourhood_Universitário,Neighbourhood_Vila Rubim,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
med_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110521 entries, 0 to 110526
Columns: 102 entries, Patient_ID to Day_Of_Week_Wednesday
dtypes: datetime64[ns](2), float64(2), int32(2), int64(6), uint8(90)
memory usage: 19.6 MB


### 1.5 Split dataset into train and test

In [13]:
y = med_ml[['No_Show']]#target / response as dependent variable

X = med_ml.drop(['Patient_ID',
                 'Appointment_ID',
                 'Scheduled_Day', 
                 'Appointment_Day',
                 'No_Show'], axis=1) #predictors as independent variables

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=666)

# 2. Build Models

### 2.1 Random Forest

In [15]:
rf_model = RandomForestClassifier(random_state=666, n_estimators=100)

rf_model.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=666,
                       verbose=0, warm_start=False)

In [16]:
rf_model.score(X_test, y_test)

0.7776068762723366

In [17]:
print("Model Feature Importances:\n")

for name, importance in zip(X.columns, np.sort(rf_model.feature_importances_)[::-1]):
    print(name,": {:.2f}".format(importance))

Model Feature Importances:

Gender : 0.33
Scholarship : 0.28
Hypertension : 0.03
Diabetes : 0.02
Alcoholism : 0.02
Age_Normalized : 0.02
Waiting_Days_Normalized : 0.02
Neighbourhood_Andorinhas : 0.02
Neighbourhood_Antônio Honório : 0.01
Neighbourhood_Ariovaldo Favalessa : 0.01
Neighbourhood_Barro Vermelho : 0.01
Neighbourhood_Bela Vista : 0.01
Neighbourhood_Bento Ferreira : 0.01
Neighbourhood_Boa Vista : 0.01
Neighbourhood_Bonfim : 0.01
Neighbourhood_Caratoíra : 0.01
Neighbourhood_Centro : 0.01
Neighbourhood_Comdusa : 0.01
Neighbourhood_Conquista : 0.01
Neighbourhood_Consolação : 0.01
Neighbourhood_Cruzamento : 0.01
Neighbourhood_Da Penha : 0.00
Neighbourhood_De Lourdes : 0.00
Neighbourhood_Do Cabral : 0.00
Neighbourhood_Do Moscoso : 0.00
Neighbourhood_Do Quadro : 0.00
Neighbourhood_Enseada Do Suá : 0.00
Neighbourhood_Estrelinha : 0.00
Neighbourhood_Fonte Grande : 0.00
Neighbourhood_Forte São João : 0.00
Neighbourhood_Fradinhos : 0.00
Neighbourhood_Goiabeiras : 0.00
Neighbourhood_Grand

### 2.2 Multinominal Naive Bayes

In [18]:
nb_model =  MultinomialNB().fit(X_train, y_train.values.ravel())

In [19]:
nb_model.score(X_test, y_test)

0.7399230943225514

### 2.3 Support Vector Machines

In [None]:
svm_model = svm.SVC(gamma='auto').fit(X_train, y_train.values.ravel())

In [None]:
svm_model.score(X_test, y_test) 