In [1]:
# package imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# import pickle file
#df_patients = pd.read_pickle('../data/interim/patients_df.pickle')
df_appointments = pd.read_pickle('../data/interim/appointments_df.pickle')
df_clean = pd.read_pickle('../data/interim/clean_df.pickle')

In [3]:
df_clean.set_index('Appointment_ID', inplace=True)

In [4]:
df_model = df_appointments.join(df_clean, rsuffix='_clean')

In [5]:
drop_columns = ['Patient_ID_clean',
                'Gender_clean',
                'Scheduled_Date_clean',
                'Appointment_Date_clean',
                'SMS_sent_clean',
                'No_show_clean',
                'Neighborhood']
df_model.drop(columns=drop_columns, inplace=True)

In [6]:
df_model['days_diff'] = df_model.date_diff.dt.days

In [7]:
df_model = df_model.join(pd.get_dummies(df_model.Appointment_Date.dt.dayofweek))

In [8]:
df_model.columns = ['Patient_ID',
                    'Gender',
                    'Scheduled_Date',
                    'Appointment_Date',
                    'SMS_sent',
                    'No_show',
                    'date_diff',
                    'Age',
                    'Welfare',
                    'Hypertension',
                    'Diabetes',
                    'Alcoholism',
                    'Disability',
                    'days_diff',
                    'Mon',
                    'Tue',
                    'Wed',
                    'Thu',
                    'Fri',
                    'Sat']

In [9]:
df_model.columns

Index(['Patient_ID', 'Gender', 'Scheduled_Date', 'Appointment_Date',
       'SMS_sent', 'No_show', 'date_diff', 'Age', 'Welfare', 'Hypertension',
       'Diabetes', 'Alcoholism', 'Disability', 'days_diff', 'Mon', 'Tue',
       'Wed', 'Thu', 'Fri', 'Sat'],
      dtype='object')

In [10]:
unneeded_columns = ['Patient_ID',
                    'Scheduled_Date',
                    'Appointment_Date',
                    'No_show',
                    'date_diff',
                    'Gender']

In [11]:
# Need to produce arrays for the features and the response variable
y = df_model['No_show'].values
x = df_model.drop(columns=unneeded_columns).values


# Logistic Regression

In [None]:
# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state=42)

# Create the classifier: logreg
logreg = LogisticRegression()

# Fit the classifier to the training data
logreg.fit(x_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg.predict(x_test)

# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
    print(df_model.drop(columns=unneeded_columns).columns.values[i])
    print(logreg.coef_[i])
 

In [None]:
tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()

# Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [19]:
# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state=42)


rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
rf.fit(x_train, y_train)

predicted = rf.predict(x_test)
accuracy = accuracy_score(y_test, predicted)

print(rf.oob_score)
print(accuracy)



True
0.7601728076722988


In [22]:
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[31619  3750]
 [ 6853  1989]]
             precision    recall  f1-score   support

         No       0.82      0.89      0.86     35369
        Yes       0.35      0.22      0.27      8842

avg / total       0.73      0.76      0.74     44211



In [27]:
1989/(6853+1989)

0.22494910653698258

# Random Forest/Grid Search

Source: https://www.fabienplisson.com/random-forest-and-grid-search/

In [15]:
# Part 1

seed = 123
 
# RFC with fixed hyperparameters max_depth, max_features and min_samples_leaf
rfc = RandomForestClassifier(n_jobs=-1, oob_score = True, max_depth=10, max_features='sqrt', min_samples_leaf = 1) 

# Range of `n_estimators` values to explore.
n_estim = filter(lambda x: x % 2 == 0, list(range(10,100)))

cv_scores = []

for i in n_estim:
    rfc.set_params(n_estimators=i)
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    scores = model_selection.cross_val_score(rfc, x_train, y_train, cv=kfold, scoring='accuracy')
    cv_scores.append(scores.mean()*100)
    
optimal_n_estim = n_estim[cv_scores.index(max(cv_scores))]
print("The optimal number of estimators is %d with %0.1f%%" % (optimal_n_estim, cv_scores[optimal_n_estim]))

plt.plot(n_estim, cv_scores)
plt.xlabel('Number of Estimators')
plt.ylabel('Train Accuracy')
plt.show()

NameError: name 'model_selection' is not defined

In [31]:
# Part 2

from sklearn.grid_search import GridSearchCV

rfc = RandomForestClassifier(n_jobs=-1) 

# Use a grid over parameters of interest
param_grid = { 
           "n_estimators" : [9, 25, 63],
           "max_depth" : [1, 10, 30],
           "min_samples_leaf" : [1, 5, 10]}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 2)
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_)

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 63}


In [37]:
cv_predicted = CV_rfc.predict(x_test)

In [39]:
print(confusion_matrix(y_test, cv_predicted))
print(classification_report(y_test, cv_predicted))

[[35350    19]
 [ 8798    44]]
             precision    recall  f1-score   support

         No       0.80      1.00      0.89     35369
        Yes       0.70      0.00      0.01      8842

avg / total       0.78      0.80      0.71     44211



In [41]:
importances = rf.feature_importances_

In [48]:
len(importances)

14

In [51]:
df_model

Unnamed: 0_level_0,Patient_ID,Gender,Scheduled_Date,Appointment_Date,SMS_sent,No_show,date_diff,Age,Welfare,Hypertension,Diabetes,Alcoholism,Disability,days_diff,Mon,Tue,Wed,Thu,Fri,Sat
Appointment_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5642903,2.987250e+13,F,2016-04-29 18:38:08,2016-04-29,0,No,0 days,62,0,1,0,0,0,0,0,0,0,0,1,0
5642503,5.589978e+14,M,2016-04-29 16:08:27,2016-04-29,0,No,0 days,56,0,0,0,0,0,0,0,0,0,0,1,0
5642549,4.262962e+12,F,2016-04-29 16:19:04,2016-04-29,0,No,0 days,62,0,0,0,0,0,0,0,0,0,0,1,0
5642828,8.679512e+11,F,2016-04-29 17:29:31,2016-04-29,0,No,0 days,8,0,0,0,0,0,0,0,0,0,0,1,0
5642494,8.841186e+12,F,2016-04-29 16:07:23,2016-04-29,0,No,0 days,56,0,1,1,0,0,0,0,0,0,0,1,0
5626772,9.598513e+13,F,2016-04-27 08:36:51,2016-04-29,0,No,2 days,76,0,1,0,0,0,2,0,0,0,0,1,0
5630279,7.336882e+14,F,2016-04-27 15:05:12,2016-04-29,0,Yes,2 days,23,0,0,0,0,0,2,0,0,0,0,1,0
5630575,3.449833e+12,F,2016-04-27 15:39:58,2016-04-29,0,Yes,2 days,39,0,0,0,0,0,2,0,0,0,0,1,0
5638447,5.639473e+13,F,2016-04-29 08:02:16,2016-04-29,0,No,0 days,21,0,0,0,0,0,0,0,0,0,0,1,0
5629123,7.812456e+13,F,2016-04-27 12:48:25,2016-04-29,0,No,2 days,19,0,0,0,0,0,2,0,0,0,0,1,0


In [53]:
from tabulate import tabulate

ModuleNotFoundError: No module named 'tabulate'

In [52]:
headers = ["name", "score"]
values = sorted(zip(df_model.columns, rf.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

NameError: name 'tabulate' is not defined

In [None]:
# Part 3

seed = 123

# Optimized RF classifier
rfc = RandomForestClassifier(n_estimators=36, max_depth=5, max_features='sqrt', min_samples_leaf = 4)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

# fit the model with training set
scores = model_selection.cross_val_score(rfc, x_train, y_train, cv=kfold, scoring='accuracy')
cv_scores.append(scores.mean()*100)
print("Train accuracy %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100))

# predict on testing set
preds = model_selection.cross_val_predict(rfc, x_test, y_test, cv=kfold)
cv_preds.append(metrics.accuracy_score(y_test, preds)*100)
print("Test accuracy %0.2f" % (100*metrics.accuracy_score(y_test, preds)))