Rework this into three parts:

1. Setup
2. Logistic Regression
3. [GridSearchCV](https://campus.datacamp.com/courses/supervised-learning-with-scikit-learn/fine-tuning-your-model?ex=10)

# Setup

1. Import packages
2. Import datasets (from pickle files)
3. Prepare dataframes/arrays

In [40]:
# package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load both interim dataframes from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
df_appointments, df_clean = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../data/interim/appointments_df.pickle',
                        '../data/interim/clean_df.pickle')
)

In [3]:
# Move the 'Appointment_ID' column into the row index of df_clean, so that the
# index-based join performed later lines rows up by appointment.
df_clean = df_clean.set_index('Appointment_ID')

In [4]:
# Left-join df_clean onto df_appointments by index. Column names that exist in
# both frames get a '_clean' suffix on the df_clean copy.
df_model = df_appointments.join(df_clean, how='left', rsuffix='_clean')

In [5]:
# Discard the '_clean' duplicates created by the join, together with
# 'Neighborhood'.
drop_columns = [
    'Patient_ID_clean', 'Gender_clean',
    'Scheduled_Date_clean', 'Appointment_Date_clean',
    'SMS_sent_clean', 'No_show_clean',
    'Neighborhood',
]
df_model = df_model.drop(columns=drop_columns)

In [6]:
# New column 'days_diff': the whole-day component of 'date_diff', i.e. the
# number of days between the scheduled date and the appointment date.
df_model['days_diff'] = df_model['date_diff'].dt.days

In [7]:
# Append one indicator column per day-of-week value that occurs in
# 'Appointment_Date' (pandas numbers weekdays from 0 = Monday to 6 = Sunday).
df_model = df_model.join(
    pd.get_dummies(df_model['Appointment_Date'].dt.dayofweek)
)

In [8]:
# Rename every column positionally. This relies on df_model having exactly
# these 20 columns in this order.
# NOTE(review): six weekday names implies the data has no Sunday
# appointments - confirm before reusing on other data.
df_model.columns = (
    # columns that existed in both source frames (the '_clean' copies were dropped)
    ['Patient_ID', 'Gender', 'Scheduled_Date', 'Appointment_Date', 'SMS_sent', 'No_show']
    # remaining columns that were already present before this notebook added any
    + ['date_diff', 'Age', 'Welfare', 'Hypertension', 'Diabetes', 'Alcoholism', 'Disability']
    # day count derived from 'date_diff'
    + ['days_diff']
    # day-of-week indicator columns (previously labelled by weekday number)
    + ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
)

In [9]:
# Columns excluded from the feature matrix: identifiers, raw date fields, the
# response itself ('No_show') and 'Gender'.
unneeded_columns = [
    'Patient_ID',
    'Scheduled_Date', 'Appointment_Date',
    'No_show',
    'date_diff',
    'Gender',
]

In [10]:
# Build the arrays handed to scikit-learn.
# X: every column of df_model except those listed in unneeded_columns.
X = df_model.drop(unneeded_columns, axis='columns').values
# y: the 'No_show' response column.
y = df_model['No_show'].values


# Logistic Regression

In [49]:
# Hold out 40% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Default-configured logistic regression, fitted on the training split
# (fit() returns the estimator itself).
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = logreg.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[35050   319]
 [ 8688   154]]
             precision    recall  f1-score   support

         No       0.80      0.99      0.89     35369
        Yes       0.33      0.02      0.03      8842

avg / total       0.71      0.80      0.72     44211



# Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Create training and test sets (40% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# 100-tree random forest; oob_score=True asks the forest to also compute an
# out-of-bag accuracy estimate while fitting.
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
rf.fit(X_train, y_train)

# Accuracy on the held-out test rows
predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)

# `oob_score_` (trailing underscore) is the fitted out-of-bag accuracy;
# `oob_score` without the underscore is only the constructor flag (True).
print(rf.oob_score_)
print(accuracy)



In [None]:
# Report for the random forest predictions: confusion matrix, then the
# per-class precision/recall/F1 table.
print(
    confusion_matrix(y_test, predicted),
    classification_report(y_test, predicted),
    sep='\n',
)

# Random Forest/Grid Search

Source: https://www.fabienplisson.com/random-forest-and-grid-search/

In [None]:
# Part 1: sweep `n_estimators` with the other hyperparameters held fixed and
# keep the value with the best 10-fold cross-validated accuracy.

# Only `train_test_split` is imported from this module at the top of the
# notebook, so bring the module itself into scope for KFold/cross_val_score.
from sklearn import model_selection

seed = 123

# RFC with fixed hyperparameters max_depth, max_features and min_samples_leaf
rfc = RandomForestClassifier(n_jobs=-1, oob_score=True, max_depth=10,
                             max_features='sqrt', min_samples_leaf=1)

# Range of `n_estimators` values to explore: the even numbers 10, 12, ..., 98.
# This must be a real list (not a lazy `filter` object) because it is looped
# over, indexed and plotted below.
n_estim = list(range(10, 100, 2))

# `shuffle=True` is required for `random_state` to have any effect; current
# scikit-learn raises an error for a seeded but unshuffled KFold. The splitter
# does not depend on the loop variable, so it is built once.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Mean CV accuracy (in percent), one entry per value in `n_estim`
cv_scores = []

for i in n_estim:
    rfc.set_params(n_estimators=i)
    scores = model_selection.cross_val_score(rfc, X_train, y_train, cv=kfold, scoring='accuracy')
    cv_scores.append(scores.mean()*100)

# Look the best score up by list position; the estimator count itself is not
# a valid index into `cv_scores`.
best_index = cv_scores.index(max(cv_scores))
optimal_n_estim = n_estim[best_index]
print("The optimal number of estimators is %d with %0.1f%%" % (optimal_n_estim, cv_scores[best_index]))

plt.plot(n_estim, cv_scores)
plt.xlabel('Number of Estimators')
plt.ylabel('Train Accuracy')
plt.show()

In [None]:
# Part 2: exhaustive grid search over three random forest hyperparameters.

# `sklearn.grid_search` was removed from scikit-learn; GridSearchCV lives in
# `sklearn.model_selection`, the module this notebook already imports from.
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(n_jobs=-1)

# Use a grid over parameters of interest (3 x 3 x 3 = 27 combinations)
param_grid = {
    "n_estimators": [9, 25, 63],
    "max_depth": [1, 10, 30],
    "min_samples_leaf": [1, 5, 10],
}

# 2-fold cross-validation for every combination, fitted on the training split
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=2)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

In [None]:
# Predict labels for the held-out test rows with the fitted grid search.
# GridSearchCV is constructed without overriding `refit`, so per the
# scikit-learn docs predict() uses the best parameter combination refit on the
# full training split.
cv_predicted = CV_rfc.predict(X_test)

In [None]:
# Report for the grid-searched forest: confusion matrix, then the per-class
# precision/recall/F1 table.
print(
    confusion_matrix(y_test, cv_predicted),
    classification_report(y_test, cv_predicted),
    sep='\n',
)

In [None]:
# Per-feature importance scores of `rf`, the 100-tree forest fitted in the
# "Random Forest" section.
# NOTE(review): this reads `rf`, not the grid-searched `CV_rfc` - confirm that
# is the intended model. The result is not used by any cell shown here.
importances = rf.feature_importances_

In [None]:
# Part 3: cross-validate one fixed random forest configuration on the training
# split, then score it on the held-out test split.

# Neither module is imported at the top of the notebook.
from sklearn import metrics, model_selection

seed = 123

# Optimized RF classifier
# NOTE(review): these values are hard-coded, not read from
# CV_rfc.best_params_ - confirm they are the intended configuration.
rfc = RandomForestClassifier(n_estimators=36, max_depth=5, max_features='sqrt', min_samples_leaf=4)

# `shuffle=True` is required for `random_state` to have any effect; current
# scikit-learn raises an error for a seeded but unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# 10-fold cross-validated accuracy on the training set. The result is kept in
# its own variable rather than appended to the `cv_scores` list of the
# n_estimators sweep, which would mix unrelated measurements and fail when
# that cell has not been run.
scores = model_selection.cross_val_score(rfc, X_train, y_train, cv=kfold, scoring='accuracy')
train_accuracy = scores.mean()*100
print("Train accuracy %0.2f (+/- %0.2f)" % (train_accuracy, scores.std()*100))

# Fit on the training set and predict the testing set. Cross-validating on
# (X_test, y_test) would train fresh models on test folds instead of
# evaluating the model learned from the training data.
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, preds)*100
print("Test accuracy %0.2f" % test_accuracy)