Comparing features:
- features as is
- features including no-show rate, reschedule, follow-up

Comparing models for two versions:
- LR
- KNN
- SVC
- RFC

Comparing models with oversampling

In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import calendar
import pickle

%matplotlib inline
plt.style.use('seaborn-pastel')

import seaborn as sns
sns.set(style="whitegrid")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, roc_curve

%run evaluation_functions.py

In [212]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

In [213]:
# Load the three pickled appointment objects used by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('pickles/appt.pickle','rb') as read_file:
    appt = pickle.load(read_file)
with open('pickles/appt_v2.pickle','rb') as read_file:
    appt_v2 = pickle.load(read_file)
# Note the naming: the file appt_v2_all1.pickle is bound to appt_v3.
with open('pickles/appt_v2_all1.pickle','rb') as read_file:
    appt_v3 = pickle.load(read_file)

In [214]:
# Build one feature matrix per dataset version (a_Xv1, a_Xv2, a_Xv3).
# Each version follows the same steps: select columns, one-hot encode the
# categorical ones, drop one dummy column per encoded variable, record the
# resulting column names, and print a summary with .info().
# Note: feature_cols is rebound for each version, so after this cell it holds
# the v2/v3 column list.

# run for appt - all appts, basic features
feature_cols = ['Gender','Age','Scholarship','Hipertension','Diabetes','Alcoholism','Handcap','SMS_received',
                'ApptDayofWeek','SchedDayofWeek','Neighbourhood','DayDiff']
a_Xv1 = appt[feature_cols]
a_Xv1 = pd.get_dummies(a_Xv1, columns = ['Gender','Neighbourhood','ApptDayofWeek','SchedDayofWeek'])
# Drop one level of each encoded variable (presumably as the reference category).
a_Xv1.drop(columns=['Gender_F','Neighbourhood_VILA RUBIM','SchedDayofWeek_Monday','ApptDayofWeek_Monday'],inplace=True)
Xv1_cols = np.array(a_Xv1.columns)
a_Xv1.info()

## run for appt_v2 - second or more appts, adds historical features
feature_cols = ['Gender','Age2','Neighbourhood2','Scholarship','Hipertension','Diabetes','Alcoholism',
                'Handcap_bin','SMS_received','SchedDayofWeek','ApptDayofWeek','DayDiff2',
                'no_show_rate2','Reschedule','Follow_up']
a_Xv2 = appt_v2[feature_cols]
a_Xv2 = pd.get_dummies(a_Xv2, columns = ['Gender','Neighbourhood2','SchedDayofWeek','ApptDayofWeek'])
a_Xv2.drop(columns=['Gender_F','Neighbourhood2_OTHER','SchedDayofWeek_Monday','ApptDayofWeek_Monday'],inplace=True)
Xv2_cols = np.array(a_Xv2.columns)
a_Xv2.info()
# a_X.head(5)

## run for appt_v2_all - all appts, adds historical features (has N/As)
# Same column list and encoding as v2, applied to appt_v3.
feature_cols = ['Gender','Age2','Neighbourhood2','Scholarship','Hipertension','Diabetes','Alcoholism',
                'Handcap_bin','SMS_received','SchedDayofWeek','ApptDayofWeek','DayDiff2',
                'no_show_rate2','Reschedule','Follow_up']
a_Xv3 = appt_v3[feature_cols]
a_Xv3 = pd.get_dummies(a_Xv3, columns = ['Gender','Neighbourhood2','SchedDayofWeek','ApptDayofWeek'])
a_Xv3.drop(columns=['Gender_F','Neighbourhood2_OTHER','SchedDayofWeek_Monday','ApptDayofWeek_Monday'],inplace=True)
Xv3_cols = np.array(a_Xv3.columns)
a_Xv3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71163 entries, 0 to 71958
Data columns (total 97 columns):
Age                                          71163 non-null int64
Scholarship                                  71163 non-null int64
Hipertension                                 71163 non-null int64
Diabetes                                     71163 non-null int64
Alcoholism                                   71163 non-null int64
Handcap                                      71163 non-null int64
SMS_received                                 71163 non-null int64
DayDiff                                      71163 non-null int64
Gender_M                                     71163 non-null uint8
Neighbourhood_AEROPORTO                      71163 non-null uint8
Neighbourhood_ANDORINHAS                     71163 non-null uint8
Neighbourhood_ANTÔNIO HONÓRIO                71163 non-null uint8
Neighbourhood_ARIOVALDO FAVALESSA            71163 non-null uint8
Neighbourhood_BARRO VERMELHO     

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71163 entries, 65088 to 68006
Data columns (total 85 columns):
Age2                                  71163 non-null int64
Scholarship                           71163 non-null int64
Hipertension                          71163 non-null int64
Diabetes                              71163 non-null int64
Alcoholism                            71163 non-null int64
Handcap_bin                           71163 non-null int64
SMS_received                          71163 non-null int64
DayDiff2                              71163 non-null int64
no_show_rate2                         71163 non-null float64
Reschedule                            71163 non-null int64
Follow_up                             71163 non-null int64
Gender_M                              71163 non-null uint8
Neighbourhood2_ANDORINHAS             71163 non-null uint8
Neighbourhood2_BARRO VERMELHO         71163 non-null uint8
Neighbourhood2_BELA VISTA             71163 non-null uint8


In [215]:
# split into training and test sets
# Each version uses the 'No_show_num' column of its source frame as the target,
# holds out 25% for testing, stratifies on the target, and uses the same seed (10).

#appt
Xv1, yv1 = a_Xv1, appt['No_show_num']
Xv1_train, Xv1_test, yv1_train, yv1_test = train_test_split(Xv1, yv1, test_size = 0.25, random_state = 10, stratify = yv1)
#appt_v2
Xv2, yv2 = a_Xv2, appt_v2['No_show_num']
Xv2_train, Xv2_test, yv2_train, yv2_test = train_test_split(Xv2, yv2, test_size = 0.25, random_state = 10, stratify = yv2)
#appt_v3
Xv3, yv3 = a_Xv3, appt_v3['No_show_num']
Xv3_train, Xv3_test, yv3_train, yv3_test = train_test_split(Xv3, yv3, test_size = 0.25, random_state = 10, stratify = yv3)


In [216]:
# create standardized versions for LR, KNN, and SVC
#Standardize
def stan(X_train, X_test):
    """Standardize a train/test pair with a scaler fitted on the training data only.

    Returns a 2-tuple ``(scaled_train, scaled_test)``; both are transformed
    with the statistics learned from ``X_train``.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test

# Standardized copies of each split; these are the inputs passed to the LR and KNN fits below.
Xv1_train_std, Xv1_test_std = stan(Xv1_train,Xv1_test)
Xv2_train_std, Xv2_test_std = stan(Xv2_train,Xv2_test)
Xv3_train_std, Xv3_test_std = stan(Xv3_train,Xv3_test)

### Models
skipping Decision Trees and Naive Bayes
#### Logistic Regression

In [217]:
def fit_LR(X_train, y_train, X_test):
    """Fit a liblinear logistic regression and score the test features.

    Returns a 2-tuple: the predicted labels for ``X_test`` and column 1 of
    ``predict_proba`` for ``X_test`` (presumably the positive, no-show class).
    """
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(X_train, y_train)
    predicted_labels = classifier.predict(X_test)
    class_probabilities = classifier.predict_proba(X_test)
    return predicted_labels, class_probabilities[:, 1]

In [218]:
# Logistic regression on the standardized features of each dataset version;
# keeps both the predicted labels and the column-1 probabilities for the ROC section.
yv1_test_LR, yv1_test_LR_prob = fit_LR(Xv1_train_std,yv1_train,Xv1_test_std)
yv2_test_LR, yv2_test_LR_prob = fit_LR(Xv2_train_std,yv2_train,Xv2_test_std)
yv3_test_LR, yv3_test_LR_prob = fit_LR(Xv3_train_std,yv3_train,Xv3_test_std)

In [219]:
# LR on v1 features: test-set metrics (print_scores is presumably defined by the %run of evaluation_functions.py).
print_scores(yv1_test,yv1_test_LR)

Accuracy score: 0.7164858636389185
F1 score: 0.011368090944727556
Precision (true + / all predicted +): 0.4142857142857143
Recall (true + / all actual +): 0.005763116057233704


In [220]:
# LR on v2 features (adds historical features): test-set metrics.
print_scores(yv2_test,yv2_test_LR)

Accuracy score: 0.7353865493400377
F1 score: 0.2846219201359388
Precision (true + / all predicted +): 0.5866900175131349
Recall (true + / all actual +): 0.1878855860908581


In [221]:
# LR on v3 features: test-set metrics.
print_scores(yv3_test,yv3_test_LR)

Accuracy score: 0.7194086897869709
F1 score: 0.11926605504587158
Precision (true + / all predicted +): 0.5314465408805031
Recall (true + / all actual +): 0.06717011128775835


#### KNN

In [222]:
def fit_KNN(X_train,y_train,X_test,n_neighbors=30):
    """Fit a k-nearest-neighbours classifier and score the test features.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to predict on.
    n_neighbors : int, default 30
        Number of neighbours used by the classifier. The default keeps the
        value that was previously hard-coded, so existing calls are unchanged.

    Returns
    -------
    tuple
        ``(predicted_labels, column_1_of_predict_proba)`` for ``X_test``.
    """
    KNN = KNeighborsClassifier(n_neighbors=n_neighbors)
    KNN.fit(X_train,y_train)
    return KNN.predict(X_test), KNN.predict_proba(X_test)[:,1]

In [None]:
# KNN on the standardized features of each dataset version (labels and column-1 probabilities).
yv1_test_KNN, yv1_test_KNN_prob = fit_KNN(Xv1_train_std,yv1_train,Xv1_test_std)
yv2_test_KNN, yv2_test_KNN_prob = fit_KNN(Xv2_train_std,yv2_train,Xv2_test_std)
yv3_test_KNN, yv3_test_KNN_prob = fit_KNN(Xv3_train_std,yv3_train,Xv3_test_std)

In [None]:
# KNN on v1 features: report test-set metrics via print_scores.
print_scores(yv1_test,yv1_test_KNN)

In [None]:
# KNN on v2 features: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_KNN)

In [None]:
# KNN on v3 features: report test-set metrics via print_scores.
print_scores(yv3_test,yv3_test_KNN)

#### SVC

In [None]:
def fit_SVC(X_train, y_train, X_test):
    """Fit a default-configured SVC and return its predicted labels for X_test.

    Unlike the other fit_* helpers in this notebook, only the label
    predictions are returned (no probability column).
    """
    svm_classifier = SVC()
    svm_classifier.fit(X_train, y_train)
    predicted_labels = svm_classifier.predict(X_test)
    return predicted_labels

In [None]:
# takes about 10-15 minutes
# yv1_test_SVC = fit_SVC(Xv1_train_std,yv1_train,Xv1_test_std)
# yv2_test_SVC = fit_SVC(Xv2_train_std,yv2_train,Xv2_test_std)

In [None]:
# print_scores(yv1_test,yv1_test_SVC)

In [None]:
# print_scores(yv2_test,yv2_test_SVC)

#### Random Forests

In [None]:
def fit_RFC(X_train,y_train,X_test,n_estimators=100,random_state=88):
    """Fit a random forest classifier and score the test features.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to predict on.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 88
        Seed passed to the forest.

    Both defaults keep the values that were previously hard-coded, so
    existing three-argument calls behave exactly as before.

    Returns
    -------
    tuple
        ``(predicted_labels, column_1_of_predict_proba)`` for ``X_test``.
    """
    RFC = RandomForestClassifier(n_estimators=n_estimators,random_state=random_state)
    RFC.fit(X_train,y_train)
    return RFC.predict(X_test), RFC.predict_proba(X_test)[:,1]

In [None]:
# Random forest on the unscaled features of each dataset version (labels and column-1 probabilities).
yv1_test_RFC, yv1_test_RFC_prob = fit_RFC(Xv1_train,yv1_train,Xv1_test)
yv2_test_RFC, yv2_test_RFC_prob = fit_RFC(Xv2_train,yv2_train,Xv2_test)
yv3_test_RFC, yv3_test_RFC_prob = fit_RFC(Xv3_train,yv3_train,Xv3_test)
# yv3_test_RFC = fit_RFC(Xv3_train,yv3_train,Xv3_test)

In [None]:
# RFC on v1 features: report test-set metrics via print_scores.
print_scores(yv1_test,yv1_test_RFC)

In [None]:
# RFC on v2 features: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_RFC)

In [None]:
# RFC on v3 features: report test-set metrics via print_scores.
print_scores(yv3_test,yv3_test_RFC)

In [None]:
# add random forest with all data - handles NaNs

#### XGBoost

In [None]:
def fit_XGB(X_train, y_train, X_test):
    """Fit a default-configured XGBClassifier and score the test features.

    Returns a 2-tuple: the predicted labels for ``X_test`` and column 1 of
    ``predict_proba`` for ``X_test``.
    """
    booster = XGBClassifier()
    booster.fit(X_train, y_train)
    predicted_labels = booster.predict(X_test)
    class_probabilities = booster.predict_proba(X_test)
    return predicted_labels, class_probabilities[:, 1]

In [None]:
# XGBoost on the unscaled features of each dataset version (labels and column-1 probabilities).
yv1_test_XGB, yv1_test_XGB_prob = fit_XGB(Xv1_train,yv1_train,Xv1_test)
yv2_test_XGB, yv2_test_XGB_prob = fit_XGB(Xv2_train,yv2_train,Xv2_test)
yv3_test_XGB, yv3_test_XGB_prob = fit_XGB(Xv3_train,yv3_train,Xv3_test)

In [None]:
# XGB on v1 features: report test-set metrics via print_scores.
print_scores(yv1_test,yv1_test_XGB)

In [None]:
# XGB on v2 features: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_XGB)

In [None]:
# XGB on v3 features: report test-set metrics via print_scores.
print_scores(yv3_test,yv3_test_XGB)

### Feature Importance

In [None]:
def Sort_Tuple(tup):
    """Sort a list of pairs in place by their second element, largest first.

    The list passed in is reordered (descending on index 1) and that same
    list object is returned.
    """
    def second_item(entry):
        # Sort key: the value stored at position 1 of each entry.
        return entry[1]

    tup.sort(key=second_item, reverse=True)
    return tup

In [None]:
# fit_LR keeps its fitted model local, so no global LR exists yet at this point
# in a top-to-bottom run. Fit one here on the standardized v2 training data
# (the same inputs fit_LR was given for v2) so coef_ is available.
LR = LogisticRegression(solver='liblinear')
LR.fit(Xv2_train_std,yv2_train)

# Pair each v2 feature name with its coefficient and print them, largest value first.
a = list(zip(Xv2_cols,LR.coef_[0]))
for coef in Sort_Tuple(a):
    print(coef)

In [None]:
# RFC
# Refit a random forest on the unscaled v2 training data as a global so that
# feature_importances_ can be read in the next cell (fit_RFC keeps its model local).
RFC = RandomForestClassifier(n_estimators=100,random_state=88)
RFC.fit(Xv2_train,yv2_train)
# for i in range(len(Xv2_cols)):
#     print(Xv2_cols[i],":",RFC.feature_importances_[i])

In [None]:
# Pair each v2 feature name with its random-forest importance.
a = list(zip(Xv2_cols,RFC.feature_importances_))

# printing the sorted list of tuples (Sort_Tuple orders by the second element, largest first)
for coef in Sort_Tuple(a):
    print(coef)

### Oversampling - v2
Use RandomOverSampler after analysis from workbook 4

In [None]:
# Now add some random oversampling of the minority classes
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Only the training splits are resampled; the test splits are left untouched.
ros = RandomOverSampler(random_state=42)
# fit_resample is the supported imbalanced-learn method; fit_sample was a
# deprecated alias that newer releases no longer provide.
Xv2_resampled, yv2_resampled = ros.fit_resample(Xv2_train,yv2_train)
Xv3_resampled, yv3_resampled = ros.fit_resample(Xv3_train,yv3_train)

# Print both class counts: a bare expression in the middle of a cell is not
# displayed, so the v2 counts were previously never shown.
print('v2:', Counter(yv2_resampled))
print('v3:', Counter(yv3_resampled))

#### Logistic Regression

In [None]:
# Logistic regression trained on the oversampled v2 training data, scored on the v2 test split.
# NOTE(review): these inputs are the unscaled features, whereas the earlier
# fit_LR calls used the standardized *_std arrays — confirm this is intended.
LR = LogisticRegression(solver='liblinear')
LR.fit(Xv2_resampled,yv2_resampled)
yv2_test_LR_o = LR.predict(Xv2_test)
yv2_test_LR_o_prob = LR.predict_proba(Xv2_test)[:,1]

In [None]:
# Oversampled LR on the v2 test split: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_LR_o)

In [None]:
# Score the v3 test split with the LR that was fit on the oversampled v2 data above.
# NOTE(review): the model is not refit on Xv3_resampled here — confirm that
# applying the v2-trained model to v3 is intentional.
yv3_test_LR_o = LR.predict(Xv3_test)
yv3_test_LR_o_prob = LR.predict_proba(Xv3_test)[:,1]
print_scores(yv3_test,yv3_test_LR_o)

#### RFC

In [None]:
# Random forest trained on the oversampled v2 training data, scored on the v2 test split.
# This rebinds the global RFC used earlier for feature importances.
RFC = RandomForestClassifier(n_estimators=100,random_state=88)
RFC.fit(Xv2_resampled,yv2_resampled)
yv2_test_RFC_o = RFC.predict(Xv2_test)
yv2_test_RFC_o_prob = RFC.predict_proba(Xv2_test)[:,1]

In [None]:
# Oversampled RFC on the v2 test split: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_RFC_o)

In [None]:
# Score the v3 test split with the RFC that was fit on the oversampled v2 data above.
# NOTE(review): the model is not refit on Xv3_resampled here — confirm that
# applying the v2-trained model to v3 is intentional.
yv3_test_RFC_o = RFC.predict(Xv3_test)
yv3_test_RFC_o_prob = RFC.predict_proba(Xv3_test)[:,1]
print_scores(yv3_test,yv3_test_RFC_o)

#### XGBoost

In [None]:
# XGBoost trained on the oversampled v2 training data, scored on the v2 test split.
# The prediction lines were commented out, which left yv2_test_XGB_o undefined
# for the print_scores cell that follows.
# Both fit and predict inputs are converted with np.asarray so they have the
# same type whether the resampler returned arrays or DataFrames
# (presumably the reason the predictions had been disabled — xgboost checks feature names).
XGB = XGBClassifier()
XGB.fit(np.asarray(Xv2_resampled),yv2_resampled)
yv2_test_XGB_o = XGB.predict(np.asarray(Xv2_test))
yv2_test_XGB_o_prob = XGB.predict_proba(np.asarray(Xv2_test))[:,1]

In [None]:
# Oversampled XGB on the v2 test split: report test-set metrics via print_scores.
print_scores(yv2_test,yv2_test_XGB_o)

### Ensembling

In [None]:
# create voting classifier
# (name, estimator) pairs handed to VotingClassifier. Plain name lookups replace
# eval('LR') / eval('RFC'), which resolved to exactly these global objects.
model_list = [('LR', LR), ('RFC', RFC)]

def ensembling(X_train,y_train,X_test,y_test,vote_type,weights):
    """Fit a voting ensemble of the models in ``model_list`` and report its scores.

    Parameters
    ----------
    X_train, y_train : data the ensemble is fitted on.
    X_test, y_test : data the ensemble is evaluated on.
    vote_type : str
        Passed to VotingClassifier as ``voting`` (e.g. 'hard' or 'soft').
    weights : sequence or None
        One weight per estimator. Entries may be numbers or numeric strings
        (existing calls pass strings such as '0.5'); they are converted to
        float before being handed to VotingClassifier.

    Returns
    -------
    Whatever ``print_scores(y_test, predictions)`` returns.
    """
    # VotingClassifier expects numeric weights; coerce so string inputs keep working.
    numeric_weights = None if weights is None else [float(w) for w in weights]
    voting_classifier = VotingClassifier(estimators=model_list,
                                         voting=vote_type,
                                         weights=numeric_weights,
                                         n_jobs=-1)
    voting_classifier.fit(X_train, y_train)

    y_pred = voting_classifier.predict(X_test)
    return print_scores(y_test, y_pred)

In [None]:
# Equal-weight hard vote of LR and RFC on v2; weights are passed as numbers, which is what VotingClassifier expects.
ensembling(Xv2_resampled,yv2_resampled,Xv2_test,yv2_test,'hard',[0.5,0.5])

In [None]:
# ensembling('soft',['0.5','0.5'])

In [None]:
# LR-heavy hard vote on v2; weights are passed as numbers, which is what VotingClassifier expects.
ensembling(Xv2_resampled,yv2_resampled,Xv2_test,yv2_test,'hard',[0.9,0.1])
#ends up voting with LR

In [None]:
# ensembling('soft',['0.7','0.3'])

In [None]:
# Equal-weight hard vote of LR and RFC on v3; weights are passed as numbers, which is what VotingClassifier expects.
ensembling(Xv3_resampled,yv3_resampled,Xv3_test,yv3_test,'hard',[0.5,0.5])

In [None]:
# LR-heavy hard vote on v3; weights are passed as numbers, which is what VotingClassifier expects.
ensembling(Xv3_resampled,yv3_resampled,Xv3_test,yv3_test,'hard',[0.9,0.1])

### Visualization

In [None]:
# Compare ROCs for:
# v1 features, models without oversampling.
# Note: thresholds1 is reassigned by each of the first eight calls, so only the
# last value (XGB on v2) survives; the thresholds are not used below.
fpr1, tpr1, thresholds1 = roc_curve(yv1_test, yv1_test_LR_prob) # LR
fpr2, tpr2, thresholds1 = roc_curve(yv1_test, yv1_test_KNN_prob) # KNN
fpr3, tpr3, thresholds1 = roc_curve(yv1_test, yv1_test_RFC_prob) # RFC
fpr4, tpr4, thresholds1 = roc_curve(yv1_test, yv1_test_XGB_prob) # XGB

# v2 features, models without oversampling.
fpr5, tpr5, thresholds1 = roc_curve(yv2_test, yv2_test_LR_prob) # LR
fpr6, tpr6, thresholds1 = roc_curve(yv2_test, yv2_test_KNN_prob) # KNN
fpr7, tpr7, thresholds1 = roc_curve(yv2_test, yv2_test_RFC_prob) # RFC
fpr8, tpr8, thresholds1 = roc_curve(yv2_test, yv2_test_XGB_prob) # XGB

# v2 test split, models trained on oversampled v2 data.
fpr9, tpr9, thresholds9 = roc_curve(yv2_test, yv2_test_LR_o_prob) # LR
fpr10, tpr10, thresholds10 = roc_curve(yv2_test, yv2_test_RFC_o_prob) # RFC

# v3 test split, scored with the oversampled models.
fpr11, tpr11, thresholds11 = roc_curve(yv3_test, yv3_test_LR_o_prob) # LR
fpr12, tpr12, thresholds12 = roc_curve(yv3_test, yv3_test_RFC_o_prob) # RFC


# Only the oversampled curves (9-12) are drawn; the others are computed above but left commented out here.
plt.figure(figsize=(15,7))
# plt.plot(fpr1, tpr1,lw=2,label='LR')
# plt.plot(fpr2, tpr2,lw=2,label='KNN')
# plt.plot(fpr3, tpr3,lw=2,label='RFC')
# plt.plot(fpr4, tpr4,lw=2,label='XGB')

# plt.plot(fpr5, tpr5,lw=2,label='LR - v2')
# plt.plot(fpr6, tpr6,lw=2,label='KNN - v2')
# plt.plot(fpr7, tpr7,lw=2,label='RFC - v2')
# plt.plot(fpr8, tpr8,lw=2,label='XGB - v2')

plt.plot(fpr9, tpr9,lw=2,label='LR o - v2')
plt.plot(fpr10, tpr10,lw=2,label='RFC o - v2')

plt.plot(fpr11, tpr11,lw=2,label='LR o - v3')
plt.plot(fpr12, tpr12,lw=2,label='RFC o - v3')

# Diagonal reference line from (0,0) to (1,1).
plt.plot([0,1],[0,1],c='violet',ls='--')
plt.xlim([-0.05,1.05])
plt.ylim([-0.05,1.05])

plt.legend(loc='lower right')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# Trailing semicolon suppresses the text repr of the title object in the cell output.
plt.title('ROC curve for no-show problem');

In [None]:
# ROC AUC for every model/dataset combination, printed to three decimals.
# v1 features, no oversampling.
print(f'LR - v1: {roc_auc_score(yv1_test, yv1_test_LR_prob):.3f}') # LR
print(f'KNN - v1: {roc_auc_score(yv1_test, yv1_test_KNN_prob):.3f}') # KNN
print(f'RFC - v1: {roc_auc_score(yv1_test, yv1_test_RFC_prob):.3f}') # RFC
print(f'XGB - v1: {roc_auc_score(yv1_test, yv1_test_XGB_prob):.3f}') # XGB

# v2 features, no oversampling.
print(f'LR - v2: {roc_auc_score(yv2_test, yv2_test_LR_prob):.3f}') # LR
print(f'KNN - v2: {roc_auc_score(yv2_test, yv2_test_KNN_prob):.3f}') # KNN
print(f'RFC - v2: {roc_auc_score(yv2_test, yv2_test_RFC_prob):.3f}') # RFC
print(f'XGB - v2: {roc_auc_score(yv2_test, yv2_test_XGB_prob):.3f}') # XGB

# Oversampled models on the v2 test split.
print(f'LR o - v2: {roc_auc_score(yv2_test, yv2_test_LR_o_prob):.3f}') # LR
print(f'RFC o - v2: {roc_auc_score(yv2_test, yv2_test_RFC_o_prob):.3f}') # RFC

# Oversampled models on the v3 test split.
print(f'LR o - v3: {roc_auc_score(yv3_test, yv3_test_LR_o_prob):.3f}') # LR
print(f'RFC o - v3: {roc_auc_score(yv3_test, yv3_test_RFC_o_prob):.3f}') # RFC