In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Use seaborn's plain "white" style for every figure drawn in this notebook.
sns.set_style(style="white")

# Let pandas render up to 100 columns and 100 rows before it truncates a displayed frame.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

In [2]:
# Load the appointments table that every later cell works on.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
source_pickle = "med_k_fold.pkl"
med = pd.read_pickle(source_pickle)

In [3]:
# Preview the first five rows to check the columns that were loaded.
med.head(n=5)

Unnamed: 0,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


### next steps:

    Test set: a new DataFrame of 20,000 observations, with the target classes split equally (sampled manually).
    Train set: a new DataFrame holding all remaining observations.
    
 
    1. med: turn the index into a column using .reset_index() -> med_index
    2. create two subsets, one per target class ('0' and '1'), for the 50%/50% split -> med_index0, med_index1
        2.a convert med_index0 and med_index1 to lists of dicts using df.to_dict(orient='records')
        -> med_index0_records, med_index1_records
        2.b create the test set using random.sample(records, k=10000) on each list (10000 = 20000/2)
        -> test_set0 and test_set1
        2.c concatenate both test sets
        2.d convert the test set to a DataFrame -> med_test
        2.e shuffle med_test
    3. remove the test rows from med: med.drop(med_test["index"].to_list())
    4. rename the remaining rows of med to train
        4.a drop the index column of train
    5. apply SMOTE to train
    6. 5-fold cross-validation

In [4]:
# Class balance of the target column before any splitting or resampling.
med["No_Show"].value_counts()

0    88207
1    22314
Name: No_Show, dtype: int64

In [5]:
# Copy the row labels into a regular "index" column so that sampled rows can later
# be traced back to (and dropped from) `med`.
med_index = med.reset_index(drop=False)

In [6]:
# Confirm that the new "index" column is present.
med_index.head(n=5)

Unnamed: 0,index,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
0,0,29872499824296,5642903,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
1,1,558997776694438,5642503,0,2016-04-29,2016-04-29,Jardim Da Penha,0,0,0,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0
2,2,4262962299951,5642549,1,2016-04-29,2016-04-29,Mata Da Praia,0,0,0,0,0,0.53913,0.0,0,0,0,0,0,0,0,0,0,0
3,3,867951213174,5642828,1,2016-04-29,2016-04-29,Pontal De Camburi,0,0,0,0,0,0.069565,0.0,0,0,0,0,0,0,0,0,0,0
4,4,8841186448183,5642494,1,2016-04-29,2016-04-29,Jardim Da Penha,0,1,1,0,0,0.486957,0.0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Split the rows by target class so that each class can be sampled separately.
is_class0 = med_index["No_Show"] == 0
is_class1 = med_index["No_Show"] == 1

med_index0 = med_index.loc[is_class0]
med_index1 = med_index.loc[is_class1]

In [8]:
# Turn each class subset into a list with one dict per row (column name -> value),
# the form that random.sample can draw from.
med_index0_records = med_index0.to_dict("records")
med_index1_records = med_index1.to_dict("records")

In [9]:
import random

# Draw the hold-out rows from a dedicated, seeded generator so that the train/test
# split -- and every number computed from it further down -- is identical after
# Restart & Run All. Without a seed the split changes on every run.
TEST_SPLIT_SEED = 42
TEST_ROWS_PER_CLASS = 10000  # 20,000-row test set, half from each target class

split_rng = random.Random(TEST_SPLIT_SEED)

# Sample without replacement, the same number of rows from each class.
test_set0 = split_rng.sample(med_index0_records, TEST_ROWS_PER_CLASS)
test_set1 = split_rng.sample(med_index1_records, TEST_ROWS_PER_CLASS)

In [10]:
# Rebuild a DataFrame from the sampled class-0 row dicts.
test0 = pd.DataFrame(test_set0)

In [11]:
# Rebuild a DataFrame from the sampled class-1 row dicts.
test1 = pd.DataFrame(test_set1)

In [12]:
# Stack the two per-class samples into one test frame. Each frame keeps its own
# row labels, so labels repeat across the two halves.
per_class_samples = [test0, test1]
med_test = pd.concat(per_class_samples, axis=0)

In [13]:
# Shuffle the test rows so the two classes are interleaved instead of stacked.
# frac=1 returns every row in random order; the fixed random_state keeps that
# order the same on every run.
test = med_test.sample(frac=1, random_state=42)

In [14]:
# Display the shuffled hold-out set (pandas truncates the rendering of long frames).
# The "index" column holds each row's original label in `med`.
test

Unnamed: 0,index,Patient_ID,Appointment_ID,Gender,Scheduled_Day,Appointment_Day,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,No_Show,Age_Normalized,Waiting_Days_Normalized,Handicap_1,Handicap_2,Handicap_3,Handicap_4,SMS_Received_1,Day_Of_Week_Monday,Day_Of_Week_Saturday,Day_Of_Week_Thursday,Day_Of_Week_Tuesday,Day_Of_Week_Wednesday
7412,80242,4341456552971,5603043,1,2016-04-19,2016-05-05,Piedade,0,0,0,0,1,0.000000,16.0,0,0,0,0,1,0,0,1,0,0
1515,107308,65889193283777,5539692,1,2016-04-01,2016-06-07,Fradinhos,0,0,0,0,1,0.173913,67.0,0,0,0,0,1,0,0,0,1,0
588,5639,3542394547493,5662358,1,2016-05-05,2016-05-10,Forte São João,0,0,0,0,1,0.295652,5.0,0,0,0,0,1,0,0,0,1,0
142,60258,65595654463159,5586747,1,2016-04-15,2016-05-13,Fonte Grande,0,0,0,0,0,0.069565,28.0,0,0,0,0,0,0,0,0,0,0
2145,62211,82236556986853,5636384,1,2016-04-28,2016-05-25,Maria Ortiz,1,0,0,0,0,0.573913,27.0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5141,68799,779318614218216,5613498,1,2016-04-25,2016-05-03,Tabuazeiro,1,0,0,0,0,0.330435,8.0,0,0,0,0,1,0,0,0,1,0
1677,34074,56863938424923,5618694,1,2016-04-26,2016-05-20,Gurigica,0,1,1,0,1,0.591304,24.0,0,0,0,0,0,0,0,0,0,0
6055,59998,71694358855728,5620966,0,2016-04-26,2016-05-16,Fonte Grande,0,0,0,0,1,0.026087,20.0,0,0,0,0,0,1,0,0,0,0
7417,11767,72488133273339,5722055,1,2016-05-20,2016-05-24,Santo André,0,0,0,0,0,0.313043,4.0,0,0,0,0,1,0,0,0,1,0


In [17]:
# The training data is everything in `med` that was not sampled into the test set;
# the test frame's "index" column carries the row labels to remove.
held_out_labels = test["index"].to_list()
train = med.drop(index=held_out_labels)

In [20]:
# Renumber the remaining rows consecutively from 0; drop=True discards the old
# labels instead of keeping them as a column.
train = train.reset_index(drop=True)

### 1. Define train X, y

In [21]:
# Target / response (dependent variable), kept as a one-column DataFrame.
target_columns = ['No_Show']
y = train[target_columns]

# Predictors (independent variables): everything except the target and the
# columns listed here.
excluded_columns = [
    'Patient_ID',
    'Appointment_ID',
    'Scheduled_Day',
    'Appointment_Day',
    'No_Show',
    'Neighbourhood',
]
X = train.drop(columns=excluded_columns)

In [22]:
# Class balance of the training data after the test rows were removed.
train["No_Show"].value_counts()

0    78207
1    12314
Name: No_Show, dtype: int64

In [23]:
# Convert the one-column target frame to a NumPy array.
y = y.to_numpy()

### 2. Apply SMOTE on train

In [24]:
# Oversample the training data with SMOTE to balance the target classes.
# - The sampler is named `smote`, not `sm`, so the `statsmodels.api as sm` alias
#   imported at the top of the notebook is no longer shadowed.
# - `fit_resample` is the current method name; `fit_sample` is a legacy alias that
#   newer imbalanced-learn releases no longer provide.
# - A fixed random_state makes the generated samples reproducible.
smote = SMOTE(random_state=42)

# y is a column array at this point; ravel() flattens it to the 1-D shape the sampler expects.
X_resampled, y_resampled = smote.fit_resample(X, y.ravel())

In [25]:
# Report the shapes of the resampled training predictors and target.
shape_report = (
    ("Number transactions X_resampled dataset: ", X_resampled.shape),
    ("Number transactions y_resampled dataset: ", y_resampled.shape),
)
for report_label, report_shape in shape_report:
    print(report_label, report_shape)

Number transactions X_resampled dataset:  (156414, 17)
Number transactions y_resampled dataset:  (156414,)


In [28]:
# Class labels in the resampled target together with how often each one occurs.
np.unique(y_resampled, return_counts=True)

(array([0, 1]), array([78207, 78207], dtype=int64))

### 3. Apply KFold on train X

In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [33]:
# Splitter for 5-fold cross-validation of the training data. n_splits is spelled
# out because the library default is not the same in every scikit-learn release.
# With shuffling left off, the folds are consecutive slices of the rows.
kf = KFold(n_splits=5)
print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [34]:
# Print the row positions of X that fall into the training and validation part
# of each fold. The loop variables are deliberately not called `train` / `test`:
# those names hold the training DataFrame and the held-out test set, and
# rebinding them here would replace both with index arrays.
for fold_train_idx, fold_val_idx in kf.split(X):
    print("%s %s" % (fold_train_idx, fold_val_idx))

[18105 18106 18107 ... 90518 90519 90520] [    0     1     2 ... 18102 18103 18104]
[    0     1     2 ... 90518 90519 90520] [18105 18106 18107 ... 36206 36207 36208]
[    0     1     2 ... 90518 90519 90520] [36209 36210 36211 ... 54310 54311 54312]
[    0     1     2 ... 90518 90519 90520] [54313 54314 54315 ... 72414 72415 72416]
[    0     1     2 ... 72414 72415 72416] [72417 72418 72419 ... 90518 90519 90520]
