In [24]:
import pandas as pd

import os
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.model_selection import train_test_split


In [13]:
path = "../dataset_diabetes/diabetic_data.csv"

diabetes = pd.read_csv(path, header = 0)
diabetes

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),?,1,1,7,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [9]:
race_count = diabetes.race.value_counts()

In [24]:
race_tab = pd.crosstab(index = diabetes['race'], columns = 'count')
race_tab['ratio'] = race_tab/race_tab.sum()
race_tab

col_0,count,ratio
race,Unnamed: 1_level_1,Unnamed: 2_level_1
?,2273,0.022336
AfricanAmerican,19210,0.188766
Asian,641,0.006299
Caucasian,76099,0.747784
Hispanic,2037,0.020017
Other,1506,0.014799


In [26]:
diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [64]:
diabetes.A1Cresult.value_counts()

None    84748
>8       8216
Norm     4990
>7       3812
Name: A1Cresult, dtype: int64

In [30]:
diabetes['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [47]:
readmit_race = pd.crosstab(index = diabetes['readmitted'], columns = diabetes['race'], margins = True)
readmit_race.columns = ['Unknown','African American','Asian','Caucasian', 'Hispanic','Other','rowtotal']
readmit_race.index = ['<30','>30', 'NO', 'coltotal']
readmit_race
readmit_race/readmit_race.loc['coltotal']
#Caucasian are mostly likely to readmitted 


Unnamed: 0,Unknown,African American,Asian,Caucasian,Hispanic,Other,rowtotal
<30,0.08271,0.112181,0.101404,0.112906,0.104075,0.096282,0.111599
>30,0.236692,0.345341,0.25117,0.35643,0.315169,0.296149,0.349282
NO,0.680598,0.542478,0.647426,0.530664,0.580756,0.60757,0.539119
coltotal,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
gender_tab = pd.crosstab(index = diabetes['gender'], columns = 'count')
gender_tab


col_0,count
gender,Unnamed: 1_level_1
Female,54708
Male,47055
Unknown/Invalid,3


## Classifiers 

In [121]:
diabetes.dtypes
#need to clean up more data 

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [152]:
## Drop unnecessary variables 
df = pd.DataFrame(diabetes)
#type(df)
#df_data = df.drop(['encounter_id', 'patient_nbr','weight'], axis = 1)
#df = df.drop(['examide', 'citoglipton'])
#drop_elements = ['examide','citoglipton']
#print(df.shape)


In [148]:
drop_elements = ['examide','citoglipton']

df_clean = df.drop(drop_elements,axis=1)

In [168]:
df_clean = df_clean.replace('?', np.nan)

In [169]:
df_clean.isnull().values.any()

True

In [166]:
dummies = pd.get_dummies(df_clean)
dummies.head(3)

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,1,1,0,0,0,1
1,1,1,7,3,59,0,18,0,0,0,...,0,1,0,1,0,0,1,0,1,0
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,1,0,1,0,0,1


In [202]:
#df_clean.columns[1]
for i in df_clean.columns:
    #print(i)
    if (len(set(df_clean[i])) > 2):
        print(i)
    else:
        


race
gender
age
admission_type_id
discharge_disposition_id
admission_source_id
time_in_hospital
payer_code
medical_specialty
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
diag_1
diag_2
diag_3
number_diagnoses
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
glipizide
glyburide
pioglitazone
rosiglitazone
acarbose
miglitol
tolazamide
insulin
glyburide-metformin
readmitted


In [222]:
df_numeric = df_clean.loc[:,['admission_source_id','discharge_disposition_id','admission_type_id','time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses']]

In [234]:
df_numeric.index[0]

0

In [223]:
df_nonnum = df_clean.drop(df_numeric,axis=1)

In [224]:
df_nonnum.dtypes

race                        object
gender                      object
age                         object
payer_code                  object
medical_specialty           object
diag_1                      object
diag_2                      object
diag_3                      object
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone               object
acarbose                    object
miglitol                    object
troglitazone                object
tolazamide                  object
insulin                     object
glyburide-metformin         object
glipizide-metformin         object
glimepiride-pioglita

In [185]:
for col in df_clean:
    print ("the unique values for %s is %s"%(df_clean[col].unique()))

the unique values for Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object') is ['Caucasian' 'AfricanAmerican' nan 'Other' 'Asian' 'Hispanic']
the unique values for Index(['race', 'gender', 'age', '

In [63]:
#Split dataset to training and test dataset 

y = df.readmitted
df.shape
#diabetes = numpy.random.rand(100,5)

(101766, 50)

In [31]:
#x_train, x_test, y_train, y_test = train_test_split(df, y, test_size = 0.2)
#print (x_train.shape, y_train.shape)
#print(x_test.shape, y_test.shape)


(81412, 50) (81412,)
(20354, 50) (20354,)


In [64]:
train, test = train_test_split(df, test_size = 0.2)
print(train.shape)
print(test.shape)

(81412, 50)
(20354, 50)


A class helps to extend some code/program for creating objects(variables) as well as to implement function and methods specific to that class

The section below writes a class SklearnHealper that allows one to extend the inbuilt methods(such as train, perdict and fit) common to all the Sklearn classifiers. So we don't need to write same methods five times

In [65]:
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [38]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)



In [113]:
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
for i, (train_index, test_index) in enumerate(kf):
    x_tr = x_train[train_index]
    y_tr = y_train[train_index]
    x_te = x_train[test_index]
    


False

In [88]:
#et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) 

## Generate base first-level models 
1. Random Forest classifier 
2. Extra trees classifier 
3. AdaBoost classifier 
4. Gradient Boosting classifier 
5. Support Vector machine 

In [66]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [67]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)


In [72]:
y_train = train['readmitted']#.ravel() #the .ravel() makes y_train an array
x_train = train.values
x_test = test.values 
## x_test and x_train are arrays 

In [75]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = = get_oof(et, x_train, y_train, x_test) # Extra Trees
#rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
#ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
#gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

ValueError: could not convert string to float: 'NO'