In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score

In [2]:
df = pd.read_csv('/Users/jesskim/Downloads/diabetic_data.csv')

In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
#first, replace '?' with NaN
df = df.replace('?', np.nan)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
#According to Table1 column 'weight', 'payer_code' and 'Medical Specialty' has missing value exceeding 50% of the data
#remove, 
df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1, inplace = True)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
#check if gender has any other values.
df.gender.unique()

array(['Female', 'Male', 'Unknown/Invalid'], dtype=object)

In [7]:
#removing unkown/invalid gender from the dataset
df = df[df.gender != 'Unknown/Invalid']
df.gender.unique()

array(['Female', 'Male'], dtype=object)

In [8]:
#drop the ID columns which will have no affect on the model.
df.drop(['encounter_id','patient_nbr','admission_type_id','discharge_disposition_id',
         'admission_source_id'],axis=1,inplace=True)

In [9]:
df.dropna(inplace = True)


In [10]:
df.head()

Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,Female,[10-20),3,59,0,18,0,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),2,11,5,13,2,0,1,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),2,44,1,16,0,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,51,0,8,0,0,0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,Caucasian,Male,[50-60),3,31,6,16,0,0,0,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [11]:
diag_cols = ['diag_1','diag_2','diag_3']
for col in diag_cols:
    df[col] = df[col].str.replace('E','-')
    df[col] = df[col].str.replace('V','-')
    condition = df[col].str.contains('250')
    df.loc[condition,col] = '250'

df[diag_cols] = df[diag_cols].astype(float)  

In [12]:
# diagnosis grouping
for col in diag_cols:
    df['temp']=np.nan
    
    condition = df[col]==250
    df.loc[condition,'temp']='Diabetes'
    
    condition = (df[col]>=390) & (df[col]<=458) | (df[col]==785)
    df.loc[condition,'temp']='Circulatory'
    
    condition = (df[col]>=460) & (df[col]<=519) | (df[col]==786)
    df.loc[condition,'temp']='Respiratory'
    
    condition = (df[col]>=520) & (df[col]<=579) | (df[col]==787)
    df.loc[condition,'temp']='Digestive'
    
    condition = (df[col]>=580) & (df[col]<=629) | (df[col]==788)
    df.loc[condition,'temp']='Genitourinary'
    
    condition = (df[col]>=800) & (df[col]<=999)
    df.loc[condition,'temp']='Injury'
    
    condition = (df[col]>=710) & (df[col]<=739)
    df.loc[condition,'temp']='Muscoloskeletal'
    
    condition = (df[col]>=140) & (df[col]<=239)
    df.loc[condition,'temp']='Neoplasms'
    
    condition = df[col]==0
    df.loc[condition,col]='?'
    df['temp']=df['temp'].fillna('Others')
    condition = df['temp']=='0'
    df.loc[condition,'temp']=np.nan
    df[col]=df['temp']
    df.drop('temp',axis=1,inplace=True)

df.dropna(inplace=True)


In [13]:
df['age'] = df['age'].str[1:].str.split('-',expand=True)[0]
df['age'] = df['age'].astype(int)
df.age.head()

1    10
2    20
3    30
4    40
5    50
Name: age, dtype: int64

In [14]:
target = ['time_in_hospital']
for col in target:
    df['target']=np.nan
    
    condition = df[col]<7
    df.loc[condition,'target']=1
    
    condition = df[col]>=7
    df.loc[condition, 'target'] =0
    
    df[col]=df['target']
    #df[col] = df[col].astype(int)
    df.drop('target',axis=1,inplace=True)

df['time_in_hospital'].astype(int)

In [22]:
cat_cols = list(df.select_dtypes('object').columns)
class_dict = {}
for col in cat_cols:
    df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col])], axis=1)
df.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,AfricanAmerican,...,No,No.1,Steady,Ch,No.2,No.3,Yes,<30,>30,NO
1,10,1.0,59,0,18,0,0,0,9,0,...,1,1,0,1,0,0,1,0,1,0
2,20,1.0,11,5,13,2,0,1,6,1,...,1,1,0,0,1,0,1,0,0,1
3,30,1.0,44,1,16,0,0,0,7,0,...,1,1,0,1,0,0,1,0,0,1
4,40,1.0,51,0,8,0,0,0,5,0,...,1,1,0,1,0,0,1,0,0,1
5,50,1.0,31,6,16,0,0,0,9,0,...,1,1,0,0,1,0,1,0,1,0


In [25]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('time_in_hospital',axis=1),
                                                 df['time_in_hospital'],
                                                 test_size=0.05,
                                                 random_state=0)
X_train.shape,X_test.shape

((93149, 127), (4903, 127))

In [26]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

CLASSIFIERS = {
    'gaussian': GaussianNB(),
    'rf': RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    'dt': DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    'svc': SVC(kernel = 'linear', random_state = 0),
    'abc': AdaBoostClassifier(random_state = 0),
    'bnb': BernoulliNB(),
    'knc': KNeighborsClassifier(),
    'log': LogisticRegression(solver='liblinear',random_state=0)    
}

classifier = CLASSIFIERS['log']
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

np.trace(cm)


classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
accuracy = np.mean(pred == y_test)
print('accuracy: ', accuracy*100, '%')

[[ 286  715]
 [ 142 3760]]
accuracy:  82.52090556801957 %
