# **Diabetic Patient Readmission -- Pre-processing and Training Data Development**

This dataset was analyzed by numerous Virginia Commonwealth University faculty in a recent research article which is accompanied by feature descriptions. These can be found at https://www.hindawi.com/journals/bmri/2014/781670/tab1/.

In [1]:
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import pipeline, svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, accuracy_score

%matplotlib inline

In [2]:
df1 = pd.read_csv('clean_data2.csv')
df1.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,Other
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,Other
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,Other


In [3]:
df1 = df1.drop(columns=['encounter_id','patient_nbr'])    #irrelevant columns for modeling

In [4]:
X = df1.drop(columns=['readmitted'])
y = df1[['readmitted']]
X.shape, y.shape

((101766, 43), (101766, 1))

In [5]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,7,3,59,0,18,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,0,0,0,0,1,1
3,1,1,7,2,44,1,16,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
chi_sq = chi2(X, y)
chi_sq

(array([1.42599707e+01, 1.95199373e+03, 9.89249834e+00, ...,
        1.25618025e-01, 1.78868771e+01, 1.72186810e+01]),
 array([1.59215094e-04, 0.00000000e+00, 1.65953905e-03, ...,
        7.23019404e-01, 2.34432061e-05, 3.33144287e-05]))

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split the data into a training and test set.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=42)

In [9]:
logreg = LogisticRegression()     #Default C=1.0
logreg.fit(X_train, y_train)
print(accuracy_score(logreg.predict(X_test), y_test))

0.8854256740822263


In [10]:
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train, y_train)
print(accuracy_score(knn3.predict(X_test), y_test))

0.8645546733747347


In [11]:
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
print(accuracy_score(knn5.predict(X_test), y_test))

0.8785865891046302


In [12]:
dtree = DecisionTreeClassifier()    #Default max_depth=None
dtree.fit(X_train, y_train)
print(accuracy_score(dtree.predict(X_test), y_test))

0.8202185362785944


In [13]:
rfclf = RandomForestClassifier(random_state=42)    #Default n_estimators=100 and max_depth=None
rfclf.fit(X_train, y_train)
print(accuracy_score(rfclf.predict(X_test), y_test))

0.8883342504520085


In [14]:
svmclf = svm.LinearSVC(random_state=42)    #Default C=1.0
svmclf.fit(X_train, y_train)
print(accuracy_score(svmclf.predict(X_test), y_test))

0.8593270969263422


In [15]:
gbc = GradientBoostingClassifier(max_features=80, learning_rate=1, random_state=42)    #Default n_estimators=100, criterion='friedman_mse', max_depth=3
gbc.fit(X_train, y_train)
print(accuracy_score(gbc.predict(X_test), y_test))

0.8833031994340068


In [21]:
train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
test = xgb.DMatrix(X_test, enable_categorical=True)
xgbooster = xgb.Booster()
xgbooster.train(train)
print(accuracy_score(xgbooster.predict(test), y_test))

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:readmitted