# **Diabetic Patient Readmission -- Pre-processing and Training Data Development**

This dataset was analyzed by numerous Virginia Commonwealth University faculty in a recent research article which is accompanied by feature descriptions. These can be found at https://www.hindawi.com/journals/bmri/2014/781670/tab1/.

In [1]:
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import pipeline, svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, accuracy_score, make_scorer

%matplotlib inline

In [2]:
df1 = pd.read_csv('clean_data2.csv')
df1.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,Other
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,Other
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,Other


In [3]:
df1 = df1.drop(columns=['encounter_id','patient_nbr'])    #irrelevant columns for modeling

In [4]:
X = df1.drop(columns=['readmitted'])
y = df1[['readmitted']]
X.shape, y.shape

((101766, 43), (101766, 1))

In [5]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,7,3,59,0,18,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,0,0,0,0,1,1
3,1,1,7,2,44,1,16,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
chi_sq = chi2(X, y)
chi_sq

(array([1.42599707e+01, 1.95199373e+03, 9.89249834e+00, ...,
        1.25618025e-01, 1.78868771e+01, 1.72186810e+01]),
 array([1.59215094e-04, 0.00000000e+00, 1.65953905e-03, ...,
        7.23019404e-01, 2.34432061e-05, 3.33144287e-05]))

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split the data into a training and test set.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=42)

In [9]:
logreg = LogisticRegression()     #Default C=1.0
logreg_cv_results = cross_validate(logreg, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
logreg_cv_scores = logreg_cv_results['test_score']
print(np.mean(logreg_cv_scores), np.std(logreg_cv_scores))

0.8402788531271586 0.0012654842316573232


In [10]:
def class_report(clf, X, y):
    clf = clf.fit(X, y)
    y_pred = clf.predict(X)
    y_true = y
    print(classification_report(y_true, y_pred))

In [11]:
class_report(logreg, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.63      0.04      0.08      8508
       Other       0.89      1.00      0.94     67816

    accuracy                           0.89     76324
   macro avg       0.76      0.52      0.51     76324
weighted avg       0.86      0.89      0.85     76324



In [12]:
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3_cv_results = cross_validate(knn3, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
knn3_cv_scores = knn3_cv_results['test_score']
print(np.mean(knn3_cv_scores), np.std(knn3_cv_scores))

0.8321285434174606 0.000810181128554641


In [13]:
class_report(knn3, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.72      0.24      0.36      8508
       Other       0.91      0.99      0.95     67816

    accuracy                           0.90     76324
   macro avg       0.82      0.61      0.65     76324
weighted avg       0.89      0.90      0.88     76324



In [14]:
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5_cv_results = cross_validate(knn5, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
knn5_cv_scores = knn5_cv_results['test_score']
print(np.mean(knn5_cv_scores), np.std(knn5_cv_scores))

0.8360957220214129 0.0006341719243474624


In [15]:
class_report(knn5, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.66      0.09      0.16      8508
       Other       0.90      0.99      0.94     67816

    accuracy                           0.89     76324
   macro avg       0.78      0.54      0.55     76324
weighted avg       0.87      0.89      0.86     76324



In [16]:
dtree = DecisionTreeClassifier()    #Default max_depth=None
dtree_cv_results = cross_validate(dtree, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
dtree_cv_scores = dtree_cv_results['test_score']
print(np.mean(dtree_cv_scores), np.std(dtree_cv_scores))

0.8185659566436037 0.0014226651221841584


In [17]:
class_report(dtree, X_train, y_train)

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00      8508
       Other       1.00      1.00      1.00     67816

    accuracy                           1.00     76324
   macro avg       1.00      1.00      1.00     76324
weighted avg       1.00      1.00      1.00     76324



In [18]:
rfclf = RandomForestClassifier(random_state=42)    #Default n_estimators=100 and max_depth=None
rfclf_cv_results = cross_validate(rfclf, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
rfclf_cv_scores = rfclf_cv_results['test_score']
print(np.mean(rfclf_cv_scores), np.std(rfclf_cv_scores))

0.8367650447037785 0.00010619024598811543


In [19]:
class_report(rfclf, X_train, y_train)

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00      8508
       Other       1.00      1.00      1.00     67816

    accuracy                           1.00     76324
   macro avg       1.00      1.00      1.00     76324
weighted avg       1.00      1.00      1.00     76324



In [20]:
svmclf = svm.LinearSVC(random_state=42)    #Default C=1.0
svm_cv_results = cross_validate(svmclf, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
svm_cv_scores = svm_cv_results['test_score']
print(np.mean(svm_cv_scores), np.std(svm_cv_scores))

0.8305078768135129 0.0019464326041995612


In [21]:
class_report(svmclf, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.21      0.07      0.10      8508
       Other       0.89      0.97      0.93     67816

    accuracy                           0.87     76324
   macro avg       0.55      0.52      0.52     76324
weighted avg       0.82      0.87      0.84     76324



In [22]:
gbc = GradientBoostingClassifier(max_features=80, learning_rate=1, random_state=42)    #Default n_estimators=100, criterion='friedman_mse', max_depth=3
gbc_cv_results = cross_validate(gbc, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
gbc_cv_scores = gbc_cv_results['test_score']
print(np.mean(gbc_cv_scores), np.std(gbc_cv_scores))

0.839034061781024 0.00042559714327267123


In [23]:
class_report(gbc, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.63      0.06      0.11      8508
       Other       0.89      1.00      0.94     67816

    accuracy                           0.89     76324
   macro avg       0.76      0.53      0.53     76324
weighted avg       0.86      0.89      0.85     76324



In [24]:
xgbooster = xgb.XGBClassifier()
xgb_cv_results = cross_validate(xgbooster, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
xgb_cv_scores = xgb_cv_results['test_score']
print(np.mean(xgb_cv_scores), np.std(xgb_cv_scores))

0.8400447559508795 0.0005316555915480847


In [25]:
class_report(xgbooster, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.90      0.05      0.10      8508
       Other       0.89      1.00      0.94     67816

    accuracy                           0.89     76324
   macro avg       0.90      0.53      0.52     76324
weighted avg       0.89      0.89      0.85     76324

