# **Diabetic Patient Readmission: Pre-processing, Training Data Development, and Initial Models**

This dataset was analyzed by numerous Virginia Commonwealth University faculty in a recent research article which is accompanied by feature descriptions. These can be found at https://www.hindawi.com/journals/bmri/2014/781670/tab1/.

In [1]:
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import pipeline, svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, accuracy_score, make_scorer

%matplotlib inline

In [2]:
df1 = pd.read_csv('clean_data2.csv')
df1.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,Other
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,Other
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,Other


# Pre-processing:

In [3]:
df1 = df1.drop(columns=['encounter_id','patient_nbr'])    #irrelevant columns for modeling

In [4]:
X = df1.drop(columns=['readmitted'])
y = df1[['readmitted']]
X.shape, y.shape

((97294, 43), (97294, 1))

In [5]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,7,3,59,0,18,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,0,0,0,0,1,1
3,1,1,7,2,44,1,16,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split:

In [7]:
# Split the data into a training and test set.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=42)

# Inital Models:

In [8]:
logreg = LogisticRegression()     #Default C=1.0
logreg_cv_results = cross_validate(logreg, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
logreg_cv_scores = logreg_cv_results['test_score']
print(np.mean(logreg_cv_scores), np.std(logreg_cv_scores))

0.8390732941524386 0.000911942551961447


In [9]:
def class_report(clf, X, y):
    clf = clf.fit(X, y)
    y_pred = clf.predict(X)
    y_true = y
    print(classification_report(y_true, y_pred))

In [10]:
class_report(logreg, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.59      0.04      0.07      8154
       Other       0.89      1.00      0.94     64816

    accuracy                           0.89     72970
   macro avg       0.74      0.52      0.51     72970
weighted avg       0.86      0.89      0.84     72970



In [11]:
knn = KNeighborsClassifier()     # Default n_neighbors=5
knn_cv_results = cross_validate(knn, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
knn_cv_scores = knn_cv_results['test_score']
print(np.mean(knn_cv_scores), np.std(knn_cv_scores))

0.8343890098978795 0.0007200713181354968


In [12]:
class_report(knn, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.66      0.09      0.16      8154
       Other       0.90      0.99      0.94     64816

    accuracy                           0.89     72970
   macro avg       0.78      0.54      0.55     72970
weighted avg       0.87      0.89      0.86     72970



In [13]:
dtree = DecisionTreeClassifier()    #Default max_depth=None
dtree_cv_results = cross_validate(dtree, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
dtree_cv_scores = dtree_cv_results['test_score']
print(np.mean(dtree_cv_scores), np.std(dtree_cv_scores))

0.8181262774882108 0.0016340115059804703


In [14]:
class_report(dtree, X_train, y_train)

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00      8154
       Other       1.00      1.00      1.00     64816

    accuracy                           1.00     72970
   macro avg       1.00      1.00      1.00     72970
weighted avg       1.00      1.00      1.00     72970



This result makes sense for a Decision Tree Classifier with no max depth.

In [15]:
rfclf = RandomForestClassifier(random_state=42)    #Default n_estimators=100 and max_depth=None
rfclf_cv_results = cross_validate(rfclf, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
rfclf_cv_scores = rfclf_cv_results['test_score']
print(np.mean(rfclf_cv_scores), np.std(rfclf_cv_scores))

0.8363163409003096 0.000382319325557314


In [16]:
class_report(rfclf, X_train, y_train)

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00      8154
       Other       1.00      1.00      1.00     64816

    accuracy                           1.00     72970
   macro avg       1.00      1.00      1.00     72970
weighted avg       1.00      1.00      1.00     72970



In [17]:
svmclf = svm.LinearSVC(random_state=42)    #Default C=1.0
svm_cv_results = cross_validate(svmclf, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
svm_cv_scores = svm_cv_results['test_score']
print(np.mean(svm_cv_scores), np.std(svm_cv_scores))

0.8292039310675205 0.0007536205251484708


In [18]:
class_report(svmclf, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.22      0.08      0.11      8154
       Other       0.89      0.97      0.93     64816

    accuracy                           0.87     72970
   macro avg       0.56      0.52      0.52     72970
weighted avg       0.82      0.87      0.84     72970



In [19]:
gbc = GradientBoostingClassifier(max_features=80, learning_rate=1, random_state=42)    #Default n_estimators=100, criterion='friedman_mse', max_depth=3
gbc_cv_results = cross_validate(gbc, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
gbc_cv_scores = gbc_cv_results['test_score']
print(np.mean(gbc_cv_scores), np.std(gbc_cv_scores))

0.8380959949338823 0.0010261218651382267


In [20]:
class_report(gbc, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.63      0.07      0.12      8154
       Other       0.89      1.00      0.94     64816

    accuracy                           0.89     72970
   macro avg       0.76      0.53      0.53     72970
weighted avg       0.87      0.89      0.85     72970



In [21]:
xgbooster = xgb.XGBClassifier()
xgb_cv_results = cross_validate(xgbooster, X_train, y_train, cv=5, 
                                    scoring='f1_weighted', n_jobs=-1) #'f1' producing 'nan' for all scores
xgb_cv_scores = xgb_cv_results['test_score']
print(np.mean(xgb_cv_scores), np.std(xgb_cv_scores))

0.8401334142690748 0.0009237479352603954


In [22]:
class_report(xgbooster, X_train, y_train)

              precision    recall  f1-score   support

         <30       0.87      0.05      0.10      8154
       Other       0.89      1.00      0.94     64816

    accuracy                           0.89     72970
   macro avg       0.88      0.53      0.52     72970
weighted avg       0.89      0.89      0.85     72970



At a glance, it seems we will want to dig deeper with **xgbooster**, **Random Forest Classifier**, and **Logistic Regression**