# **Diabetic Patient Readmission: Modeling**

This dataset was analyzed by a number of Virginia Commonwealth University faculty members in a research article, which is accompanied by a table of feature descriptions. The feature descriptions can be found at https://www.hindawi.com/journals/bmri/2014/781670/tab1/.

In [1]:
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, accuracy_score

from timeit import default_timer as timer

%matplotlib inline

In [2]:
# Read clean_data2.csv from the working directory (presumably the output of an
# earlier cleaning step - confirm) and preview the first five rows.
df1 = pd.read_csv(filepath_or_buffer="clean_data2.csv")
df1.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,Other
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,Other
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,Other
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,Other


# Pre-processing:

In [3]:
# Drop columns that must not be used as predictors:
#   - 'encounter_id' / 'patient_nbr' are record identifiers;
#   - 'Unnamed: 0' appears to be the row index that was written out with the
#     CSV (values 0, 1, 2, ... in the preview); left in place it would be
#     scaled and fed to the models as if it were a real feature.
# errors='ignore' keeps this cell re-runnable: a second execution finds the
# columns already gone and is a no-op instead of raising KeyError.
df1 = df1.drop(
    columns=['Unnamed: 0', 'encounter_id', 'patient_nbr'],
    errors='ignore',
)

In [4]:
# Separate predictors from the target.
# X: every column except 'readmitted'.
# y: the 'readmitted' column, kept as a single-column DataFrame (2-D).
X = df1.drop('readmitted', axis=1)
y = df1.loc[:, ['readmitted']]
(X.shape, y.shape)

((97294, 43), (97294, 1))

In [5]:
# One-hot encode the categorical columns of X. drop_first=True omits the first
# level of each encoded column so the dummies are not perfectly collinear.
X = pd.get_dummies(data=X, drop_first=True)
X.head(5)

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,7,3,59,0,18,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,0,0,0,0,1,1
3,1,1,7,2,44,1,16,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
# Standardize the feature matrix with StandardScaler and report how long the
# step took (wall-clock, rounded to whole seconds).
start = timer()

scaler = StandardScaler()
scaler.fit(X)                     # learn the per-column statistics
X_scaled = scaler.transform(X)    # apply them to the same matrix

end = timer()
elapsed_seconds = round(end - start)
print(f'{elapsed_seconds} seconds elapsed.')

5 seconds elapsed.


# Train-test split:

In [7]:
# Split the data into train and test sets.
#
# The split is done on the raw (unscaled) feature matrix X rather than on the
# X_scaled array computed earlier: a scaler fitted on all rows has already seen
# the test rows, so their mean/variance would leak into training and make the
# test metrics optimistic. random_state and the number of rows are unchanged,
# so the same rows land in train and test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to the held-out rows.
train_scaler = StandardScaler()
X_train = train_scaler.fit_transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [8]:
# Sanity check: show the shapes of the four splits on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(72970, 2398) (24324, 2398) (72970, 1) (24324, 1)
