# Pipeline Modelling

In [1]:
import pandas as pd
import numpy as np

In [2]:
diabetes = pd.read_csv('cleaned_diabetes.csv')
diabetes = diabetes.drop(columns={'Unnamed: 0'})
diabetes.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,repaglinide,...,change_yes,change_no,diabetesMed_yes,diabetesMed_no,age_encoding,glu_serum_encoding,A1C_encoded,total_visits,polypharmacy,num_meds_chanaged
0,1,41,0,1,0,0,0,1,No,No,...,0,1,0,1,0,0,0,0,0,0
1,3,59,0,18,0,0,0,9,No,No,...,1,0,1,0,1,0,0,0,1,1
2,2,11,5,13,2,0,1,6,No,No,...,0,1,1,0,2,0,0,3,1,0
3,2,44,1,16,0,0,0,7,No,No,...,1,0,1,0,3,0,0,0,1,1
4,1,51,0,8,0,0,0,5,No,No,...,1,0,1,0,4,0,0,0,1,0


In [3]:
X = diabetes.drop(columns=['readmitted'])
y = diabetes['readmitted'] # what we're predicting

In [4]:
diabetes['readmitted'].value_counts(normalize=True)

readmitted
0    0.888424
1    0.111576
Name: proportion, dtype: float64

Since only 11.16% of the data is labelled as readmitted within 30 days, this is a **class imbalance**. We will need to account for it to avoid the mostly being biased and select the majority class, No or > 30, every time. 

In [5]:
diabetes['tolazamide'].unique()

array(['No', 'Steady', 'Up'], dtype=object)

In [6]:
# 80% train, 10% validation, 10% test
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

In [8]:
# We also need to encode the medication! 
meds = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'tolazamide', 'acetohexamide', 'glimepiride-pioglitazone',
       'glipizide-metformin', 'glyburide-metformin', 'insulin', 'troglitazone',
       'miglitol', 'acarbose', 'rosiglitazone']

med_encoding = {'No':0, 'Down':1, 'Steady':2, 'Up':3}
for col in meds:
    if col in X_train.columns:
        X_train[col] = X_train[col].map(med_encoding)
        X_val[col] = X_val[col].map(med_encoding)
        X_test[col] = X_test[col].map(med_encoding)


In [9]:
num_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses', 'age_encoding',
    'glu_serum_encoding', 'A1C_encoded', 'total_visits'
]

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [10]:
X_train[num_cols]

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,age_encoding,glu_serum_encoding,A1C_encoded,total_visits
66885,0.877409,1.318861,-0.784057,0.123183,-0.290766,-0.21195,-0.503031,0.816139,-0.061008,-0.211883,-0.411774,-0.522983
35306,-0.801694,-0.002451,-0.784057,-0.739663,-0.290766,-0.21195,1.075045,-0.737551,0.565411,-0.211883,-0.411774,0.342590
68823,-0.465874,2.030337,0.977442,-0.123344,-0.290766,-0.21195,-0.503031,0.816139,-3.193104,-0.211883,2.953128,-0.522983
24469,1.213230,-0.459829,0.977442,0.123183,-0.290766,-0.21195,-0.503031,-0.737551,-0.061008,-0.211883,-0.411774,-0.522983
49708,-0.465874,-2.086060,0.977442,0.369711,-0.290766,-0.21195,0.286007,0.816139,0.565411,-0.211883,-0.411774,-0.090196
...,...,...,...,...,...,...,...,...,...,...,...,...
3244,0.541588,-0.663108,1.564608,-0.246608,-0.290766,-0.21195,-0.503031,-2.291240,-1.313847,-0.211883,-0.411774,-0.522983
70193,1.549050,0.302467,0.390276,2.711723,-0.290766,-0.21195,-0.503031,0.816139,-0.687427,-0.211883,-0.411774,-0.522983
26185,2.220692,0.861484,-0.784057,0.246447,-0.290766,-0.21195,-0.503031,0.816139,1.191830,-0.211883,-0.411774,-0.522983
18151,-1.137515,-0.002451,-0.784057,-0.862927,-0.290766,-0.21195,0.286007,-0.737551,0.565411,-0.211883,1.831494,-0.090196


In [11]:
diabetes['miglitol'].unique()

array(['No', 'Steady', 'Down', 'Up'], dtype=object)

In [12]:
X_train.isnull().sum().sort_values(ascending=False).head(10)


miglitol                    81393
insulin                     81393
troglitazone                81393
glyburide-metformin         81393
acarbose                    81393
rosiglitazone               81393
glipizide-metformin         81393
glimepiride-pioglitazone    81393
acetohexamide               81393
tolazamide                  81393
dtype: int64

In [13]:
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
log_reg.fit(X_train, y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values