## 1. Imports and loading data

In [18]:
import pandas as pd
import numpy as np
import sklearn
import random
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA as sklearn_pca

np.set_printoptions(formatter={'float':'{:0.4f}'.format})
pd.set_option('display.precision', 5)

filePath = "C:/Chang/NEU/courses/cs6220 datamining/project/dataset_diabetes/diabetic_data.csv"
# Read dataset using pandas
data = pd.read_csv(filePath, sep=',',header=0)

## 2. Preprocessing
This data set contains both numeric and nominal data types. For numeric data we need to do normalization, while for categorical data, we perform one-hot encoding.

In [19]:
# Drop encounter_id and patient_nbr. Drop weight and payer_code which has 97% missing data
df = data.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code'], axis = 1)

In [20]:
# TODO: Currently performed binary encoding. Should try data binning.
df['diag_1'] = np.where(df['diag_1'].str.contains('250'), 1, 0)
df['diag_2'] = np.where(df['diag_2'].str.contains('250'), 1, 0)
df['diag_3'] = np.where(df['diag_3'].str.contains('250'), 1, 0)

# Encode readmitted into 0 or 1
df['readmitted'] = np.where(df['readmitted'].str.contains('<30'), 1, 0)

In [21]:
# Standardizing numeric data
from sklearn.preprocessing import StandardScaler
numData = df[['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
              'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']]
numData_std = StandardScaler().fit_transform(numData)

In [22]:
# Confirm that the numeric data are standardized
df_numData = pd.DataFrame(numData_std)
df_numData.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,2.35731e-14,3.24394e-16,4.23327e-15,-2.18372e-14,1.00082e-13,2.50927e-14,-4.3145e-14,1.28417e-13
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.13765,-2.13963,-0.785398,-1.84827,-0.291461,-0.21262,-0.503276,-3.3216
25%,-0.802651,-0.614795,-0.785398,-0.74092,-0.291461,-0.21262,-0.503276,-0.735733
50%,-0.132655,0.0459666,-0.199162,-0.125726,-0.291461,-0.21262,-0.503276,0.298612
75%,0.537341,0.706728,0.387074,0.489467,-0.291461,-0.21262,0.288579,0.815784
max,3.21732,4.51881,2.73202,7.99483,32.8509,81.4667,16.1257,4.43599


In [23]:
df[['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
              'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']] = df_numData

In [24]:
# Apply one-hot encoding to categorical columns
oneHotData = pd.get_dummies(df, columns=['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id',
                                         'admission_source_id', 'medical_specialty', 
                                         'diag_1', 
                                         'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 
                                         'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 
                                         'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 
                                         'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 
                                         'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
                                         'glipizide-metformin', 'glimepiride-pioglitazone', 
                                         'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 
                                         'diabetesMed', 'readmitted'],
                            prefix=['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 
                                    'admission_source_id', 'medical_specialty', 
                                    'diag_1', 'diag_2', 'diag_3', 
                                    'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 
                                    'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 
                                    'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 
                                    'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
                                    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
                                    'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed',
                                    'readmitted'])
oneHotData.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_?,race_AfricanAmerican,...,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_0,readmitted_1
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,...,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,2.35731e-14,3.24394e-16,4.23327e-15,-2.18372e-14,1.00082e-13,2.50927e-14,-4.3145e-14,1.28417e-13,0.02234,0.18877,...,0.99998,2e-05,0.99999,9.82646e-06,0.46195,0.53805,0.22997,0.77003,0.8884,0.1116
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.14777,0.39132,...,0.00443,0.00443,0.00313,0.00313472,0.49855,0.49855,0.42081,0.42081,0.31487,0.31487
min,-1.13765,-2.13963,-0.785398,-1.84827,-0.291461,-0.21262,-0.503276,-3.3216,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.802651,-0.614795,-0.785398,-0.74092,-0.291461,-0.21262,-0.503276,-0.735733,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
50%,-0.132655,0.0459666,-0.199162,-0.125726,-0.291461,-0.21262,-0.503276,0.298612,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,0.537341,0.706728,0.387074,0.489467,-0.291461,-0.21262,0.288579,0.815784,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,3.21732,4.51881,2.73202,7.99483,32.8509,81.4667,16.1257,4.43599,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. Train Logistic Regression Model to predict readmission

In [25]:
# prepare train and test data for logistic regression
# Group data according to readmitted or not
readmitted_pos = oneHotData[oneHotData['readmitted_1'] == 1]
readmitted_neg = oneHotData[oneHotData['readmitted_0'] == 1]
readmitted_pos.shape

(11357, 242)

In [124]:
# Dataframe to list
readmitted_pos_list = readmitted_pos.values.tolist()
readmitted_neg_list = readmitted_neg.values.tolist()

# Randomly select negative data to have same size as pos data
sample_neg = random.sample(readmitted_neg_list, 12000)

# Prepare train and test data
train = readmitted_pos_list[0:8500] + sample_neg[0:8500]
test = readmitted_pos_list[8500:] + sample_neg[8500:]

# The last column is 'readmitted' label
train_X = []
train_Y = []
for x in train:
    train_X.append(x[:-2])
    train_Y.append([x[-1]])

test_X = []
test_Y = []
for x in test:
    test_X.append(x[:-2])
    test_Y.append([x[-1]])

In [125]:
# train logesitc regression model
lr = sklearn.linear_model.LogisticRegression()

lr.fit(train_X, train_Y)

predicts = lr.predict(test_X)

count = 0
for p, y in zip(predicts, test_Y):
    if p == y:
        count += 1
print(count/len(test_Y))

  y = column_or_1d(y, warn=True)


0.5656756331602958


The accuracy of the model is 56.57%, next we want to reduce the dimensionality of the training data using PCA and retrain the model to see if we can improve the performance.

## 4. PCA

In [126]:
pca = sklearn_pca(n_components=6)
data = pca.fit_transform(train_X)
data.shape

(17000, 6)

In [127]:
test_X = pca.transform(test_X)
test_X.shape

(6357, 6)

In [128]:
print(pca.explained_variance_ratio_)

[0.1256 0.1111 0.0678 0.0630 0.0559 0.0522]


In [129]:
# Retrain the model
lr.fit(data, train_Y)
predicts = lr.predict(test_X)

count = 0
for p, y in zip(predicts, test_Y):
    if p == y:
        count += 1
print(count/len(test_Y))

0.6064181217555451


  y = column_or_1d(y, warn=True)


The accuracy of the model has been improved to 60.64% after training with low-dimensional data.