In [1]:
import pandas as pd
import numpy as np
import sklearn
import random
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA as sklearn_pca

np.set_printoptions(formatter={'float':'{:0.4f}'.format})
pd.set_option('display.precision', 5)

filePath = "C:/Chang/NEU/courses/cs6220 datamining/project/dataset_diabetes/diabetic_data.csv"
# Read dataset using pandas
data = pd.read_csv(filePath, sep=',',header=0)

Here we tried to build a logistic regression model to predict if a patient has high probablity of readmission, using features including 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses'. Our model had an accuracy of 54.86%. 

Using PCA we reduced the dimensionality of the original data to 2, and retrained the model. The accuracy was slightly improved to 55.05%.

In [2]:
# Drop encounter_id and patient_nbr. Drop weight and payer_code which has 97% missing data
df = data.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code'], axis = 1)

In [3]:
# Encode readmitted into 0 or 1
df['readmitted'] = np.where(df['readmitted'].str.contains('<30'), 1, 0)

In [4]:
# Select numeric data
numData = df[['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
              'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']]

In [5]:
# Group data according to readmitted or not
readmitted_pos = numData[numData['readmitted'] == 1]
readmitted_neg = numData[numData['readmitted'] == 0]
readmitted_pos.shape

(11357, 9)

In [6]:
# Dataframe to list
readmitted_pos_list = readmitted_pos.values.tolist()
readmitted_neg_list = readmitted_neg.values.tolist()

# Randomly select negative data to have same size as pos data
sample_neg = random.sample(readmitted_neg_list, 12000)
    

In [25]:
# Prepare train and test data
train = readmitted_pos_list[0:5500] + sample_neg[0:5500]
test = readmitted_pos_list[5500:] + sample_neg[5500:]

# The last column is 'readmitted' label
train_X = []
train_Y = []
for x in train:
    train_X.append(x[:-1])
    train_Y.append([x[-1]])

test_X = []
test_Y = []
for x in test:
    test_X.append(x[:-1])
    test_Y.append([x[-1]])

In [26]:
# train logesitc regression model
lr = sklearn.linear_model.LogisticRegression()

In [27]:
lr.fit(train_X, train_Y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
predicts = lr.predict(test_X)

In [29]:
count = 0
for p, y in zip(predicts, test_Y):
    if p == y:
        count += 1
print(count/len(test_Y))

0.5485959375252893


In [30]:
pca = sklearn_pca(n_components=2)
data = pca.fit_transform(train_X)
data.shape

(11000, 2)

In [31]:
test_X = pca.transform(test_X)
test_X.shape

(12357, 2)

In [32]:
lr.fit(data, train_Y)
predicts = lr.predict(test_X)

count = 0
for p, y in zip(predicts, test_Y):
    if p == y:
        count += 1
print(count/len(test_Y))

0.5505381565104799


  y = column_or_1d(y, warn=True)
