In [16]:
#import packages
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt


In [2]:
# Load the two selected-feature matrices (X) that the classifiers are trained on.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code - only load .npy files that come from a trusted source.
X1 = np.load('dataset_diabetes/tree_selected_25_X.npy', allow_pickle=True)  # decision tree top 25
X2 = np.load('dataset_diabetes/SVD_selected_25_X.npy', allow_pickle=True)  # SVD top 25

# Load the raw readmission labels and binarise them for the classifier:
#   '<30'           -> 1  (readmitted)
#   'NO' and '>30'  -> 0  (not readmitted, or at least not soon)
readmission = np.load('dataset_diabetes/Y.npy', allow_pickle=True)

is_early_readmit = readmission == '<30'
is_known_label = is_early_readmit | (readmission == 'NO') | (readmission == '>30')

# Fail loudly on labels outside the three expected categories instead of letting
# them leak into the target vector unencoded.
if not np.all(is_known_label):
    unexpected = set(readmission[~is_known_label].tolist())
    raise ValueError(f"Unexpected readmission labels: {unexpected!r}")

# A single boolean comparison yields clean integer labels regardless of whether the
# stored array has a string or an object dtype.
Y0 = is_early_readmit.astype(int)
Y = Y0.tolist()  # plain list of Python ints (0/1)

# Check that the data has the expected size

In [3]:
# Display (rows, columns) of the decision-tree feature matrix.
X1.shape # features chosen by decision tree (binary, meaningful)

(101766, 25)

In [4]:
# Display (rows, columns) of the SVD feature matrix.
X2.shape # features chosen by SVD - not binary, not really meaningful

(101766, 25)

In [5]:
# Number of labels; should equal the row count of X1 and X2.
len(Y)

101766

# Split into train and test data sets

In [6]:
# Give the loaded feature matrices descriptive names.
X_DT, X_SVD = X1, X2

# Hold out 33% of the rows as a test set. Both calls share the same options
# (including the seed), so the two feature sets are split with one configuration.
split_options = {'test_size': 0.33, 'random_state': 1}
XDT_train, XDT_test, YDT_train, YDT_test = train_test_split(X_DT, Y, **split_options)
XSVD_train, XSVD_test, YSVD_train, YSVD_test = train_test_split(X_SVD, Y, **split_options)

In [7]:
# Training portion of the decision-tree features after the 67/33 split.
XDT_train.shape

(68183, 25)

In [8]:
# Training portion of the SVD features after the 67/33 split.
XSVD_train.shape

(68183, 25)

# Logistic Regression Models

In [9]:
# Both classifiers are fitted with the same solver settings; only the features differ.
# NOTE(review): the confusion matrices recorded further down show almost no positive
# predictions - consider handling the class imbalance (e.g. class_weight='balanced').
logit_options = dict(random_state=0, max_iter=10000, tol=1e-8)

DT_logit = LogisticRegression(**logit_options).fit(XDT_train, YDT_train)
SVD_logit = LogisticRegression(**logit_options).fit(XSVD_train, YSVD_train)

# Model predictions (train and test)

In [10]:
# Class predictions from each fitted model on its own train and test features.
YhatDT_train, YhatDT_test = (
    DT_logit.predict(features) for features in (XDT_train, XDT_test)
)
YhatSVD_train, YhatSVD_test = (
    SVD_logit.predict(features) for features in (XSVD_train, XSVD_test)
)

In [11]:
# Shape of the predictions made on the SVD test features.
YhatSVD_test.shape

(33583,)

# Confusion Matrices

In [12]:
# Confusion matrices (true labels vs. predicted labels) for each model,
# on the training data first and then on the held-out test data.
CM_SVD_Train = confusion_matrix(YSVD_train, YhatSVD_train)
CM_Tree_Train = confusion_matrix(YDT_train, YhatDT_train)

CM_SVD_Test = confusion_matrix(YSVD_test, YhatSVD_test)
CM_Tree_Test = confusion_matrix(YDT_test, YhatDT_test)

print(f'SVD Train Conf. Matrix: \n{CM_SVD_Train}\n')
print(f'Tree Train Conf. Matrix: \n{CM_Tree_Train}\n')

print(f'SVD Test Conf. Matrix: \n{CM_SVD_Test}\n')
print(f'Tree Test Conf. Matrix: \n{CM_Tree_Test}')

SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

Tree Train Conf. Matrix: 
[[60418    65]
 [ 7633    67]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Test Conf. Matrix: 
[[29873    53]
 [ 3630    27]]


In [13]:
# Second row of each confusion matrix: its sum is reported as the number of
# readmitted patients, and its second entry as the number correctly predicted.
SVD_train_total = CM_SVD_Train[1].sum()
SVD_train_correct = CM_SVD_Train[1, 1]

Tree_train_total = CM_Tree_Train[1].sum()
Tree_train_correct = CM_Tree_Train[1, 1]

print(
    'SVD Training Data Set:',
    f'Total Readmitted Patients: {SVD_train_total}',
    f'Total Correctly Predicted: {SVD_train_correct}\n',
    sep='\n',
)
print(
    'Tree Training Data Set:',
    f'Total Readmitted Patients: {Tree_train_total}',
    f'Total Correctly Predicted: {Tree_train_correct}',
    sep='\n',
)

SVD Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 0

Tree Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 67


In [15]:
# Second row of each test confusion matrix: its sum is reported as the number of
# readmitted patients, and its second entry as the number correctly predicted.
SVD_test_total = CM_SVD_Test[1].sum()
SVD_test_correct = CM_SVD_Test[1, 1]

Tree_test_total = CM_Tree_Test[1].sum()
Tree_test_correct = CM_Tree_Test[1, 1]

print(
    'SVD Test Data Set:',
    f'Total Readmitted Patients: {SVD_test_total}',
    f'Total Correctly Predicted: {SVD_test_correct}\n',
    sep='\n',
)
print(
    'Tree Test Data Set:',
    f'Total Readmitted Patients: {Tree_test_total}',
    f'Total Correctly Predicted: {Tree_test_correct}',
    sep='\n',
)

SVD Test Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 0

Tree Test Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 27
