In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Load the data

In [2]:
def _load_npy(path):
    """Read one saved NumPy array from `path`.

    NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    which can execute arbitrary code -- only use on files you trust.
    """
    return np.load(path, allow_pickle=True)


# Two alternative 25-column feature matrices (SVD-selected and tree-selected).
X_SVD = _load_npy('dataset_diabetes/SVD_selected_25_X.npy')
X_tree = _load_npy('dataset_diabetes/tree_selected_25_X.npy')

# Raw readmission labels plus the encounter / patient identifier arrays.
readmission = _load_npy('dataset_diabetes/Y.npy')
encounter_id = _load_npy('dataset_diabetes/encounter_id.npy')
patient_nbr = _load_npy('dataset_diabetes/patient_nbr.npy')

In [3]:
# Binary target: 1 when the raw label is '<30', 0 for every other label
# ('NO' and '>30' in this data set).
# A single comparison cast to int always yields integer labels, whatever the
# dtype of `readmission`; substituting 0/1 into the string array with np.where
# makes the label type depend on how the array was stored.
Y0 = (np.asarray(readmission) == '<30').astype(int)

# Plain Python list of ints, as consumed by train_test_split below.
Y = Y0.tolist()

## Split into 2/3 training and 1/3 testing data sets

In [4]:
# Hold out 33% of the rows for testing. Both feature sets are split against the
# same labels with the same seed, so the two splits use identical settings.
split_options = dict(test_size=0.33, random_state=1)

X0_train, X0_test, Y0_train, Y0_test = train_test_split(X_SVD, Y, **split_options)
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X_tree, Y, **split_options)

# Report the shape of each training matrix.
for train_features in (X0_train, X1_train):
    print(np.shape(train_features))

(68183, 25)
(68183, 25)


## Train Neural Network

Each network uses a deep architecture of 10 hidden layers with 100 nodes each. The maximum number of iterations is set high and the tolerance low so that each network has ample training time to converge. The final loss value of each network is reported below.

In [5]:
# Architecture: `n_layers` hidden layers, each `n_nodes` units wide.
n_layers = 10    # number of hidden layers
n_nodes = 100    # units in every hidden layer

# One entry per hidden layer, e.g. (100, 100, ..., 100) with 10 entries.
hidden_layer_sizes = (n_nodes,) * n_layers


def _fit_mlp(X, y):
    """Fit an MLPClassifier with the shared architecture on (X, y).

    A large max_iter and a small tol are used so that training is not cut
    short; random_state is fixed so the fit is repeatable.
    Returns the fitted classifier.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        random_state=1,
        max_iter=10000,
        tol=1e-8,
    )
    return classifier.fit(X, y)


# One network per feature-selection method.
nnet_SVD = _fit_mlp(X0_train, Y0_train)
nnet_tree = _fit_mlp(X1_train, Y1_train)

In [6]:
# Final value of the training loss stored on each fitted network.
for loss_label, fitted_net in (('SVD Loss: ', nnet_SVD), ('Tree Loss: ', nnet_tree)):
    print(loss_label + str(fitted_net.loss_))

SVD Loss: 0.07482104190971772
Tree Loss: 0.3199722656256294


## Confusion Matrices

Confusion matrices are used to analyze how well the readmitted cases (label 1, i.e. `<30`) were classified, since these are arguably the more important cases to get correct.

In [7]:
# Predictions of each network on its own training and test features.
Yhat_SVD_train, Yhat_SVD_test = (nnet_SVD.predict(feats) for feats in (X0_train, X0_test))
Yhat_tree_train, Yhat_tree_test = (nnet_tree.predict(feats) for feats in (X1_train, X1_test))

# Confusion matrices: rows are true labels, columns are predicted labels.
CM_SVD_Train = confusion_matrix(y_true=Y0_train, y_pred=Yhat_SVD_train)
CM_Tree_Train = confusion_matrix(y_true=Y1_train, y_pred=Yhat_tree_train)
CM_SVD_Test = confusion_matrix(y_true=Y0_test, y_pred=Yhat_SVD_test)
CM_Tree_Test = confusion_matrix(y_true=Y1_test, y_pred=Yhat_tree_test)

# Print each matrix under its heading; every report except the last is
# followed by an extra blank line.
for cm_heading, cm_values, cm_tail in (
    ('SVD Train Conf. Matrix: ', CM_SVD_Train, '\n'),
    ('Tree Train Conf. Matrix: ', CM_Tree_Train, '\n'),
    ('SVD Test Conf. Matrix: ', CM_SVD_Test, '\n'),
    ('Tree Test Conf. Matrix: ', CM_Tree_Test, ''),
):
    print(cm_heading + '\n' + str(cm_values) + cm_tail)

SVD Train Conf. Matrix: 
[[60074   409]
 [ 1284  6416]]

Tree Train Conf. Matrix: 
[[60333   150]
 [ 7217   483]]

SVD Test Conf. Matrix: 
[[27323  2603]
 [ 3192   465]]

Tree Test Conf. Matrix: 
[[29664   262]
 [ 3546   111]]


In [8]:
# Row 1 of a confusion matrix holds the samples whose true label is 1
# (readmitted): its sum is the number of such samples, and entry [1, 1] is how
# many of them were also predicted as 1.
SVD_train_total = CM_SVD_Train[1].sum()
SVD_train_correct = CM_SVD_Train[1, 1]

Tree_train_total = CM_Tree_Train[1].sum()
Tree_train_correct = CM_Tree_Train[1, 1]

# Same three-line report for each model, with a blank line between the two.
for report_title, n_readmitted, n_correct, report_tail in (
    ('SVD Training Data Set:', SVD_train_total, SVD_train_correct, '\n'),
    ('Tree Training Data Set:', Tree_train_total, Tree_train_correct, ''),
):
    print(report_title)
    print('Total Readmitted Patients: ' + str(n_readmitted))
    print('Total Correctly Predicted: ' + str(n_correct) + report_tail)

SVD Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 6416

Tree Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 483


In [9]:
# Row 1 of a confusion matrix holds the samples whose true label is 1
# (readmitted): its sum is the number of such samples, and entry [1, 1] is how
# many of them were also predicted as 1.
SVD_test_total = CM_SVD_Test[1].sum()
SVD_test_correct = CM_SVD_Test[1, 1]

Tree_test_total = CM_Tree_Test[1].sum()
Tree_test_correct = CM_Tree_Test[1, 1]

# These counts come from the held-out test split, so the headers say "Testing".
for report_title, n_readmitted, n_correct, report_tail in (
    ('SVD Testing Data Set:', SVD_test_total, SVD_test_correct, '\n'),
    ('Tree Testing Data Set:', Tree_test_total, Tree_test_correct, ''),
):
    print(report_title)
    print('Total Readmitted Patients: ' + str(n_readmitted))
    print('Total Correctly Predicted: ' + str(n_correct) + report_tail)

SVD Training Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 465

Tree Training Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 111
