In [19]:
#import packages
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt


In [20]:
#import data (X - selected features)
X1=np.load('dataset_diabetes/tree_selected_25_X.npy',allow_pickle=True) #decision tree top 25
X2=np.load('dataset_diabetes/SVD_selected_25_X.npy',allow_pickle=True) # SVD top 25

#import and format Y (labels for our classifier - readmitted=1 or not (at least not soon)=0)
readmission=np.load('dataset_diabetes/Y.npy',allow_pickle=True)
my_list = np.where(readmission == 'NO', 0, readmission)
my_list2 = np.where(my_list == '>30', 0, my_list)
Y0 = np.where(my_list2 == '<30', 1, my_list2)
Y=list(Y0)

# Check to make sure data is size I expect

In [21]:
X1.shape # features chosen by decision tree (binary, meaningful)

(101766, 25)

In [22]:
X2.shape # features chosen by SVD - not binary, not rly meaningful

(101766, 25)

In [23]:
len(Y)

101766

# Split into train and test data sets

In [24]:
# relabel X arrays
X_DT = X1
X_SVD = X2

# Split data
XDT_train, XDT_test, YDT_train, YDT_test = train_test_split(X_DT, Y, test_size=0.33, random_state=1)
XSVD_train, XSVD_test, YSVD_train, YSVD_test = train_test_split(X_SVD, Y, test_size=0.33, random_state=1)

In [25]:
XDT_train.shape

(68183, 25)

In [26]:
XSVD_train.shape

(68183, 25)

# Logistic Regression Models

## Default Logistic Regression

In [36]:
### Default Log regression
DT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none').fit(XDT_train, YDT_train)
SVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none').fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = DT_logit.predict(XDT_train)
YhatDT_test = DT_logit.predict(XDT_test)
YhatSVD_train = SVD_logit.predict(XSVD_train)
YhatSVD_test = SVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Train Conf. Matrix: 
[[60381   102]
 [ 7589   111]]

Tree Test Conf. Matrix: 
[[29854    72]
 [ 3613    44]]
The percentage of correct training assignments is:  0.8870686241438481
The percentage of correct test assignments is:  0.8911056189143317


## Class weight: balanced

In [29]:
balancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight='balanced').fit(XDT_train, YDT_train)
balancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight='balanced').fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = balancedDT_logit.predict(XDT_train)
YhatDT_test = balancedDT_logit.predict(XDT_test)
YhatSVD_train = balancedSVD_logit.predict(XSVD_train)
YhatSVD_test = balancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[36898 23585]
 [ 3290  4410]]

SVD Test Conf. Matrix: 
[[18125 11801]
 [ 1552  2105]]

Tree Train Conf. Matrix: 
[[42500 17983]
 [ 3670  4030]]

Tree Test Conf. Matrix: 
[[20864  9062]
 [ 1707  1950]]
The percentage of correct training assignments is:  0.6058401654371324
The percentage of correct test assignments is:  0.60238811303338


## Custom class weights

Here, we test a few values to see how the class weights affect the accuracy of our model. We use this information to pick a range to loop through below.

In [30]:
# {0:.1, 1:.9} Quite close to "balanced" since this is about the ratio seen in data
custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.1, 1:.9}).fit(XDT_train, YDT_train)
custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.1, 1:.9}).fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = custombalancedDT_logit.predict(XDT_train)
YhatDT_test = custombalancedDT_logit.predict(XDT_test)
YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[30708 29775]
 [ 2443  5257]]

SVD Test Conf. Matrix: 
[[14993 14933]
 [ 1139  2518]]

Tree Train Conf. Matrix: 
[[36171 24312]
 [ 2807  4893]]

Tree Test Conf. Matrix: 
[[17607 12319]
 [ 1295  2362]]
The percentage of correct training assignments is:  0.5274775237229221
The percentage of correct test assignments is:  0.5214245302682905


In [33]:
# {0:.9, 1:.1}
custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.9, 1:.1}).fit(XDT_train, YDT_train)
custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.9, 1:.1}).fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = custombalancedDT_logit.predict(XDT_train)
YhatDT_test = custombalancedDT_logit.predict(XDT_test)
YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Train Conf. Matrix: 
[[60483     0]
 [ 7695     5]]

Tree Test Conf. Matrix: 
[[29926     0]
 [ 3653     4]]
The percentage of correct training assignments is:  0.8870686241438481
The percentage of correct test assignments is:  0.8911056189143317


In [32]:
# {0:.5, 1:.5}
custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.5, 1:.5}).fit(XDT_train, YDT_train)
custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:.5, 1:.5}).fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = custombalancedDT_logit.predict(XDT_train)
YhatDT_test = custombalancedDT_logit.predict(XDT_test)
YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Train Conf. Matrix: 
[[60381   102]
 [ 7590   110]]

Tree Test Conf. Matrix: 
[[29855    71]
 [ 3613    44]]
The percentage of correct training assignments is:  0.8870686241438481
The percentage of correct test assignments is:  0.8911056189143317


In [38]:
# {0:1, 1:10}
custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:1, 1:10}).fit(XDT_train, YDT_train)
custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8, penalty='none',class_weight={0:1, 1:10}).fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = custombalancedDT_logit.predict(XDT_train)
YhatDT_test = custombalancedDT_logit.predict(XDT_test)
YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);

CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

# Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

SVD Train Conf. Matrix: 
[[25980 34503]
 [ 1884  5816]]

SVD Test Conf. Matrix: 
[[12628 17298]
 [  871  2786]]

Tree Train Conf. Matrix: 
[[30108 30375]
 [ 2144  5556]]

Tree Test Conf. Matrix: 
[[14730 15196]
 [  972  2685]]
The percentage of correct training assignments is:  0.4663332502236628
The percentage of correct test assignments is:  0.45898222314861686


In [39]:
# Loop thru dictionaries

zeroweights = np.linspace(0,5,11)
oneweights = np.linspace(0,10,21)

SVD_TrainAcc = [];
SVD_TestAcc = [];
DT_TrainAcc = [];
DT_TestAcc = [];

for zero in enumerate(zeroweights):
    for one in enumerate(oneweights):
        custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8,class_weight={0:zero[1], 1:one[1]}).fit(XDT_train, YDT_train)
        custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8,class_weight={0:zero[1], 1:one[1]}).fit(XSVD_train, YSVD_train)

        # Use model to make predictions
        YhatDT_train = custombalancedDT_logit.predict(XDT_train)
        YhatDT_test = custombalancedDT_logit.predict(XDT_test)
        YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
        YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

        # Confusion matricies
        CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
        CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);
        CM_DT_Train = confusion_matrix(YDT_train,YhatDT_train);
        CM_DT_Test = confusion_matrix(YDT_test,YhatDT_test);

        # Scores
        SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
        SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
        DT_Train_Correct = sum(np.diag(CM_DT_Train))
        DT_Test_Correct = sum(np.diag(CM_DT_Test))
        
        # Save scores
        SVD_TrainAcc.append(SVD_Train_Correct);
        SVD_TestAcc.append(SVD_Test_Correct);
        DT_TrainAcc.append(DT_Train_Correct);
        DT_TestAcc.append(DT_Test_Correct);


In [45]:
maxSVDTrainCount = max(SVD_TrainAcc)
bestSVDTrainScore= maxSVDTrainCount/len(XSVD_train)
bestindicies=np.where(SVD_TrainAcc==maxSVDTrainCount)

In [41]:
maxSVDTestCount = max(SVD_TestAcc)
bestSVDTestScore= maxSVDTestCount/len(XSVD_test)
bestindicies = np.where(SVD_TestAcc==maxSVDTestCount)

In [42]:
bestindicies

(array([ 23,  46,  69,  92, 115, 138, 161, 184, 207, 230]),)

In [43]:
weights = [];

for zero in enumerate(zeroweights):
    for one in enumerate(oneweights):
        weights.append([zero[1],one[1]])
        
bestindicies= np.reshape(bestindicies, [10,1])
print('The optimal weights are')
for i in range(len(bestindicies)):
    print(weights[int(bestindicies[i])],'\n')

The optimal weights are
[0.5, 1.0] 

[1.0, 2.0] 

[1.5, 3.0] 

[2.0, 4.0] 

[2.5, 5.0] 

[3.0, 6.0] 

[3.5, 7.0] 

[4.0, 8.0] 

[4.5, 9.0] 

[5.0, 10.0] 



In [47]:
maxDTTestCount = max(DT_TestAcc)
bestDTTestScore= maxDTTestCount/len(XDT_test)
bestindicies = np.where(DT_TestAcc==maxDTTestCount)

bestindicies= np.reshape(bestindicies, [10,1])
print('The optimal weights are')
for i in range(len(bestindicies)):
    print(weights[int(bestindicies[i])],'\n')

The optimal weights are
[3.0, 1.0] 

[3.5, 1.0] 

[3.5, 1.5] 

[4.0, 1.0] 

[4.5, 1.0] 

[4.5, 1.5] 

[4.5, 2.0] 

[5.0, 1.0] 

[5.0, 1.5] 

[5.0, 2.0] 



In [49]:
# DT Classweight = {0:4, 1:1}
# SVD Classweight ={0:.5, 1:1}

custombalancedDT_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8,class_weight={0:4, 1:1}).fit(XDT_train, YDT_train)
custombalancedSVD_logit = LogisticRegression(random_state=1, max_iter=10000,tol=1e-8,class_weight={0:.5, 1:1}).fit(XSVD_train, YSVD_train)

# Use model to make predictions
YhatDT_train = custombalancedDT_logit.predict(XDT_train)
YhatDT_test = custombalancedDT_logit.predict(XDT_test)
YhatSVD_train = custombalancedSVD_logit.predict(XSVD_train)
YhatSVD_test = custombalancedSVD_logit.predict(XSVD_test)

# Confusion matricies
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);
CM_DT_Train = confusion_matrix(YDT_train,YhatDT_train);
CM_DT_Test = confusion_matrix(YDT_test,YhatDT_test);

# Scores
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
DT_Train_Correct = sum(np.diag(CM_DT_Train))
DT_Test_Correct = sum(np.diag(CM_DT_Test))

# # Score
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)
print('The percentage of correct SVD training assignments is: ',SVD_Train_Correct/SVD_Train_Total)

SVD_Test_Correct = sum(np.diag(CM_SVD_Test))
SVD_Test_Total = len(XSVD_test)
print('The percentage of correct SVD test assignments is: ',SVD_Test_Correct/SVD_Test_Total)

DT_Train_Correct = sum(np.diag(CM_DT_Train))
DT_Train_Total = len(XDT_train)
print('The percentage of correct DT training assignments is: ',DT_Train_Correct/DT_Train_Total)

DT_Test_Correct = sum(np.diag(CM_DT_Test))
DT_Test_Total = len(XDT_test)
print('The percentage of correct DT test assignments is: ',DT_Test_Correct/DT_Test_Total)

The percentage of correct SVD training assignments is:  0.8869952920816039
The percentage of correct SVD test assignments is:  0.8911353958848227
The percentage of correct DT training assignments is:  0.8871419562060924
The percentage of correct DT test assignments is:  0.8912247267962957


In [21]:
### Stochastic Gradient Descent- tried with combos of penalty/no penalty and classweight didn't change
import sklearn.linear_model
SGDDT_logit = sklearn.linear_model.SGDClassifier(loss='log',random_state=1, max_iter=10000,tol=1e-8,penalty='none', class_weight='balanced').fit(XDT_train, YDT_train)
SGDSVD_logit = sklearn.linear_model.SGDClassifier(loss='log',random_state=1, max_iter=10000,tol=1e-8,penalty='none', class_weight='balanced').fit(XSVD_train, YSVD_train)

SGDYhatDT_train = SGDDT_logit.predict(XDT_train)
SGDYhatDT_test = SGDDT_logit.predict(XDT_test)

SGDYhatSVD_train = SGDSVD_logit.predict(XSVD_train)
SGDYhatSVD_test = SGDSVD_logit.predict(XSVD_test)

CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);

CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')

print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))


SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Train Conf. Matrix: 
[[60418    65]
 [ 7633    67]]

Tree Test Conf. Matrix: 
[[29873    53]
 [ 3630    27]]


# Test accuracy (Older/Scratch)

In [11]:
YhatDT_train = DT_logit.predict(XDT_train)
YhatDT_test = DT_logit.predict(XDT_test)

YhatSVD_train = SVD_logit.predict(XSVD_train)
YhatSVD_test = SVD_logit.predict(XSVD_test)

In [12]:
YhatSVD_test.shape

(33583,)

# Confusion Matricies

In [13]:
CM_SVD_Train = confusion_matrix(YSVD_train,YhatSVD_train);
CM_Tree_Train = confusion_matrix(YDT_train,YhatDT_train);

CM_SVD_Test = confusion_matrix(YSVD_test,YhatSVD_test);
CM_Tree_Test = confusion_matrix(YDT_test,YhatDT_test);

print('SVD Train Conf. Matrix: '+'\n'+str(CM_SVD_Train)+'\n')
print('Tree Train Conf. Matrix: '+'\n'+str(CM_Tree_Train)+'\n')

print('SVD Test Conf. Matrix: '+'\n'+str(CM_SVD_Test)+'\n')
print('Tree Test Conf. Matrix: '+'\n'+str(CM_Tree_Test))

SVD Train Conf. Matrix: 
[[60483     0]
 [ 7700     0]]

Tree Train Conf. Matrix: 
[[60418    65]
 [ 7633    67]]

SVD Test Conf. Matrix: 
[[29926     0]
 [ 3657     0]]

Tree Test Conf. Matrix: 
[[29873    53]
 [ 3630    27]]


In [14]:
SVD_train_total = np.sum(CM_SVD_Train,axis=1)[1];
SVD_train_correct = CM_SVD_Train[1][1];

Tree_train_total = np.sum(CM_Tree_Train,axis=1)[1];
Tree_train_correct = CM_Tree_Train[1][1];

print('SVD Training Data Set:')
print('Total Readmitted Patients: '+str(SVD_train_total))
print('Total Correctly Predicted: '+str(SVD_train_correct)+'\n')
print('Tree Training Data Set:')
print('Total Readmitted Patients: '+str(Tree_train_total))
print('Total Correctly Predicted: '+str(Tree_train_correct))

SVD Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 0

Tree Training Data Set:
Total Readmitted Patients: 7700
Total Correctly Predicted: 67


In [15]:
SVD_test_total = np.sum(CM_SVD_Test,axis=1)[1];
SVD_test_correct = CM_SVD_Test[1][1];

Tree_test_total = np.sum(CM_Tree_Test,axis=1)[1];
Tree_test_correct = CM_Tree_Test[1][1];

print('SVD Test Data Set:')
print('Total Readmitted Patients: '+str(SVD_test_total))
print('Total Correctly Predicted: '+str(SVD_test_correct)+'\n')
print('Tree Test Data Set:')
print('Total Readmitted Patients: '+str(Tree_test_total))
print('Total Correctly Predicted: '+str(Tree_test_correct))

SVD Test Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 0

Tree Test Data Set:
Total Readmitted Patients: 3657
Total Correctly Predicted: 27


From what you say it seems class 0 is 19 times more frequent than class 1. So you should increase the class_weight of class 1 relative to class 0, say {0:.9, 1:.1}.



In [61]:
SVD_Train_Correct = sum(np.diag(CM_SVD_Train))
SVD_Train_Total = len(XSVD_train)

print('The percentage of correct assignments is: ',SVD_Train_Correct/SVD_Train_Total)

The percentage of correct assignments is:  0.5275361893727175


In [57]:
sum(sum(CM_SVD_Train))

68183

In [58]:
len(XSVD_train)

68183