In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn import tree
from scipy.sparse import csr_matrix

## Load the data

In [2]:
X=np.load('dataset_diabetes/X_array.npy',allow_pickle=True)
readmission=np.load('dataset_diabetes/Y.npy',allow_pickle=True)
encounter_id=np.load('dataset_diabetes/encounter_id.npy',allow_pickle=True)
patient_nbr=np.load('dataset_diabetes/patient_nbr.npy',allow_pickle=True)

## One Hot encoding and Convert readmission to binary Y

In [3]:
my_list = np.where(readmission == 'NO', 0, readmission)
my_list2 = np.where(my_list == '>30', 0, my_list)
Y0 = np.where(my_list2 == '<30', 1, my_list2)
Y=list(Y0)

In [4]:
#convert to binary
binarydat=OneHotEncoder().fit_transform(X)
print(binarydat.shape)
binarydat_unpacked=csr_matrix(binarydat).toarray()
#split into train and test
X_train, X_test, y_train, y_test = train_test_split(binarydat_unpacked, Y, test_size=0.33, random_state=1)

(101766, 468)


# Decision tree parameter testing

In [5]:
#do some classifyin'
testnums1= [125,130,135]
testnums2= [7,8,9]
testnums3= [4,5,6]
testnums4= [35,40,45]
testnums5= [0]
scoretemp=0
for n in testnums1:
    for m in testnums2:
        for o in testnums3:
            for p in testnums4:
                for q in testnums5:
                    clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=m,max_features=n,min_samples_leaf=o,max_leaf_nodes=p,min_impurity_decrease=q,random_state=1)
                    clf = clf.fit(X_train, y_train)
                    print('Score for max_features='+str(n)+' and max_depth='+str(m)+' is: ')
                    print(clf.score(X_test, y_test))
                    scoretemp1=clf.score(X_test, y_test)
                    if scoretemp1>scoretemp:
                        scoretemp=scoretemp1
                        best_max_depth=m
                        best_max_features=n
                        best_splitter=o
                        best_min_weight_fraction_leaf=p
                        best_min_impurity_decrease=q
print('Best score: '+str(scoretemp))
print('Best score max_features: '+str(best_max_features))
print('Best max_depth: '+str(best_max_depth))
print('Best min_samples_leaf: '+str(best_splitter))
print('Best max_leaf_nodes: '+str(best_min_weight_fraction_leaf))
print('Best min_impurity_decrease: '+str(best_min_impurity_decrease))

Score for max_features=125 and max_depth=7 is: 
0.8908078492094215
Score for max_features=125 and max_depth=7 is: 
0.8907780722389305
Score for max_features=125 and max_depth=7 is: 
0.8904505255635292
Score for max_features=125 and max_depth=7 is: 
0.8908376261799125
Score for max_features=125 and max_depth=7 is: 
0.8906887413274573
Score for max_features=125 and max_depth=7 is: 
0.8906887413274573
Score for max_features=125 and max_depth=7 is: 
0.8908376261799125
Score for max_features=125 and max_depth=7 is: 
0.8906887413274573
Score for max_features=125 and max_depth=7 is: 
0.8907780722389305
Score for max_features=125 and max_depth=8 is: 
0.8908078492094215
Score for max_features=125 and max_depth=8 is: 
0.8908078492094215
Score for max_features=125 and max_depth=8 is: 
0.8907185182979483
Score for max_features=125 and max_depth=8 is: 
0.8907482952684393
Score for max_features=125 and max_depth=8 is: 
0.8907482952684393
Score for max_features=125 and max_depth=8 is: 
0.890748295268

In [6]:
# # # # BESTS
# Best score: 0.8914331655897328
# Best score max_features: 130
# Best max_depth: 8
# Best min_samples_leaf: 5
# Best max_leaf_nodes: 40
y_test.count(0)/len(y_test)

0.8911056189143317

# Train decision tree on all data using 'best' parameters tested above

In [7]:
clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=best_max_depth,max_features=best_max_features,min_samples_leaf=best_splitter,max_leaf_nodes=best_min_weight_fraction_leaf,min_impurity_decrease=best_min_impurity_decrease,random_state=1)
clf = clf.fit(binarydat_unpacked, Y)
len(clf.feature_importances_)

468

## View unique feature importances (most will be zero)

In [8]:
np.unique(clf.feature_importances_)

array([0.        , 0.00228991, 0.00235119, 0.002364  , 0.00268085,
       0.00337474, 0.00337958, 0.00347729, 0.00392328, 0.00418606,
       0.0044936 , 0.00486669, 0.00502359, 0.00519919, 0.00559715,
       0.00755919, 0.02798941, 0.02806302, 0.03771992, 0.03894524,
       0.03933721, 0.04854153, 0.05197158, 0.05514075, 0.0734865 ,
       0.08481937, 0.11481298, 0.13066357, 0.20774266])

## Delete features whose importance is ZERO

In [9]:
importance=clf.feature_importances_
importance_na = np.where(importance <0.0025, 'NA', importance)
# to further reduce features you can do a line similar to the above but instead of saying... 
# importance ==0
# you could say importance <0.01 or whatever threshold you want 

In [10]:
#checking something...
Y.count(0)/len(Y)

0.8884008411453728

In [11]:
NA_added=np.vstack([binarydat_unpacked,importance_na])
index = np.argwhere(NA_added=='NA')
x2 = np.delete(binarydat_unpacked, index,axis=1)
x2.shape

  x2 = np.delete(binarydat_unpacked, index,axis=1)


(101766, 25)

In [12]:
binarydat_unpacked.shape

(101766, 468)

# Save Data

In [13]:
np.save('dataset_diabetes/tree_selected_25_X.npy',x2)