In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn import tree

## Load the data

In [2]:
# Directory that holds the pre-processed arrays. Every path below is built from
# this single constant, so relocating the data means editing one line.
DATA_DIR = 'dataset_diabetes'

# NOTE(security): allow_pickle=True makes np.load unpickle object arrays, and
# unpickling can execute arbitrary code. Only load files from a trusted source.
X = np.load(DATA_DIR + '/X_array.npy', allow_pickle=True)                 # feature matrix
readmission = np.load(DATA_DIR + '/Y.npy', allow_pickle=True)             # raw readmission labels
encounter_id = np.load(DATA_DIR + '/encounter_id.npy', allow_pickle=True)
patient_nbr = np.load(DATA_DIR + '/patient_nbr.npy', allow_pickle=True)

## One-hot encode the features and convert readmission to a binary Y

In [3]:
# Binary target: 1 for the '<30' readmission label, 0 for everything else
# (the original mapping sent both 'NO' and '>30' to 0).
# A single comparison replaces three chained np.where passes. Those passes mixed
# the integer 0/1 with the original string values, so the resulting label type
# depended on the dtype of the loaded array; this form always yields integers.
Y0 = (readmission == '<30').astype(int)

# Kept as a plain Python list, as before, for the cells that consume Y.
Y = Y0.tolist()

In [4]:
# One-hot encode every column of X into indicator features.
encoder = OneHotEncoder()
binarydat = encoder.fit_transform(X)
print(binarydat.shape)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    binarydat,
    Y,
    test_size=0.33,
    random_state=1,
)

(101766, 468)


# Decision tree parameter testing

In [5]:
# Small grid search over the tree's max_depth and max_features. Each candidate
# is fitted on the training split and scored on the held-out split.
# NOTE(review): the winning parameters are picked on the same X_test/y_test
# split whose score is then reported as "Best score", so that number is
# optimistically biased. Consider a separate validation split or
# cross-validation for the selection step.
testnums2=[7,14,21]        # candidate max_depth values
testnums1=[100,150,200]    # candidate max_features values
scoretemp=0                # best score seen so far
# Initialised up front so the summary prints below cannot hit a NameError if no
# candidate beats the starting score.
best_max_depth=None
best_max_features=None
for n in testnums1:
    for m in testnums2:
        clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=m,max_features=n,random_state=1)
        clf = clf.fit(X_train, y_train)
        # Score once and reuse the value for both the printout and the
        # comparison (the held-out set was previously scored twice per fit).
        scoretemp1=clf.score(X_test, y_test)
        print('Score for max_features='+str(n)+' and max_depth='+str(m)+' is: ')
        print(scoretemp1)
        # Strict '>' keeps the first candidate encountered when scores tie.
        if scoretemp1>scoretemp:
            scoretemp=scoretemp1
            best_max_depth=m
            best_max_features=n
print('Best score: '+str(scoretemp))
print('Best max_depth: '+str(best_max_depth))
print('Best score max_features: '+str(best_max_features))

Score for max_features=100 and max_depth=7 is: 
0.8905398564750022
Score for max_features=100 and max_depth=14 is: 
0.8860435339308579
Score for max_features=100 and max_depth=21 is: 
0.8752047166721257
Score for max_features=150 and max_depth=7 is: 
0.8908971801208945
Score for max_features=150 and max_depth=14 is: 
0.8871750588095167
Score for max_features=150 and max_depth=21 is: 
0.8790161688949766
Score for max_features=200 and max_depth=7 is: 
0.8907780722389305
Score for max_features=200 and max_depth=14 is: 
0.8850013399636721
Score for max_features=200 and max_depth=21 is: 
0.8735669832951195
Best score: 0.8908971801208945
Best max_depth: 7
Best score max_features: 150


# Train decision tree on all data using 'best' parameters tested above

In [6]:
# Refit on the full encoded dataset using the parameters chosen by the search
# cell above (max_depth=7, max_features=150 in the recorded run). Reading them
# from best_max_depth / best_max_features, rather than hard-coding the numbers,
# keeps this model in step with the search if the grid or the data changes.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=best_max_depth,
    max_features=best_max_features,
    random_state=1,
)
clf = clf.fit(binarydat, Y)

# One importance value per one-hot encoded column.
len(clf.feature_importances_)

468

## View unique feature importances (most will be zero)

In [7]:
# Distinct importance values of the fitted tree; np.unique returns them sorted
# in ascending order.
unique_importances = np.unique(clf.feature_importances_)
unique_importances

array([0.        , 0.00074226, 0.00078891, 0.00079485, 0.00135004,
       0.00138169, 0.00143278, 0.00147069, 0.00168874, 0.00180788,
       0.00183164, 0.00186571, 0.00189319, 0.00211249, 0.00211617,
       0.00215589, 0.00215697, 0.00222717, 0.00242781, 0.00252509,
       0.00262002, 0.00264797, 0.00290677, 0.00296329, 0.00340416,
       0.00352472, 0.00377954, 0.00422767, 0.00433208, 0.00512859,
       0.00533508, 0.00709424, 0.00736306, 0.00764933, 0.00825355,
       0.00925191, 0.0107805 , 0.010832  , 0.02125533, 0.08330762,
       0.10384427, 0.10493506, 0.10601352, 0.10943057, 0.33634921])

In [8]:
# Bind the fitted tree's per-feature importance vector to a shorter name.
importance=clf.feature_importances_

In [9]:
# Display the full importance vector (one entry per one-hot encoded column).
importance

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00422767,
       0.        , 0.        , 0.00078891, 0.00764933, 0.        ,
       0.        , 0.10384427, 0.        , 0.0107805 , 0.        ,
       0.        , 0.        , 0.00138169, 0.        , 0.        ,
       0.        , 0.10493506, 0.        , 0.        , 0.00211617,
       0.00340416, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.10943057, 0.        , 0.        , 0.        ,
       0.        , 0.010832  , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00433208,
       0.00079485, 0.        , 0.00296329, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     