# Decision Tree - Heart Disease

[ch2-decision-trees.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch2-decision-trees.ipynb)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# df_heart = pd.read_csv('heart_disease.csv')
df_heart = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter02/heart_disease.csv')

df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
df_heart.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
df_heart.isna().sum().sum()

0

### Split Train/Test Dataset

In [6]:
X = df_heart.iloc[:,:-1]
y = df_heart.iloc[:,-1]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [7]:
X_train.shape

(227, 13)

### Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=2)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)

print('Accuracy:', np.round(scores, 2))
print('Avg. Accuracy: %0.2f' % (scores.mean()))

Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Avg. Accuracy: 0.76


In [9]:
params = model.get_params(deep=True)
print(params)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 2, 'splitter': 'best'}


In [10]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

model.score(X_test, y_test)

0.8552631578947368

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred = y_pred))

leaf_node_count = 0
tree = model.tree_
for i in range(tree.node_count):
    if (tree.children_left[i] == -1) and (tree.children_right[i] == -1):
        leaf_node_count += 1
print('# of Leaf Nodes:', leaf_node_count)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85        36
           1       0.87      0.85      0.86        40

    accuracy                           0.86        76
   macro avg       0.85      0.86      0.86        76
weighted avg       0.86      0.86      0.86        76

# of Leaf Nodes: 39


#### RandomizedSearchCV

In [12]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=-1, random_state=2)
    
    rand_clf.fit(X_train, y_train)

    best_model = rand_clf.best_estimator_    
    
    best_score = rand_clf.best_score_
    print("Best Score: {:.3f}".format(best_score))

    y_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.3f}'.format(accuracy))
        
    return best_model

### Hyperparameter

In [13]:
from sklearn.metrics import accuracy_score

randomized_search_clf(
    params={
        'criterion':['entropy', 'gini'],
        'splitter':['random', 'best'],
        'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
        'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
        'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        'max_features':['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
        'max_depth':[None, 2,4,6,8],
        'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]
    })

Best Score: 0.798
Accuracy: 0.855


DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,
                       max_leaf_nodes=45, min_samples_leaf=0.04,
                       min_samples_split=10, min_weight_fraction_leaf=0.05,
                       random_state=2)

### 탐색 범위 좁히기 

In [14]:
best_model = randomized_search_clf(
    params={'max_depth':[None, 6, 7],
            'max_features':['sqrt', 0.78],
            'max_leaf_nodes':[45, None],
            'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],
            'min_samples_split':[2, 9, 10],
            'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
            },
    runs=100)

Best Score: 0.802
Accuracy: 0.868


In [15]:
scores = cross_val_score(best_model, X, y, cv=5)

print('Accuracy:', np.round(scores, 2))

print('Avg. Accuracy: %0.2f' % (scores.mean()))

Accuracy: [0.82 0.9  0.8  0.8  0.78]
Avg. Accuracy: 0.82


In [16]:
best_model.feature_importances_

array([0.08460561, 0.08801669, 0.42204155, 0.        , 0.03897566,
       0.        , 0.04491867, 0.01925798, 0.        , 0.00618622,
       0.        , 0.1829161 , 0.11308151])

### Verify the new hyperparameters

In [17]:
params = best_model.get_params(deep=True)
print(params)

max_depth = params['max_depth']
max_features = params['max_features']
max_leaf_nodes = params['max_leaf_nodes']
min_samples_leaf = params['min_samples_leaf']
min_samples_split = params['min_samples_split']
min_weight_fraction_leaf = params['min_weight_fraction_leaf']
random_state = params['random_state']

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 0.78, 'max_leaf_nodes': 45, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 0.045, 'min_samples_split': 9, 'min_weight_fraction_leaf': 0.06, 'random_state': 2, 'splitter': 'best'}


In [18]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, max_leaf_nodes=max_leaf_nodes,
                       min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split,
                       min_weight_fraction_leaf=min_weight_fraction_leaf, random_state=random_state)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.3f}'.format(accuracy))

leaf_node_count = 0
tree = model.tree_
for i in range(tree.node_count):
    if (tree.children_left[i] == -1) and (tree.children_right[i] == -1):
        leaf_node_count += 1
print('# of Leaf Nodes:', leaf_node_count)

Accuracy: 0.868
# of Leaf Nodes: 12


### `loguniform`을 사용한 랜덤 서치

In [19]:
from sklearn.utils.fixes import loguniform
from scipy.stats import randint

params = {'max_depth': randint(1,100),
          'max_leaf_nodes': randint(1,100),
          'max_features': loguniform(1e-5,1),
          'min_samples_split': loguniform(1e-5,1),
          'min_samples_leaf': loguniform(1e-5,1),
          'min_impurity_decrease': loguniform(1e-5,1),
          'min_weight_fraction_leaf': loguniform(1e-5,1)}

dtc = DecisionTreeClassifier(random_state=0)
rs = RandomizedSearchCV(dtc, params, n_iter=100, n_jobs=-1, random_state=0)
rs.fit(X_train, y_train)

print('Best Cross Validation Function:', rs.best_score_)
print('Best paramer:', rs.best_params_)

Best Cross Validation Function: 0.7932367149758455
Best paramer: {'max_depth': 48, 'max_features': 0.43091880545542754, 'max_leaf_nodes': 68, 'min_impurity_decrease': 0.0013196080073784372, 'min_samples_leaf': 0.005935250363740932, 'min_samples_split': 0.00027243167437771866, 'min_weight_fraction_leaf': 0.03410935690756259}


 0.69149758 0.70086957        nan 0.70077295 0.55072464 0.55072464
 0.71806763 0.74425121 0.55072464 0.74463768 0.71806763 0.71806763
 0.69198068 0.71806763 0.70067633 0.76222222 0.69198068 0.71806763
 0.71806763        nan        nan 0.71816425 0.55072464        nan
 0.70048309 0.55072464 0.55072464 0.71806763 0.72705314 0.71806763
 0.6789372  0.68743961 0.71806763 0.70067633 0.55072464 0.72695652
 0.70057971 0.71806763 0.71806763 0.55072464        nan 0.55072464
 0.68743961 0.71806763 0.71806763 0.67014493 0.70937198 0.71806763
 0.70937198 0.55072464 0.71806763 0.71806763 0.687343   0.71806763
 0.55072464        nan 0.55072464 0.72695652 0.6963285  0.67845411
        nan        nan 0.70492754 0.79323671 0.55072464        nan
 0.55072464 0.77536232 0.71806763 0.73971014 0.70492754 0.68763285
 0.71806763 0.55072464        nan 0.70937198 0.71806763 0.55072464
        nan 0.6963285  0.55072464 0.71806763 0.55072464 0.73574879
 0.70927536        nan 0.55072464 0.55072464 0.69188406 0.7180

### 특성 중요도

In [20]:
best_model.fit(X, y)

DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

In [21]:
# feature_importances
feature_dict = dict(zip(X.columns, best_model.feature_importances_))

import operator

sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[0:3]

[('cp', 0.4840958610240171),
 ('thal', 0.20494445570568706),
 ('ca', 0.18069065321397942)]

In [22]:
from sklearn.inspection import permutation_importance

result = permutation_importance(best_model, X, y, n_jobs=-1, random_state=0)

feature_dict = dict(zip(X.columns, result.importances_mean))
sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[0:3]

[('cp', 0.08976897689768981),
 ('thal', 0.08382838283828387),
 ('ca', 0.05940594059405944)]