# Introduction

In [None]:
"""
What? Decision tree classification applied to heart disease

Corey Wade, Hands-On Gradient Boosting with XGBoost and scikit-learn
https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn
"""

# Import libraries

In [15]:
import operator
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Problem goal

In [None]:
"""
You have been asked by a hospital to use machine learning to predict heart disease. Your job is to develop a 
model and highlight two to three important features that doctors and nurses can focus on to improve patient 
health.

You decide to use a decision tree classifier with fine-tuned hyperparameters. After the model has been built, 
you will interpret results using feature_importances_, an attribute that determines the most important features
in predicting heart disease.
"""

# Load dataset

In [2]:
# Read the heart-disease CSV into a DataFrame
heart_csv_path = '../DATASETS/heart_disease.csv'
df_heart = pd.read_csv(heart_csv_path)

# Preview the first five rows
df_heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Split data

In [4]:
# Features are every column except the last; the target is the last column
X, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]

# Hold out a test set (default split proportions, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

# Get the baseline accuracy with CV

In [None]:
"""
This is the baseline because no tuning has been performed yet.
Cross-validation scores the model on each fold; the mean of those scores is a more reliable estimate of accuracy than a single train/test split.
"""

In [6]:
# Untuned decision tree, seeded so the result is repeatable
model = DecisionTreeClassifier(random_state=2)

# Score the tree with 5-fold cross-validation over all of X and y
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

# Report the per-fold accuracy followed by its mean
print('Accuracy:', scores.round(2))
print('Accuracy mean: %0.2f' % np.mean(scores))

Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


# Hyperparameters tuning

In [None]:
"""
The initial accuracy is 76%. Let's see what gains can be made with hyperparameter fine-tuning.
"""

In [8]:
def randomized_search_clf(params, runs=20, clf=None):
    """Tune a classifier with a randomized hyperparameter search and report its scores.

    The search is fitted on the notebook-level ``X_train``/``y_train`` and the
    best estimator it finds is then scored on ``X_test``/``y_test``.

    Parameters
    ----------
    params : dict
        Maps hyperparameter names to the candidate values to sample from.
    runs : int, default=20
        Number of parameter settings to sample (passed on as ``n_iter``).
    clf : estimator, optional
        Classifier to tune. When omitted, a fresh
        ``DecisionTreeClassifier(random_state=2)`` is created on every call,
        so no estimator instance is shared between calls.

    Returns
    -------
    estimator
        The best estimator found by the search.
    """
    # Build the default estimator per call instead of once at definition time
    if clf is None:
        clf = DecisionTreeClassifier(random_state=2)

    # Instantiate RandomizedSearchCV as rand_clf (5-fold CV, all cores, seeded)
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs,
                                  cv=5, n_jobs=-1, random_state=2)

    # Fit rand_clf on X_train and y_train
    rand_clf.fit(X_train, y_train)

    # Extract best estimator
    best_model = rand_clf.best_estimator_

    # Extract best score: the mean cross-validated score of the best estimator
    best_score = rand_clf.best_score_

    # Print best score (label kept as "Training score" to match recorded output)
    print("Training score: {:.3f}".format(best_score))

    # Predict test set labels
    y_pred = best_model.predict(X_test)

    # Compute accuracy on the held-out test set
    accuracy = accuracy_score(y_test, y_pred)

    # Print accuracy
    print('Test score: {:.3f}'.format(accuracy))

    # Return best model
    return best_model

In [None]:
"""
We create a dictionary that gives, for each hyperparameter, the range of values we want to try.
"""

In [9]:
# Wide search space. Each hyperparameter is listed exactly once: a repeated key
# in a dict literal silently replaces the earlier entry, so only the longer
# 'min_weight_fraction_leaf' list (the one that was actually in effect) is kept.
# 'sqrt' replaces 'auto' for max_features: for tree classifiers 'auto' meant
# sqrt(n_features), and newer scikit-learn releases no longer accept 'auto'.
randomized_search_clf(params={'criterion':['entropy', 'gini'],
                              'splitter':['random', 'best'],
                              'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
                              'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
                              'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
                              'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
                              'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
                              'max_features':['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
                              'max_depth':[None, 2, 4, 6, 8],
                             })

Training score: 0.797
Test score: 0.855


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=0.8, max_leaf_nodes=45, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=0.04,
            min_samples_split=10, min_weight_fraction_leaf=0.05,
            presort=False, random_state=2, splitter='best')

In [None]:
"""
This is a definite improvement, and the model generalizes well on the test set. Let's see if we can do 
better by narrowing the range. Another strategy is to stop checking hyperparameters whose defaults are 
working fine or whose changes have only a small effect.
"""

In [10]:
# Narrowed search space around the previous best settings, sampled 100 times.
# 'sqrt' replaces 'auto' for max_features: for tree classifiers 'auto' meant
# sqrt(n_features), and newer scikit-learn releases no longer accept 'auto'.
randomized_search_clf(params={'max_depth':[None, 6, 7],
                              'max_features':['sqrt', 0.78],
                              'max_leaf_nodes':[45, None],
                              'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],
                              'min_samples_split':[2, 9, 10],
                              'min_weight_fraction_leaf':[0.0, 0.05, 0.06, 0.07],
                             },
                      runs=100)

Training score: 0.802
Test score: 0.868


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=0.78, max_leaf_nodes=45,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=0.045, min_samples_split=9,
            min_weight_fraction_leaf=0.06, presort=False, random_state=2,
            splitter='best')

In [None]:
"""
This model scores higher on both the cross-validated training data and the test set. However, for a proper 
comparison with the baseline, it's essential to evaluate the new model with cross_val_score as well.
"""

In [23]:
# Initialize Decision Tree Classifier with the tuned hyperparameters.
# min_impurity_split and presort are intentionally not passed: they were only
# ever given as None/False here, and newer scikit-learn releases reject them.
model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=0.78, max_leaf_nodes=45,
            min_impurity_decrease=0.0,
            min_samples_leaf=0.045, min_samples_split=9,
            min_weight_fraction_leaf=0.06, random_state=2,
            splitter='best')

# Obtain scores of cross-validation on the -->>WHOLE<<-- data set
# (same setup as the untuned baseline, so the two means are comparable)
scores = cross_val_score(model, X, y, cv=5)

# Display accuracy
print('Accuracy:', np.round(scores, 2))

# Display mean accuracy
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.82 0.9  0.8  0.8  0.78]
Accuracy mean: 0.82


In [None]:
"""
This is six percentage points higher than the default model. When it comes to predicting heart disease, 
more accuracy can save lives.
"""

# Features importance

In [None]:
"""
When testing, it's important not to mix and match training and test sets. After a final model has been 
selected, however, fitting the model on the entire dataset can be beneficial. Why? Because the final model 
will be used on data that it has never seen, and fitting it on the entire dataset may lead to additional 
gains in accuracy.
"""

In [11]:
# Final model: the tuned hyperparameters, fitted on the entire dataset.
# min_impurity_split and presort are intentionally not passed: they were only
# ever given as None/False here, and newer scikit-learn releases reject them.
best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=0.78, max_leaf_nodes=45,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06,
                       random_state=2, splitter='best')
best_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=0.78, max_leaf_nodes=45,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=0.045, min_samples_split=9,
            min_weight_fraction_leaf=0.06, presort=False, random_state=2,
            splitter='best')

In [12]:
# Show the fitted tree's importance score for each feature (one value per column of X)
best_clf.feature_importances_

array([0.04826754, 0.04081653, 0.48409586, 0.00568635, 0.        ,
       0.        , 0.        , 0.00859483, 0.        , 0.02690379,
       0.        , 0.18069065, 0.20494446])

In [None]:
"""
It's not easy to interpret these results. The following code zips the column names with their importance 
scores into a dictionary, then sorts the pairs in descending order of importance and displays the top three
"""

In [16]:
# Pair every column name with its importance score
feature_dict = dict(zip(X.columns, best_clf.feature_importances_))

# Rank the (name, score) pairs from most to least important and show the top three
sorted(feature_dict.items(), key=lambda pair: pair[1], reverse=True)[:3]

[('cp', 0.4840958610240171),
 ('thal', 0.20494445570568706),
 ('ca', 0.18069065321397942)]

In [None]:
"""
These numbers are impurity-based importances that sum to 1, so 'cp' accounts for about 48% of the tree's 
total impurity reduction, which is more than 'thal' and 'ca' combined.
"""