In [None]:
# Import necessary libraries and tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / scikit-learn. Consider
# narrowing the filter to the specific warning categories that are noisy.
warnings.filterwarnings('ignore')

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('heart.csv')

In [None]:
# Have a first look at the data: display the first rows of the frame
df.head()

In [None]:
# Display the last rows of the frame as well
df.tail()

In [None]:
# Summary statistics of the numeric columns
df.describe()

### Classification of data types
Before training anything, we should sort the columns into two groups: `categorical_val` for columns with fewer than 10 unique values (e.g. sex) and `continuous_val` for all the remaining columns (e.g. age).

In [None]:
# Please fill your answer in '...'
# Sort each column into one of the two lists below, depending on how many
# distinct values it holds (see the description above this cell).
categorical_val = []  # collects the columns treated as categorical
continuous_val = []   # collects the columns treated as continuous
for col in ...:  # TODO: what to iterate over
    if ...:  # TODO: condition that marks `col` as categorical
        ...
    else:
        ...

### Create dummies and scale data
After exploring the dataset, we need to convert some categorical variables into dummy variables and scale all the values before training the models.

In [None]:
'''
Create dummies
'''
# Please fill your answer in '...'
# 'target' is the label, so it must be excluded from the columns to encode.
# The list is rebuilt instead of calling .remove('target') in place: a second
# run of this cell would otherwise raise ValueError because the entry is
# already gone, which breaks re-running the cell without restarting the kernel.
categorical_val = [col for col in categorical_val if col != 'target']
dataset = pd.get_dummies(..., columns=...)
dataset.head()

In [None]:
'''
Scale the values
- Set the array of columns to scale.
'''
# Please fill your answer in '...'
from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
col_to_scale = ...  # TODO: the columns of `dataset` that should be standardised
# fit_transform fits the scaler on the selected columns of `dataset` and
# overwrites those columns with the scaled values.
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test
# split performed further down, so test rows influence the scaling statistics.
# Consider fitting on the training split only.
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])
dataset.head()

#### Define a function to print the accuracy score and classification report

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_train, y_train, x_test, y_test, train, show_confusion_matrix=False):
    """Print the accuracy and classification report of a fitted classifier.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.
    train
        If truthy, the classifier is evaluated on the training split,
        otherwise on the test split. (Previously a value that was neither
        ``True`` nor ``False`` silently printed nothing.)
    show_confusion_matrix : bool, default False
        When True, the confusion matrix is printed after the report.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Pick the split once so both cases share a single code path.
    features, labels = (x_train, y_train) if train else (x_test, y_test)
    separator = '______________________________________________________________________'

    pred = clf.predict(features)
    # output_dict=True lets the report be rendered as a DataFrame
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f'Accuracy Score: {accuracy_score(labels, pred) * 100:.4f}%')
    print(separator)
    print(f'Classification Report:\n{clf_report}')
    if show_confusion_matrix:
        print(separator)
        print(f'Confusion Matrix: \n {confusion_matrix(labels, pred)}\n')

### Dataset splitting
Split the dataset into training (70%) and test set (30%)

In [None]:
'''
Use train_test_split(features, target, test_size=..., random_state=...)
'''
# Please fill your answer in '...'
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the 'target' column.
x = dataset.drop('target', axis=1)
y = dataset.target
# TODO: pass the features, the target, the test fraction and a fixed seed
x_train, x_test, y_train, y_test = train_test_split(..., ..., test_size=..., random_state=...)

## Decision Tree
Decision Trees are a non-parametric supervised learning method used for both classification and regression tasks. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. Let's look at a simple example:

| ![](https://miro.medium.com/max/450/1*XMId5sJqPtm8-RIwVVz2tg.png) |
|:--:|
| <b>Example of decision tree.</b> |

In the image, the bold text represents **internal nodes** (a.k.a. nodes or conditions). The **edges/branches** then split off from these nodes according to their conditions. A node that cannot be split any further is called a **decision** (or **leaf**).

### Background algorithm
In order to decide which features to choose and what conditions to use for splitting, along with knowing when to stop, we need to calculate how much *accuracy* each split (branch) will cost us, using a so-called cost function. **The split with the lowest cost is chosen**. This algorithm is also known as a **greedy algorithm**, because at every step it picks the split that lowers the cost the most right now, without looking ahead. This makes the root node the best predictor (for regression problems) or classifier (for classification problems). In both cases the cost functions try to find the most homogeneous branches, i.e. branches whose groups have similar responses.
- For the regression problem, we can use a simple squared error:

$cost=\sum(y-pred)^2=\sum(y-\hat{y})^2$

where $y$ is the ground truth and $\hat{y}$ is the predicted value.

- For the classification problem, we use the *Gini index function*:

$cost=\sum(p_k(1-p_k))$

where $p_k$ is the proportion of training instances of class k in a particular prediction node. What we want here is that a particular decision node should *ideally* have an error value of 0, which means that each split outputs a single class 100% of the time.
How much a split moves us towards such single-class outputs is measured by its *information gain*. In short, we gain no information if we choose a split where each output still has a mix of classes. On the other hand, if each output of our split is dominated by a single class, then we have gained the information that splitting in that particular way on that particular feature variable gives us a particular output. The information gain can be understood intuitively from the image below:

| ![](https://miro.medium.com/max/750/1*z7tK94rGGIy_42UpiqilLQ.png) |
|:--:|
| <b>Intuition of information gain</b> |

Left unchecked, a decision tree can grow very large, and many of its splits end up being redundant and unnecessary for increasing the accuracy of our model. Thus, we need some stopping criterion to halt the construction of the tree. A common solution is **tree pruning**: a technique that leverages this splitting redundancy to remove, i.e. *prune*, the unnecessary splits in our tree.

| ![](https://miro.medium.com/max/848/1*TxzPx2UmUdhKieWruQ1prA.png) |
|:--:|
| <b>Tree pruning</b> |

## Build a decision tree model

In [None]:
'''
Building a decision tree model with scikit-learn
- set random_state to 42
- fit the training data
'''
# Please fill your answer in '...'
from sklearn.tree import DecisionTreeClassifier

# TODO: create the classifier with random_state=42 and fit it on x_train / y_train
dt_clf = ...

In [None]:
# Scores of the untuned tree on the training split
print_score(dt_clf, x_train, y_train, x_test, y_test, train=True)

In [None]:
# Scores of the untuned tree on the test split
print_score(dt_clf, x_train, y_train, x_test, y_test, train=False)

## Improve model with hyperparameters

In [None]:
'''
Try several hyperparameter values to test their effect on a new decision tree model:
- criterion: 'gini' and 'entropy'
- splitter: 'best' and 'random'
- max_depth: list of numbers from 1 to 20
- min_samples_split: array of 2, 3, 4
- min_samples_leaf: list of numbers from 1 to 20
'''
# Please fill your answer in '...'
from sklearn.model_selection import GridSearchCV

params = {...}
# Hint for the estimator to search over:
# dt_clf = DecisionTreeClassifier(random_state=42)
# The search scores every parameter combination by accuracy with 3-fold
# cross-validation (cv=3) and runs on all available cores (n_jobs=-1).
# The grid described above contains thousands of combinations, so this
# cell can take a while.
dt_cv = GridSearchCV(..., ..., scoring='accuracy', n_jobs=-1, verbose=1, cv=3).fit(x_train, y_train)
best_params = dt_cv.best_params_
print(f'Best_params: {best_params}')

In [None]:
# Apply best parameters to the decision tree model.
# The tree is seeded explicitly: best_params holds only the hyperparameters that
# were searched, and an unseeded DecisionTreeClassifier is not deterministic
# (features are randomly permuted at each split, and splitter='random' may be
# the selected value), so the scores reported below would change between runs.
# A random_state coming from the search itself, if any, takes precedence.
dt_clf_tuning = DecisionTreeClassifier(**{'random_state': 42, **best_params}).fit(x_train, y_train)

In [None]:
# Scores of the tuned tree on the training split
print_score(dt_clf_tuning, x_train, y_train, x_test, y_test, train=True)

In [None]:
# Scores of the tuned tree on the test split
print_score(dt_clf_tuning, x_train, y_train, x_test, y_test, train=False)

In [None]:
'''
Summarize the accuracy score of decision tree algorithm with and without hyperparameters.
'''
# Please fill your answer in '...'
dt_clf_train = accuracy_score(y_train, dt_clf.predict(x_train)) * 100
dt_clf_test = accuracy_score(y_test, dt_clf.predict(x_test)) * 100
dt_clf_tuning_train = ...
dt_clf_tuning_test = ...

# One definition of the summary columns, shared by both frames below.
result_columns = ['Model', 'Non-tuning train accuracy %', 'Tuning train accuracy %',
                  'Non-tuning test accuracy %', 'Tuning test accuracy %']
result = pd.DataFrame(columns=result_columns)
dt_result = pd.DataFrame(data=[['Decision Tree', ..., ..., ..., ...]],
                         columns=result_columns)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack the rows.
result = pd.concat([result, dt_result], ignore_index=True)
result