In [None]:
# Import necessary libraries and tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings from pandas / scikit-learn that announce removed APIs.
warnings.filterwarnings('ignore')

# Load the dataset; 'heart.csv' is resolved relative to the current working directory.
df = pd.read_csv('heart.csv')

In [None]:
# Have a first look at the data: display the first rows of the DataFrame
df.head()

In [None]:
# Display the last rows of the DataFrame
df.tail()

In [None]:
# Display summary statistics for the DataFrame
df.describe()

### Classification of data types
Before training, we split the columns into two groups: `categorical_val` for columns with fewer than 10 unique values (e.g. sex, target) and `continuous_val` for all the remaining columns (e.g. age).

In [None]:
# Please fill your answer in '...'
# Goal: sort each column name into one of the two lists below, depending on how
# many distinct values the column holds (fewer than 10 -> categorical).
# Note: a later cell calls categorical_val.remove('target'), so 'target' is
# expected to end up in categorical_val.
categorical_val = []  # names of columns with few distinct values
continuous_val = []  # names of all remaining columns
for col in ...:  # iterate over the column names
    if ...:  # condition deciding whether `col` counts as categorical
        ...  # record `col` as categorical
    else:
        ...  # record `col` as continuous

### Create dummies and scale data
After exploring the dataset, we need to convert some categorical variables into dummy variables and scale all the values before training the models.

In [None]:
'''
Create dummies
'''
# Please fill your answer in '...'
# 'target' is the label column (it is split off as `y` later), so it must not be
# one-hot encoded. The membership check keeps this cell re-runnable: an
# unconditional list.remove() raises ValueError once 'target' is already gone.
if 'target' in categorical_val:
    categorical_val.remove('target')
# Fill in the source DataFrame and the list of columns to expand into dummies.
dataset = pd.get_dummies(..., columns=...)
dataset.head()

In [None]:
'''
Scale the values
- Set the array of columns to scale.
'''
# Please fill your answer in '...'
from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
col_to_scale = ...  # names of the columns in `dataset` that should be standardized
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the scaling
# statistics. The assignment overwrites the selected columns of `dataset`.
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])
dataset.head()

#### Define function to print the accuracy score 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_train, y_train, x_test, y_test, train, show_confusion_matrix=False):
    """Print the accuracy and classification report of a fitted classifier.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    x_train, y_train :
        Training features and labels.
    x_test, y_test :
        Held-out features and labels.
    train : bool
        If truthy, evaluate on the training split; otherwise on the test split.
    show_confusion_matrix : bool, default False
        If truthy, also print the confusion matrix for the evaluated split.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # Select the split once so both cases share a single reporting path.
    features, labels = (x_train, y_train) if train else (x_test, y_test)

    pred = clf.predict(features)
    # output_dict=True yields a nested dict, rendered as a table via DataFrame.
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))

    print(f'Accuracy Score: {accuracy_score(labels, pred) * 100:.4f}%')
    print('______________________________________________________________________')
    print(f'Classification Report:\n{clf_report}')
    if show_confusion_matrix:
        print('______________________________________________________________________')
        print(f'Confusion Matrix: \n {confusion_matrix(labels, pred)}\n')

### Dataset splitting
Split the dataset into training (70%) and test set (30%)

In [None]:
'''
use train_test_split(data, target, test_size, random)
'''
# Please fill your answer in '...'
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the 'target' column.
x = dataset.drop('target', axis=1)
y = dataset.target
# Fill in the features, the labels, the test fraction (30% of the data) and a fixed
# seed. Note the seed keyword is `random_state`.
x_train, x_test, y_train, y_test = train_test_split(..., ..., test_size=..., random_state=...)

## Random forest
Intuitively, the random forest (RF) algorithm can be regarded as an extension of the decision tree algorithm. An RF consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest outputs a class prediction, and the class with the most votes becomes the model’s prediction (see figure below).

| ![](https://miro.medium.com/max/1000/1*VHDtVaDPNepRglIAv72BFg.jpeg) |
|:--:|
| <b>Visualization of a Random Forest Model Making a Prediction</b> |

The fundamental concept behind RF is "the wisdom of crowds", which means that **a large number of relatively uncorrelated models (trees) operating as a committee will outperform any of the individual constituent models**. The reason for this wonderful effect is that the trees protect each other from their individual errors (as long as they don’t constantly all err in the same direction). For this to work, each decision tree in the forest is built separately and does not depend on the others.

#### Prerequisites for random forest
- There needs to be some actual signal in our features so that models built using those features do better than random guessing.
- The predictions (and therefore the errors) made by the individual trees need to have low correlations with each other.
#### Ensuring the model is diverse
To keep the predictions of the individual trees as uncorrelated as possible, two methods are commonly used:
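- Bagging (Bootstrap Aggregation) — Decision trees are very sensitive to the data they are trained on: small changes to the training set can result in significantly different tree structures. Random forest takes advantage of this by letting each tree train on a random sample of the training set drawn with replacement, so some observations are repeated and others are left out. For example, if our training data was [1, 2, 3, 4, 5, 6] then we might give one of our trees the following list [1, 2, 2, 3, 6, 6].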
- Feature randomness - Each tree in a random forest can pick only from a random subset of features. This forces even more variation amongst the trees in the model.

| ![](https://miro.medium.com/max/1000/1*EemYMyOADnT0lJWSXmTDdg.jpeg) |
|:--:|
| <b>Changing in data and features between trees in random forest</b> |

## Build a random forest model 

In [None]:
'''
Building a decision model with Scitkit Learn
- set random_state to 42
- set number of estimator (n_estimators) to 1000
'''
# Please fill your answer in '...'
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): RandomizedSearchCV is not referenced by any cell visible here.
from sklearn.model_selection import RandomizedSearchCV

# Build the baseline random forest with the settings listed in the string above.
# The following cells call rf_clf.predict(...), so the classifier must also be
# fitted on (x_train, y_train) in this cell.
rf_clf = ...

In [None]:
# Report the baseline random forest's scores on the training split
print_score(rf_clf, x_train, y_train, x_test, y_test, train=True)

In [None]:
# Report the baseline random forest's scores on the test split
print_score(rf_clf, x_train, y_train, x_test, y_test, train=False)

## Improve model with hyperparameters

In [None]:
'''
Implement some different hyperparameters to test their efficacy on new random forest model: 
n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf, bootstrap
'''
# Please fill your answer in '...'
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For classifiers it
# was an alias of 'sqrt', so the former ['auto', 'sqrt'] grid fitted the same model
# twice; 'log2' makes the second option a genuinely different setting.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)  # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Map each hyperparameter name to its candidate list. With all six lists included
# the grid has 10 * 2 * 12 * 3 * 3 * 2 = 4320 combinations, i.e. 12960 fits at cv=3,
# so expect a long run time.
params_grid = {...}

# Pass a fresh RandomForestClassifier(random_state=42) and params_grid below. Do not
# rebind `rf_clf` here: the summary cell at the end still needs the fitted baseline.
rf_cv = GridSearchCV(..., ..., scoring="accuracy", cv=3, verbose=2, n_jobs=-1).fit(x_train, y_train)
best_params = rf_cv.best_params_
print(f'Best_params: {best_params}')

In [None]:
# Refit a forest with the best hyperparameters found by the grid search. A fixed
# random_state keeps the reported scores reproducible between runs; a random_state
# inside best_params, if any, takes precedence over the default given here.
rf_clf_tuning = RandomForestClassifier(**{'random_state': 42, **best_params}).fit(x_train, y_train)

In [None]:
# Report the tuned random forest's scores on the training split
print_score(rf_clf_tuning, x_train, y_train, x_test, y_test, train=True)

In [None]:
# Report the tuned random forest's scores on the test split
print_score(rf_clf_tuning, x_train, y_train, x_test, y_test, train=False)

In [None]:
'''
Summarize the accuracy score of random forest algorithm with and without hyperparameters.
'''
# Please fill your answer in '...'
# Accuracy (in percent) of the baseline model on each split.
rf_clf_train = accuracy_score(y_train, rf_clf.predict(x_train)) * 100
rf_clf_test = accuracy_score(y_test, rf_clf.predict(x_test)) * 100
# Same two figures for the tuned model.
rf_clf_tuning_train = ...
rf_clf_tuning_test = ...

# Single definition of the column layout, so header and data row cannot drift apart.
result_columns = ['Model', 'Non-tuning train accuracy %', 'Tuning train accuracy %',
                  'Non-tuning test accuracy %', 'Tuning test accuracy %']
# Fill the four '...' in the same order as result_columns[1:].
dt_result = pd.DataFrame(data=[['Random Forest', ..., ..., ..., ...]],
                         columns=result_columns)
# DataFrame.append was removed in pandas 2.0. The table holds exactly one row here,
# so it is built directly; to add rows for further models use
# pd.concat([result, other_rows], ignore_index=True).
result = dt_result.copy()
result