In [None]:
# Import necessary libraries and tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('heart.csv')

In [None]:
# Have a first look at the data
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

### Classification of data types
Before any training, we should classify the columns into two different kinds: 'categorical_val' for columns that have fewer than 10 unique values (e.g. sex, target...) and 'continuous_val' for all the other columns.

In [None]:
# Please fill your answer in '...'
# Rule from the section text above: a column with fewer than 10 unique values
# is categorical; every other column is continuous.
categorical_val = []  # collects the names of the categorical columns
continuous_val = []  # collects the names of the continuous columns
for col in ...:
    if ...:
        ...
    else:
        ...

### Create dummies and scale data
After exploring the dataset, we need to convert some categorical variables into dummy variables and scale all the values before training the models.

In [None]:
'''
Create dummies
'''
# Please fill your answer in '...'
# 'target' is the label, so it must not be one-hot encoded. Filtering it out
# (instead of calling list.remove) keeps this cell idempotent: re-running it
# no longer raises ValueError once 'target' has already been dropped.
categorical_val = [col for col in categorical_val if col != 'target']
dataset = pd.get_dummies(..., columns=...)
dataset.head()

In [None]:
'''
Scale the values
- Set the array of columns to scale.
'''
# Please fill your answer in '...'
from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
col_to_scale = ...
# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split done later in the notebook, so statistics of the test rows
# influence the scaling. Consider fitting on the training split only.
# The selected columns of `dataset` are overwritten in place with the scaled values.
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])
dataset.head()

#### Define function to print the accuracy score 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_train, y_train, x_test, y_test, train):
    """Print the accuracy and the classification report of a fitted classifier.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing a ``predict`` method.
    x_train, y_train :
        Features and labels of the training split.
    x_test, y_test :
        Features and labels of the test split.
    train : bool
        If truthy, the report is computed on the training split; otherwise it
        is computed on the test split. (Previously any value other than
        ``True``/``False`` silently printed nothing.)

    Returns
    -------
    None
        The accuracy (in %) and the classification report are written to
        stdout; the confusion matrix is not printed.
    """
    # Select the split once so that both cases share a single reporting path.
    features, labels = (x_train, y_train) if train else (x_test, y_test)

    pred = clf.predict(features)
    # output_dict=True returns a nested dict, which renders as a table via DataFrame.
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f'Accuracy Score: {accuracy_score(labels, pred) * 100:.4f}%')
    print('______________________________________________________________________')
    print(f'Classification Report:\n{clf_report}')

### Dataset splitting
Split the dataset into training (70%) and test set (30%)

In [None]:
'''
use train_test_split(data, target, test_size, random)
'''
# Please fill your answer in '...'
from sklearn.model_selection import train_test_split

x = dataset.drop('target', axis=1)  # features: every column except the label
y = dataset.target  # label column
# The section text asks for a 70% training / 30% test split; passing a fixed
# random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(..., ..., test_size=..., random_state=...)

# Logistic Regression
Logistic regression (LR) is a classification algorithm used to assign observations to a discrete set of classes. It is a predictive analysis algorithm based on the concept of probability. Its hypothesis is constrained to output a value between 0 and 1: 

$0\leq{h_0}\leq1$.

### 1. Sigmoid function
Sigmoid function maps any real value into another value between 0 and 1. In machine learning, we use sigmoid to map predictions to probabilities. Formula of sigmoid function: 

$f(x)=\frac{1}{1+e^{-x}}$

### 2. Decision boundary
We expect our classifier to give us a set of outputs or classes based on probability: when we pass the inputs through the prediction function (in this case the sigmoid function), it returns a probability score between 0 and 1. We then choose a threshold value by which the probabilities are assigned to their corresponding classes. For example:

| ![](https://miro.medium.com/max/619/1*2Vsum532aNQX9TgR7_rAzQ.png) |
|:--:|
| <b>Decision boundary</b> |

Let's think about a binary classification problem. In this figure, the threshold value is set to 0.5. Any probability higher than 0.5 is classified as class 1; otherwise it is classified as class 0. In real cases the threshold value varies from 0.5 to 0.7, depending on the type of data and the problem requirements.

### 3. Cost function
The cost function represents optimization objective i.e. we create a cost function and minimize it so that we can develop an accurate model with minimum error. For logistic regression, the Cost function is defined as:

$J(\theta)=\begin{cases}-log(h_0(x))&(y=1)\\-log(1-h_0(x))&(y=0)\end{cases}$

| ![](https://miro.medium.com/max/609/1*yWzKLQhWITQ4bR2aMSVVuw.png) |
|:--:|
| <b>Intuition of logistic regression cost function.</b> |

Or we can combine both equations in term of:

$J(\theta)=\frac{-1}{m}\sum_{i=1}^{m}\left(y^{(i)}\log(h_0(x^{(i)}))+(1-y^{(i)})\log(1-h_0(x^{(i)}))\right)$

The ultimate goal of the algorithm is to drive the cost function to its minimum.

### 4. Gradient descent
The problem of minimizing the cost value ($\min_\theta J(\theta)$) can be solved by using **gradient descent**. Put simply, we "update" the value of $\theta$ after each training step and use this updated $\theta$ for the next step. This process ends once the cost value $J(\theta)$ **converges**. Mathematically:

$\theta_j:=\theta_j-\alpha\frac{\partial}{\partial\theta_j}J(\theta)$

Let's imagine that you are at the top of a mountain and need to find your way down to the valley while blindfolded. Feeling the slope of the terrain around you is all you can do. This is analogous to how gradient descent works, with the valley corresponding to a local minimum. 

| ![](https://media-exp1.licdn.com/dms/image/C5612AQGjsDROHG6DhQ/article-inline_image-shrink_1000_1488/0/1544817106951?e=1633564800&v=beta&t=rdLLJTl6sl9JoRlLgZs7DEu8TEn_6BzaCybiH5sEsbA) |
|:--:|
| <b>Gradient descent analogy. You start at the top of the hill. After each step of calculating, you approach closer and closer to the local minimum as "valley".</b> |

## Training logistic regression model

In [None]:
'''
Training logistic regression model with Scikit Learn
- solver: 'liblinear' (algorithm to use in the optimization)
'''
# Please fill your answer in '...'
from sklearn.linear_model import LogisticRegression

# Baseline (untuned) classifier; the following cells score it on the training
# and test splits through print_score.
lr_clf = LogisticRegression(solver=...).fit(..., ...)

In [None]:
print_score(lr_clf, x_train, y_train, x_test, y_test, train=True)

In [None]:
print_score(lr_clf, x_train, y_train, x_test, y_test, train=False)

## Improve logistic regression model with hyperparameter tuning
In order to improve the accuracy of the model and prevent overfitting, we add an additional term called **regularization** to our cost function. Its strength acts as a *hyperparameter* of the cost function.
#### (?) What is overfitting (and underfitting)?
Overfitting occurs when your model fits the training data almost perfectly but generalizes poorly to the rest of the data. Conversely, underfitting occurs when your model is too weak to explain the variance of the dataset, which can be caused by insufficient data. The intuition behind overfitting and underfitting can be explained by graphs:

| ![](https://miro.medium.com/max/1400/1*_7OPgojau8hkiPUiHoGK_w.png) |
|:--:|
| <b>Model performance</b> |

Back to the regularization. We implement the regularization hyperparameter into the cost function as the formula:

$J(\theta)=\frac{-1}{m}\sum_{i=1}^{m}\left(y^{(i)}\log(h_0(x^{(i)}))+(1-y^{(i)})\log(1-h_0(x^{(i)}))\right)+\frac{\lambda}{2}\lVert\theta\rVert^2$

##### Next, we will implement the regularization into our logistic regression using Scikit Learn.

In [None]:
'''
Hyperparameter used: 
- C: inverse of regularization strength (set C value to be the array of numbers spaced evenly on a log scale)
'''
# Please fill your answer in '...'
from sklearn.model_selection import GridSearchCV

# Search grid: candidate values for C, with the solver fixed to 'liblinear'.
params = {'C': ...(-4, 4, 20), 'solver': ['liblinear']}
# Grid search with 5-fold cross-validation (cv=5), scored by accuracy and
# fitted on the training split only.
lr_cv = GridSearchCV(LogisticRegression(), params, scoring='accuracy', n_jobs=1, verbose=1, cv=5)
lr_cv.fit(x_train, y_train)
lr_cv.best_params_  # shown as the cell output; reused by the next cell

In [None]:
lr_clf_tuning = LogisticRegression(**lr_cv.best_params_).fit(x_train, y_train)

In [None]:
print_score(lr_clf_tuning, x_train, y_train, x_test, y_test, train=True)

In [None]:
print_score(lr_clf_tuning, x_train, y_train, x_test, y_test, train=False)

In [None]:
'''
Summarize the accuracy score of logistic regression models with and without hyperparameter tuning
'''
# Please fill your answer in '...'
# Accuracy (in %) of the baseline and of the tuned model on each split.
lr_clf_train = accuracy_score(y_train, lr_clf.predict(x_train)) * 100
lr_clf_test = accuracy_score(y_test, lr_clf.predict(x_test)) * 100
lr_clf_tuning_train = ...
lr_clf_tuning_test = ...

# Column layout of the summary table, defined once so that the empty table and
# the per-model row cannot drift apart.
result_columns = ['Model', 'Non-tuning train accuracy %', 'Tuning train accuracy %', 'Non-tuning test accuracy %', 'Tuning test accuracy %']
result = pd.DataFrame(columns=result_columns)
lr_result = pd.DataFrame(data=[['Logistic regression', ..., ..., ..., ...]],
                         columns=result_columns)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add rows and yields the same table.
result = pd.concat([result, lr_result], ignore_index=True)
result