In [None]:
# Import necessary libraries and tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('heart.csv')

In [None]:
# Have a first look at the data
# Show the first five rows to get a feel for the columns and value ranges.
df.head(n=5)

In [None]:
# Show the last five rows as a quick check of the end of the file.
df.tail(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Classification of data types
Before any training, we should classify the columns into two kinds: 'categorical_val' for columns with fewer than 10 unique values (e.g. sex) and 'continuous_val' for all the other columns (e.g. age).

In [None]:
# Please fill your answer in '...'
# Columns with fewer than 10 distinct values are collected in categorical_val;
# every other column is collected in continuous_val.
categorical_val = []
continuous_val = []
for col in ...:  # placeholder: iterate over the column names of df
    if ...:  # placeholder: test whether df[col] has fewer than 10 unique values
        ...
    else:
        ...

### Create dummies and scale data
After exploring the dataset, we need to convert some categorical variables into dummy variables and scale all the values before training the models.

In [None]:
'''
Create dummies
'''
# Please fill your answer in '...'
# 'target' is the label column, not a feature, so it must not be one-hot encoded.
# The membership check keeps this cell safe to re-run: an unconditional
# remove() raises ValueError once 'target' has already been taken out.
if 'target' in categorical_val:
    categorical_val.remove('target')
dataset = pd.get_dummies(..., columns=...)
dataset.head()

In [None]:
'''
Scale the values
- Set the array of columns to scale.
'''
# Please fill your answer in '...'
from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
# Placeholder: the list of column names whose values should be standardised.
col_to_scale = ...
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook, so the test rows
# contribute to the scaling statistics — confirm this is acceptable here.
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])
dataset.head()

#### Define function to print the accuracy score 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_train, y_train, x_test, y_test, train, show_confusion_matrix=False):
    """Print the accuracy and classification report of a fitted classifier.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.
    train
        Truthy: evaluate on the training split. Falsy: evaluate on the test
        split. Values that are neither True nor False used to produce no
        output at all; they are now interpreted by truthiness instead of
        being silently ignored.
    show_confusion_matrix
        When true, additionally print the confusion matrix of the evaluated
        split. Defaults to False, which keeps the output of existing calls
        unchanged.

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Pick the split once so that the reporting code is not duplicated per branch.
    features, labels = (x_train, y_train) if train else (x_test, y_test)

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f'Accuracy Score: {accuracy_score(labels, pred) * 100:.4f}%')
    print('______________________________________________________________________')
    print(f'Classification Report:\n{clf_report}')
    if show_confusion_matrix:
        print('______________________________________________________________________')
        print(f'Confusion Matrix: \n {confusion_matrix(labels, pred)}\n')

### Dataset splitting
Split the dataset into a training set (70%) and a test set (30%).

In [None]:
'''
use train_test_split(data, target, test_size, random)
'''
# Please fill your answer in '...'
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the 'target' column.
x = dataset.drop('target', axis=1)
y = dataset.target
# Placeholders: pass x and y, a test_size matching the 30% test share requested
# for this exercise, and a fixed integer random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(..., ..., test_size=..., random_state=...)

# K-nearest neighbors (KNN)
KNN is one of the simplest supervised learning algorithms (and is sometimes effective) for both classification and regression problems. The KNN algorithm assumes that similar things exist in close proximity.
### The KNN algorithm step-by-step
1. Load the data.
2. Initialize K to your chosen number of neighbors.
3. For each example (data point) in the data:
    - Calculate the distance between the query example and the current example from the data.
    - Add the distance and the index of the example to an ordered collection.
4. Sort the ordered collection of distances and indices in ascending order by the distances.
5. Pick the first K entries from the sorted collection.
6. Get the labels of the selected K entries.
7. If regression, return the mean of the K labels; otherwise, return the mode of the K labels.

| ![](http://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1531424125/KNN_final1_ibdm8a.png) |
|:--:|
| <b>K-nearest neighbors</b> |


## Training KNN model

In [None]:
'''
Training KNN model with Scikit Learn
- By default, the number of neighbors n_neighbors is 5.
'''
# Please fill your answer in '...'
from sklearn.neighbors import KNeighborsClassifier

# Fit a KNN classifier with its default settings.
# Placeholders: the training features and the training labels.
knn_clf = KNeighborsClassifier().fit(..., ...)

In [None]:
# Default KNN model: report on the training split.
print_score(
    clf=knn_clf,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    train=True,
)

In [None]:
# Default KNN model: report on the test split.
print_score(
    clf=knn_clf,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    train=False,
)

## Improve the KNN model
In this part, we will test the KNN model with different numbers of neighbors to find the best one.

In [None]:
'''
Put the KNN model into a 'for' loop of number of neighbors.
'''
# Please fill your answer in '...'
# Accuracy per candidate number of neighbors, appended in the order of `neighbors`.
train_score = []
test_score = []
neighbors = range(1, 30)  # candidate values: 1 through 29

for ...:  # placeholder: loop over the candidate values in `neighbors`
    model = KNeighborsClassifier(...).fit(..., ...)
    train_score.append(accuracy_score(y_train, model.predict(x_train)))
    # NOTE(review): while the next line stays commented out, test_score remains empty.
#     test_score.append(accuracy_score(y_test, model.predict(x_test)))

In [None]:
# Plot the accuracy curves against the number of neighbors.
plt.figure(figsize=(8,6))

plt.plot(neighbors, train_score, label="Train score")
# test_score is only filled when the matching append in the loop cell is enabled,
# so the test curve is drawn only when there is something to draw.
if test_score:
    plt.plot(neighbors, test_score, label="Test score")
# Tick every candidate value so the axis labels cover the whole plotted range.
plt.xticks(list(neighbors))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

# This maximum is computed from train_score, so it is labelled as the train score.
print(f"Maximum KNN score on the train data: {max(train_score)*100:.2f}%")
if test_score:
    print(f"Maximum KNN score on the test data: {max(test_score)*100:.2f}%")

In [None]:
'''
Use the best number of neighbors to train KNN model.
'''
# Placeholders: the best number of neighbors found in the sweep, then the
# training features and the training labels.
knn_clf_tuning = KNeighborsClassifier(n_neighbors=...).fit(..., ...)

In [None]:
# Tuned KNN model: report on the training split.
print_score(
    clf=knn_clf_tuning,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    train=True,
)

In [None]:
# Tuned KNN model: report on the test split.
print_score(
    clf=knn_clf_tuning,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    train=False,
)

In [None]:
'''
Summarize the accuracy score of KNN models with and without modifying number of neighbors.
'''
# Please fill your answer in '...'
# Accuracy, in percent, of the default KNN model on the test and training splits.
knn_clf_test = accuracy_score(y_test, knn_clf.predict(x_test)) * 100
knn_clf_train = accuracy_score(y_train, knn_clf.predict(x_train)) * 100
# Placeholders: the same two metrics computed with knn_clf_tuning.
knn_clf_tuning_test = ...
knn_clf_tuning_train = ...

# Column layout shared by the running summary table and by each per-model row.
result_columns = ['Model', 'Non-tuning train accuracy %', 'Tuning train accuracy %',
                  'Non-tuning test accuracy %', 'Tuning test accuracy %']

result = pd.DataFrame(columns=result_columns)
knn_result = pd.DataFrame(data=[['K-nearest neighbors', ..., ..., ..., ...]],
                          columns=result_columns)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. Empty frames are left out so they do not take part in the
# dtype resolution of the combined table.
result = pd.concat([frame for frame in (result, knn_result) if not frame.empty],
                   ignore_index=True)
result