In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import os

In [None]:
# Load the cleaned cardio dataset (path is relative to the notebook's
# working directory) and preview the first rows.
heart = pd.read_csv(filepath_or_buffer="cleaned_cardio.csv")
heart.head(n=5)

In [None]:
# Display names for the two classes.
# NOTE(review): assumes label 0 -> "negative" and 1 -> "positive" — confirm against the data.
target_names = ["negative", "positive"]

# Target vector: the "cardio" column of the loaded frame.
y = heart.loc[:, "cardio"]

In [None]:
# Feature matrix: every column of `heart` except the "cardio" target.
X = heart.drop(columns="cardio")
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set using sklearn's default split proportions; the fixed
# seed keeps the partition identical across re-runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [None]:
# Sweep k and record train/test accuracy for each fitted model, to see which
# k gives the highest test accuracy.
# Only odd k values are used so that a majority vote cannot end in a tie.
# NOTE(review): KNN is distance-based and no feature scaling is visible in
# this notebook — confirm cleaned_cardio.csv is already normalised.
k_values = range(1, 20, 2)  # single source of truth for the loop and both plot calls
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so label them and show a legend; the y-axis
# covers training *and* testing accuracy, not testing alone.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.title("KNN accuracy vs. k")
plt.legend()
plt.show()

In [None]:
# Refit with k = 13, the value judged best from the sweep above, and report
# its accuracy on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
k13_test_acc = knn.score(X_test, y_test)
print('k=13 Test Acc: %.3f' % k13_test_acc)

In [None]:
# Cross-validated grid search over KNN hyperparameters on the training split,
# followed by an accuracy check of the best model on the held-out test split.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Search space. `n_jobs` is a runtime setting rather than a hyperparameter, so
# it is set on the estimator below instead of being listed here (keeps
# `best_params_` limited to values that actually affect the model).
# NOTE(review): the n_neighbors range stops at 10 although the earlier sweep
# singled out k = 13 — confirm the range is intentional.
params_knn = {'n_neighbors': [5, 6, 7, 8, 9, 10],
              'leaf_size': [1, 2, 3, 5],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

# Use a fresh, unfitted estimator so this cell does not depend on whatever
# `knn` happens to hold from earlier cells.
grid_knn = GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                        param_grid=params_knn,
                        n_jobs=-1)
grid_knn.fit(X_train, y_train)
print("Best Hyper Parameters:\n", grid_knn.best_params_)

# Predictions come from the best estimator found by the search.
prediction_knn = grid_knn.predict(X_test)

# Print Accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_knn = accuracy_score(y_test, prediction_knn)
print("Accuracy: ", accuracy_knn)