In [None]:
# model implementation
from ml.models.clustering import KNN
from ml.functions.distance import EuclideanDistance, MahalanobisDistance

# model selection
from ml.model_selection import GridSearchCV
from ml.functions.metrics.classification import F1Score, BinaryAccuracy
from ml.stats import ClassificationStats

# data manipulation
from ml.dataset import load_csv, split_train_test
from ml.algorithms.normalization import MinMaxScaler

# Utility
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so repeated runs of the notebook draw the same random numbers.
# NOTE(review): presumably this is what makes the shuffled train/test split below repeatable —
# confirm that split_train_test draws from NumPy's global RNG.
np.random.seed(25)

## Load dataset

In [None]:
# Read the features and labels of the KC2 dataset from disk.
X, y = load_csv('./datasets/kc2.csv')

# Cast the labels to integers, then squeeze them
# (presumably turning a single-column label array into a 1-D vector — confirm).
y = y.astype(int)
y = y.squeeze()

# Shuffled split; 0.8 is presumably the training fraction — confirm against split_train_test.
X_train, y_train, X_test, y_test = split_train_test(
    X,
    y,
    0.8,
    shuffle=True,
)

## Q1 a,b - K-Nearest Neighbors (KNN)

### Grid search with 10-fold cross-validation for KNN (k = 1, 5) with Euclidean and Mahalanobis distances

In [None]:
# Candidate grid: KNN with k in {1, 5} crossed with both distance functions,
# every candidate configured with MinMaxScaler as its data scaler.
grid = GridSearchCV(stats_generator=ClassificationStats)

# Registration order is (k=1, Euclidean), (k=1, Mahalanobis), (k=5, Euclidean), (k=5, Mahalanobis);
# the reporting cell further down hard-codes its labels in this same order.
for k in (1, 5):
    for distance in (EuclideanDistance, MahalanobisDistance):
        grid.add(KNN, {"k": k, "distance": distance, "data_scaler": MinMaxScaler})

# 10 folds without shuffling; candidates are scored with F1 and score_minimize=False
# (presumably meaning the highest score wins — confirm against GridSearchCV).
grid.search(
    X_train,
    y_train,
    num_folds=10,
    score=F1Score(),
    score_minimize=False,
    shuffle=False,
)
print("Best KNN:", grid.best_model_params)

### Performance of the best model on the test dataset

In [None]:
# Instantiate the configuration selected by the grid search and fit it on the training split.
knnX = grid.best_model(**grid.best_model_params)
knnX.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = knnX.predict(X_test)

# Report each test metric in turn: build the scorer, measure, print.
for label, scorer in (("Accuracy:", BinaryAccuracy), ("F1-score:", F1Score)):
    print(label, scorer().measure(y_test, y_pred))

### K-fold mean ± standard deviation of F1-score, accuracy, recall and precision

In [None]:
# Tick labels for the candidates.
# NOTE(review): assumes grid.stats follows the order in which candidates were added to the grid — confirm.
candidates = ["K=1, Euclid.", "K=1, Mahal.", "K=5 Euclid.", "K=5 Mahal."]
num_candidates = len(grid.candidates)
positions = np.arange(num_candidates)
best = grid.index_of_best

plt.figure(figsize=(30, 4))
plt.suptitle("KNN", fontsize=20)
plt.subplots_adjust(top=0.7)

# One panel per metric: mean with a standard-deviation error bar for every candidate.
for panel, metric in enumerate(['f1', 'recall', 'precision', 'binary_accuracy'], start=1):
    # Values recorded for this metric, one entry per candidate
    # (presumably one value per fold — confirm against ClassificationStats).
    recorded = [grid.stats[i].stats["values"][metric] for i in range(num_candidates)]
    means = [np.mean(values) for values in recorded]
    std = [np.std(values) for values in recorded]

    plt.subplot(1, 4, panel)
    plt.errorbar(positions, means, std, fmt='ok', lw=3)
    # Draw the selected candidate again with a red error bar so it stands out.
    plt.errorbar(best, means[best], std[best], ecolor='red', fmt='ok', lw=3)

    plt.title(metric.upper() + " mean and std deviation", fontsize=20)
    plt.xticks(positions, labels=candidates)
    plt.xlabel("K, distance", fontsize=16)
    plt.ylabel(metric, fontsize=16)

    # Same numbers as the plot, as text.
    print(metric.upper())
    for label, mean, deviation in zip(candidates, means, std):
        print(f"\t{label}:", mean, "+-", deviation)