In [5]:
import matplotlib.pyplot as plt
from data_loader import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

import time

In [2]:
bc = load_breast_cancer("breast-cancer", scaled=True)
X, y = bc["x"], bc["y"]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)
print(X_train.shape, X_test.shape)

(478, 9) (205, 9)


In [6]:
results_bc = []

Cross-validation strategy

In [7]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

Grid search for K-Nearest Neighbors

In [8]:
knn_param_grid = {
    "n_neighbors": [1, 3, 5, 7, 9, 11, 15],
    "metric": ["euclidean", "manhattan"],
    "weights": ["uniform", "distance"],
}

knn = KNeighborsClassifier()

In [9]:
print("\nRunning GridSearchCV for KNN (breast cancer)...")
start = time.perf_counter()

knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True,   # default, but explicit
)

knn_grid.fit(X_train, y_train)
knn_time = time.perf_counter() - start

print("\nBest KNN params:", knn_grid.best_params_)
print("Best KNN CV accuracy:", knn_grid.best_score_)

best_knn = knn_grid.best_estimator_

# Evaluate on held-out test set
y_pred_knn = best_knn.predict(X_test)

knn_test_acc = accuracy_score(y_test, y_pred_knn)
knn_test_f1  = f1_score(y_test, y_pred_knn, average="binary")

print("\n=== KNN on breast cancer test set ===")
print(f"Test accuracy: {knn_test_acc:.4f}")
print(f"Test F1 (binary): {knn_test_f1:.4f}")
print("\nKNN classification report:")
print(classification_report(y_test, y_pred_knn))

cm_knn = confusion_matrix(y_test, y_pred_knn)
print("KNN confusion matrix:\n", cm_knn)

results_bc.append({
    "name": f"BC_KNN_best_{knn_grid.best_params_}",
    "train_time": knn_time,
    "test_accuracy": knn_test_acc,
    "test_f1": knn_test_f1,
})


Running GridSearchCV for KNN (breast cancer)...
Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=9, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=9, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0

Grid search for Nearest Class Centroid

In [12]:
ncc_param_grid = {
    "metric": ["euclidean", "manhattan"],
    "shrink_threshold": [None, 0.05, 0.1, 0.5, 1.0],
}

ncc = NearestCentroid()

In [13]:
print("\nRunning GridSearchCV for Nearest Class Centroid (breast cancer)...")
start = time.perf_counter()

ncc_grid = GridSearchCV(
    estimator=ncc,
    param_grid=ncc_param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True,
)

ncc_grid.fit(X_train, y_train)
ncc_time = time.perf_counter() - start

print("\nBest NCC params:", ncc_grid.best_params_)
print("Best NCC CV accuracy:", ncc_grid.best_score_)

best_ncc = ncc_grid.best_estimator_

# Evaluate on held-out test set
y_pred_ncc = best_ncc.predict(X_test)

ncc_test_acc = accuracy_score(y_test, y_pred_ncc)
ncc_test_f1  = f1_score(y_test, y_pred_ncc, average="binary")

print("\n=== Nearest Class Centroid on breast cancer test set ===")
print(f"Test accuracy: {ncc_test_acc:.4f}")
print(f"Test F1 (binary): {ncc_test_f1:.4f}")
print("\nNCC classification report:")
print(classification_report(y_test, y_pred_ncc))

cm_ncc = confusion_matrix(y_test, y_pred_ncc)
print("NCC confusion matrix:\n", cm_ncc)

results_bc.append({
    "name": f"BC_NCC_best_{ncc_grid.best_params_}",
    "train_time": ncc_time,
    "test_accuracy": ncc_test_acc,
    "test_f1": ncc_test_f1,
})


Running GridSearchCV for Nearest Class Centroid (breast cancer)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .............metric=euclidean, shrink_threshold=0.1; total time=   0.0s
[CV] END ............metric=euclidean, shrink_threshold=0.05; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.1; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.1; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.5; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.1; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.5; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.5; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.5; total time=   0.0s
[CV] END ............metric=euclidean, shrink_threshold=0.05; total time=   0.0s
[CV] END .............metric=euclidean, shrink_threshold=0.5;

summary

In [14]:
print("\n=== Summary KNN / NCC on breast cancer ===")
for r in results_bc:
    print(
        f"{r['name']}\n"
        f"  train_time = {r['train_time']:.4f} sec\n"
        f"  test_acc   = {r['test_accuracy']:.4f}\n"
        f"  test_F1    = {r['test_f1']:.4f}\n"
    )


=== Summary KNN / NCC on breast cancer ===
BC_KNN_best_{'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
  train_time = 2.7610 sec
  test_acc   = 0.9610
  test_F1    = 0.9437

BC_NCC_best_{'metric': 'euclidean', 'shrink_threshold': None}
  train_time = 0.1149 sec
  test_acc   = 0.9659
  test_F1    = 0.9510

BC_NCC_best_{'metric': 'euclidean', 'shrink_threshold': None}
  train_time = 0.0824 sec
  test_acc   = 0.9659
  test_F1    = 0.9510

