Load libraries

In [1]:
from data_loader import load_cifar10
import matplotlib.pyplot as plt
import numpy as np
import time


from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

In [2]:
data = load_cifar10("cifar-10-batches-py")
X_train, y_train = data["x_train"], data["y_train"]
X_test, y_test   = data["x_test"], data["y_test"]
label_names = data["label_names"]

Flatten + scale images <br>
CIFAR-10: (N, 32, 32, 3) -> (N, 3072)

In [3]:
X_train_flat = X_train.reshape(len(X_train), -1) / 255.0
X_test_flat  = X_test.reshape(len(X_test), -1) / 255.0

In [4]:
X_train.shape

(50000, 32, 32, 3)

In [4]:
X_train_flat.shape

(50000, 3072)

In [5]:

# Υποθετουμε οτι:
# X_train_flat, X_test_flat ειναι ηδη φλαταρισμενα και scaled
# y_train, y_test ειναι np.arrays με labels 0..9
# label_names = ["airplane", "automobile", ..., "truck"]
def select_stratified_subset(
    X_train_flat, y_train,
    X_test_flat, y_test,
    n_train_per_class, n_test_per_class,
    random_state=0):
    
    num_classes = len(label_names)

    rng = np.random.default_rng(random_state)  # για αναπαραγωγιμοτητα

    # ----------------------------
    # Stratified sampling για TRAIN
    # ----------------------------
    train_indices = []

    for c in range(num_classes):
        class_idxs = np.where(y_train == c)[0]
        # ασφαλεια: να μην ζητησουμε περισσοτερα απο οσα υπαρχουν
        if len(class_idxs) < n_train_per_class:
            raise ValueError(
                f"Not enough train samples for class {c}: "
                f"have {len(class_idxs)}, requested {n_train_per_class}"
            )
        chosen = rng.choice(class_idxs, size=n_train_per_class, replace=False)
        train_indices.append(chosen)

    train_indices = np.concatenate(train_indices)
    rng.shuffle(train_indices)  # ανακατεμα για να μην ειναι ολα της ιδιας κλασης μαζι

    X_train_sub = X_train_flat[train_indices]
    y_train_sub = y_train[train_indices]

    # ----------------------------
    # Stratified sampling για TEST
    # ----------------------------
    test_indices = []

    for c in range(num_classes):
        class_idxs = np.where(y_test == c)[0]
        if len(class_idxs) < n_test_per_class:
            raise ValueError(
                f"Not enough test samples for class {c}: "
                f"have {len(class_idxs)}, requested {n_test_per_class}"
            )
        chosen = rng.choice(class_idxs, size=n_test_per_class, replace=False)
        test_indices.append(chosen)

    test_indices = np.concatenate(test_indices)
    rng.shuffle(test_indices)

    X_test_sub = X_test_flat[test_indices]
    y_test_sub = y_test[test_indices]

    print("Train shape:", X_train_sub.shape, "Test shape:", X_test_sub.shape)
    return X_train_sub, y_train_sub, X_test_sub, y_test_sub


In [6]:
X_train_sub, y_train_sub, X_test_sub, y_test_sub = select_stratified_subset(
    X_train_flat, y_train,
    X_test_flat,  y_test,
    n_train_per_class=1000,   # 10k total train
    n_test_per_class=200,     # 2k total test
    random_state=0
)

results = []

Train shape: (10000, 3072) Test shape: (2000, 3072)


Grid search for K-Nearest Neighbors (Nearest Neighbor classifier)

In [9]:
knn_param_grid = {
    "n_neighbors": [1, 3, 5, 7, 9, 11],
    "metric": ["euclidean", "manhattan"],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto"],  # leave search algorithm on auto
}

knn = KNeighborsClassifier()

In [10]:
print("\nRunning GridSearchCV for KNN...")
start = time.perf_counter()

knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    scoring="accuracy",   # choose best by accuracy
    cv=3,
    n_jobs=-1,
    verbose=2
)

knn_grid.fit(X_train_sub, y_train_sub)
knn_time = time.perf_counter() - start

print("Best KNN params:", knn_grid.best_params_)
print("Best CV accuracy:", knn_grid.best_score_)

best_knn = knn_grid.best_estimator_
y_pred_knn = best_knn.predict(X_test_sub)

knn_test_acc = accuracy_score(y_test_sub, y_pred_knn)
knn_test_f1  = f1_score(y_test_sub, y_pred_knn, average="macro")

print(f"KNN test accuracy:   {knn_test_acc:.4f}")
print(f"KNN test macro F1:   {knn_test_f1:.4f}")

results.append({
    "name": f"KNN_best_{knn_grid.best_params_}",
    "train_time": knn_time,
    "test_accuracy": knn_test_acc,
    "test_macro_f1": knn_test_f1,
})


Running GridSearchCV for KNN...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END algorithm=auto, metric=euclidean, n_neighbors=7, weights=distance; total time=  25.6s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=3, weights=uniform; total time=  25.3s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=1, weights=uniform; total time=  25.4s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=1, weights=uniform; total time=  27.7s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=1, weights=distance; total time=  28.4s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=1, weights=uniform; total time=  28.2s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=11, weights=distance; total time=  28.7s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=5, weights=uniform; total time=  28.9s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=11, weights=uniform; total time=  28.8s
[CV] END algorithm=auto, metric=euclidean, n_neighbors=

Grid search for Nearest Class Centroid (NearestCentroid)

In [14]:
ncc_param_grid = {
    "metric": ["euclidean", "manhattan"],
    "shrink_threshold": [None, 0.05, 0.1, 0.5, 1.0],
}

ncc = NearestCentroid()

In [15]:
print("\nRunning GridSearchCV for Nearest Class Centroid...")
start = time.perf_counter()

ncc_grid = GridSearchCV(
    estimator=ncc,
    param_grid=ncc_param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2
)

ncc_grid.fit(X_train_sub, y_train_sub)
ncc_time = time.perf_counter() - start

print("Best NCC params:", ncc_grid.best_params_)
print("Best CV accuracy:", ncc_grid.best_score_)

best_ncc = ncc_grid.best_estimator_
y_pred_ncc = best_ncc.predict(X_test_sub)

ncc_test_acc = accuracy_score(y_test_sub, y_pred_ncc)
ncc_test_f1  = f1_score(y_test_sub, y_pred_ncc, average="macro")

print(f"NCC test accuracy:   {ncc_test_acc:.4f}")
print(f"NCC test macro F1:   {ncc_test_f1:.4f}")

results.append({
    "name": f"NCC_best_{ncc_grid.best_params_}",
    "train_time": ncc_time,
    "test_accuracy": ncc_test_acc,
    "test_macro_f1": ncc_test_f1,
})


Running GridSearchCV for Nearest Class Centroid...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .............metric=manhattan, shrink_threshold=1.0; total time=   2.0s
[CV] END .............metric=manhattan, shrink_threshold=1.0; total time=   2.1s
[CV] END .............metric=manhattan, shrink_threshold=0.1; total time=   2.4s
[CV] END .............metric=manhattan, shrink_threshold=0.1; total time=   2.5s
[CV] END .............metric=manhattan, shrink_threshold=0.5; total time=   2.5s
[CV] END .............metric=manhattan, shrink_threshold=0.5; total time=   2.8s
[CV] END ............metric=manhattan, shrink_threshold=0.05; total time=   3.3s
[CV] END .............metric=manhattan, shrink_threshold=1.0; total time=   3.5s
[CV] END ............metric=manhattan, shrink_threshold=0.05; total time=   3.8s
[CV] END ............metric=manhattan, shrink_threshold=0.05; total time=   3.9s
[CV] END .............metric=manhattan, shrink_threshold=0.5; total time=   4

In [16]:
print("\n=== Summary of best NN / NCC models on CIFAR-10 subset ===")
for r in results:
    print(
        f"{r['name']}\n"
        f"  train_time   = {r['train_time']:.2f} sec\n"
        f"  test_acc     = {r['test_accuracy']:.4f}\n"
        f"  test_macroF1 = {r['test_macro_f1']:.4f}\n"
    )


=== Summary of best NN / NCC models on CIFAR-10 subset ===
KNN_best_{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
  train_time   = 268.50 sec
  test_acc     = 0.3225
  test_macroF1 = 0.3100

NCC_best_{'metric': 'manhattan', 'shrink_threshold': None}
  train_time   = 7.96 sec
  test_acc     = 0.2615
  test_macroF1 = 0.2416

NCC_best_{'metric': 'manhattan', 'shrink_threshold': None}
  train_time   = 7.06 sec
  test_acc     = 0.2615
  test_macroF1 = 0.2416



Retrain BEST KNN on the FULL CIFAR-10 train set


In [17]:
print("\nRetraining BEST KNN on FULL CIFAR-10 train set...")

best_knn_full = KNeighborsClassifier(**knn_grid.best_params_)

start = time.perf_counter()
best_knn_full.fit(X_train_flat, y_train)
knn_full_train_time = time.perf_counter() - start

y_pred_knn_full = best_knn_full.predict(X_test_flat)

knn_full_acc = accuracy_score(y_test, y_pred_knn_full)
knn_full_f1  = f1_score(y_test, y_pred_knn_full, average="macro")

print("=== KNN on FULL dataset ===")
print("Best params:", knn_grid.best_params_)
print(f"Train time (full): {knn_full_train_time:.2f} sec")
print(f"Test accuracy:     {knn_full_acc:.4f}")
print(f"Test macro F1:     {knn_full_f1:.4f}")

results.append({
    "name": f"KNN_FULL_{knn_grid.best_params_}",
    "train_time": knn_full_train_time,
    "test_accuracy": knn_full_acc,
    "test_macro_f1": knn_full_f1,
})


Retraining BEST KNN on FULL CIFAR-10 train set...
=== KNN on FULL dataset ===
Best params: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Train time (full): 0.17 sec
Test accuracy:     0.3952
Test macro F1:     0.3895


Retrain BEST Nearest Class Centroid on FULL CIFAR-10


In [18]:
print("\nRetraining BEST NCC on FULL CIFAR-10 train set...")

best_ncc_full = NearestCentroid(**ncc_grid.best_params_)

start = time.perf_counter()
best_ncc_full.fit(X_train_flat, y_train)
ncc_full_train_time = time.perf_counter() - start

y_pred_ncc_full = best_ncc_full.predict(X_test_flat)

ncc_full_acc = accuracy_score(y_test, y_pred_ncc_full)
ncc_full_f1  = f1_score(y_test, y_pred_ncc_full, average="macro")

print("=== NCC on FULL dataset ===")
print("Best params:", ncc_grid.best_params_)
print(f"Train time (full): {ncc_full_train_time:.2f} sec")
print(f"Test accuracy:     {ncc_full_acc:.4f}")
print(f"Test macro F1:     {ncc_full_f1:.4f}")

results.append({
    "name": f"NCC_FULL_{ncc_grid.best_params_}",
    "train_time": ncc_full_train_time,
    "test_accuracy": ncc_full_acc,
    "test_macro_f1": ncc_full_f1,
})


Retraining BEST NCC on FULL CIFAR-10 train set...
=== NCC on FULL dataset ===
Best params: {'metric': 'manhattan', 'shrink_threshold': None}
Train time (full): 8.23 sec
Test accuracy:     0.2734
Test macro F1:     0.2528


In [19]:
print("\n=== FINAL SUMMARY (including FULL KNN / NCC) ===")
for r in results:
    print(
        f"{r['name']}\n"
        f"  train_time   = {r['train_time']:.2f} sec\n"
        f"  test_acc     = {r['test_accuracy']:.4f}\n"
        f"  test_macroF1 = {r['test_macro_f1']:.4f}\n"
    )


=== FINAL SUMMARY (including FULL KNN / NCC) ===
KNN_best_{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
  train_time   = 268.50 sec
  test_acc     = 0.3225
  test_macroF1 = 0.3100

NCC_best_{'metric': 'manhattan', 'shrink_threshold': None}
  train_time   = 7.96 sec
  test_acc     = 0.2615
  test_macroF1 = 0.2416

NCC_best_{'metric': 'manhattan', 'shrink_threshold': None}
  train_time   = 7.06 sec
  test_acc     = 0.2615
  test_macroF1 = 0.2416

KNN_FULL_{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
  train_time   = 0.17 sec
  test_acc     = 0.3952
  test_macroF1 = 0.3895

NCC_FULL_{'metric': 'manhattan', 'shrink_threshold': None}
  train_time   = 8.23 sec
  test_acc     = 0.2734
  test_macroF1 = 0.2528

