# Assignment 4 — kNN with Scaling, Metric Choice & Learning Curves (Digits)
*Prepared:* 2025-10-11

**Goal:** Compare metrics and k values for kNN, visualize a heatmap, and draw a learning curve.

**Dataset:** `sklearn.datasets.load_digits()`

In [None]:
# Setup
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# Single seed shared by every step below that accepts a random_state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Default size (width, height) applied to figures that do not set their own.
plt.rcParams.update({'figure.figsize': (7, 4)})

In [None]:
# Load the digits dataset and display the first eight images with their labels.
digits = load_digits()
X = digits.data    # feature matrix used by every estimator below
y = digits.target  # class label for each row of X

fig, axes = plt.subplots(2, 4, figsize=(8, 4))
# The 2x4 grid has exactly eight panels, so panel i shows sample i.
for idx, panel in enumerate(axes.flat):
    panel.imshow(digits.images[idx], cmap='gray')
    panel.set_title(f'Label: {y[idx]}')
    panel.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Score every (k, metric) pair with 5-fold cross-validation, plot the grid
# as a heatmap, and remember the best-scoring combination.
from itertools import product

ks = [1, 3, 5, 7, 11]
metrics = ['euclidean', 'manhattan']

# scores[row, col] holds the mean CV score for ks[row] with metrics[col].
scores = np.zeros((len(ks), len(metrics)))
for (row, n_neighbors), (col, metric) in product(enumerate(ks), enumerate(metrics)):
    # The scaler lives inside the pipeline so it is re-fit on each training fold.
    steps = [
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)),
    ]
    fold_scores = cross_val_score(Pipeline(steps), X, y, cv=5)
    scores[row, col] = fold_scores.mean()

# Heatmap (matplotlib imshow); origin='lower' puts the first k at the bottom.
fig, ax = plt.subplots()
im = ax.imshow(scores, origin='lower')
ax.set_xticks(range(len(metrics)))
ax.set_xticklabels(metrics)
ax.set_yticks(range(len(ks)))
ax.set_yticklabels(ks)
ax.set_xlabel('metric')
ax.set_ylabel('k')
plt.title('CV Accuracy Heatmap')
fig.colorbar(im, ax=ax)
plt.show()

# argmax returns the first maximum in row-major order, so ties resolve to the
# earliest k in `ks`, then the earliest metric in `metrics`.
best_i, best_j = np.unravel_index(scores.argmax(), scores.shape)
best_k = ks[best_i]
best_metric = metrics[best_j]
print('Best:', best_k, best_metric)

In [None]:
# Hold out 20% of the data (stratified by label), fit the selected kNN
# pipeline on the remainder, and print the held-out confusion matrix.
# NOTE(review): best_k / best_metric were selected by cross-validation over
# all of X, which includes these held-out rows, so this evaluation is likely
# somewhat optimistic. Consider splitting before the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

best_knn = KNeighborsClassifier(n_neighbors=best_k, metric=best_metric)
pipe_best = Pipeline([('scaler', StandardScaler()), ('knn', best_knn)])
# Pipeline.fit returns the fitted pipeline, so predict can be chained.
pred = pipe_best.fit(X_train, y_train).predict(X_test)

cm = confusion_matrix(y_test, pred)
print('Confusion matrix:\n', cm)

In [None]:
# Learning curve: mean training and cross-validation scores of the selected
# pipeline at five training-set sizes, from 10% to 100% of each CV training fold.
# shuffle=True is required for random_state to take effect. Without it
# scikit-learn ignores the seed and always takes the first rows of each
# training fold as the smaller subsets.
train_sizes, train_scores, val_scores = learning_curve(
    pipe_best,
    X,
    y,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 5),
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Score arrays have one row per training size and one column per CV fold,
# so averaging over axis=1 gives one point per training size.
plt.plot(train_sizes, train_scores.mean(axis=1), marker='o', label='train')
plt.plot(train_sizes, val_scores.mean(axis=1), marker='o', label='cv')
plt.xlabel('Train size')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend()
plt.show()

**TODOs:**
- Add a small discussion on why scaling matters for kNN and how learning curves inform data needs.