# Assignment 5 — Mini Project: Model Selection & Reproducible Report
*Prepared:* 2025-10-11

**Goal:** Choose a dataset; compare 2–3 algorithms; produce a concise executive summary with evidence.

**Pick one dataset:** digits | breast_cancer | wine | diabetes

In [None]:
# Setup
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve,
                             confusion_matrix, mean_absolute_error, mean_squared_error, r2_score)

# One seed, used for NumPy's global RNG here and passed as `random_state`
# to the split and the estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# Default figure size for plots that do not pass their own `figsize`.
plt.rcParams['figure.figsize'] = (7,4)

In [None]:
# Choose dataset here:
# NOTE: `dataset_name` is the setting that matters. `task` is overwritten
# later in this cell with the task type that load_dataset() returns.
task = 'classification'  # 'classification' or 'regression'
dataset_name = 'breast_cancer'  # 'digits' | 'breast_cancer' | 'wine' | 'diabetes'

def load_dataset(name):
    """Load one of the supported scikit-learn datasets by name.

    Parameters
    ----------
    name : str
        One of 'digits', 'breast_cancer', 'wine' or 'diabetes'.

    Returns
    -------
    tuple
        ``(data, target, task)`` where ``data`` and ``target`` come from the
        loaded dataset object and ``task`` is 'classification' or
        'regression'.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    if name == 'digits':
        bunch, kind = load_digits(), 'classification'
    elif name == 'breast_cancer':
        bunch, kind = load_breast_cancer(), 'classification'
    elif name == 'wine':
        bunch, kind = load_wine(), 'classification'
    elif name == 'diabetes':
        bunch, kind = load_diabetes(), 'regression'
    else:
        # Name the bad value and the valid choices so a typo is easy to spot.
        raise ValueError(
            f"unknown dataset {name!r}; expected one of: "
            "'digits', 'breast_cancer', 'wine', 'diabetes'"
        )
    return bunch.data, bunch.target, kind

# Load the selected dataset. The task type reported by the loader replaces
# whatever value `task` was given above, so the two can never disagree.
X, y, inferred = load_dataset(dataset_name)
task = inferred
print('Task:', task, ' Dataset:', dataset_name, ' X shape:', X.shape)

In [None]:
# EDA — a couple of simple plots
if dataset_name == 'digits':
    # load_dataset() returns only data/target, so load the dataset object
    # again to reach its `.images` attribute for display.
    d = load_digits()
    fig, axes = plt.subplots(2, 4, figsize=(8, 4))
    # Show the first eight samples with their labels.
    for ax, img, label in zip(axes.ravel(), d.images[:8], d.target[:8]):
        ax.imshow(img, cmap='gray')
        ax.set_title(f'Label: {label}')
        ax.axis('off')
    plt.tight_layout()
    plt.show()
else:
    # One histogram per feature column.
    df = pd.DataFrame(X)
    df.hist(bins=20, figsize=(10, 6))
    plt.suptitle('Feature Histograms')
    plt.show()

In [None]:
# Split
# Hold out 20% of the rows for testing. Classification splits are stratified
# on the labels; regression splits are not.
if task == 'classification':
    strat = y
else:
    strat = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=strat,
    random_state=RANDOM_STATE,
)

In [None]:
# Define candidate models
def _scaled(step_name, estimator):
    """Return a Pipeline that applies StandardScaler and then `estimator`."""
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Each entry is a (display name, estimator) pair. The tree is the only
# candidate that is used without a scaling step.
if task == 'classification':
    candidates = [
        ('LogReg', _scaled('clf', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))),
        ('Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
        ('kNN', _scaled('clf', KNeighborsClassifier(n_neighbors=5))),
    ]
else:
    candidates = [
        ('Linear', _scaled('reg', LinearRegression())),
        ('Ridge', _scaled('reg', Ridge(alpha=1.0, random_state=RANDOM_STATE))),
        ('kNN_Reg', _scaled('reg', KNeighborsRegressor(n_neighbors=7))),
    ]

# Evaluate candidates
# Fit every candidate on the training split and score it once on the held-out
# test split. One dict per model is collected and turned into a table indexed
# by model name.
rows = []
for name, model in candidates:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    if task == 'classification':
        # Weighted averaging lets the same metrics serve binary and
        # multiclass datasets.
        row = dict(Model=name,
                   Acc=accuracy_score(y_test, pred),
                   F1=f1_score(y_test, pred, average='weighted'),
                   Prec=precision_score(y_test, pred, average='weighted', zero_division=0),
                   Rec=recall_score(y_test, pred, average='weighted'))
    else:
        # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which
        # recent scikit-learn releases no longer accept.
        row = dict(Model=name,
                   MAE=mean_absolute_error(y_test, pred),
                   RMSE=float(np.sqrt(mean_squared_error(y_test, pred))),
                   R2=r2_score(y_test, pred))
    rows.append(row)

metrics_df = pd.DataFrame(rows).set_index('Model')
display(metrics_df)

In [None]:
# Simple matplotlib table of metrics
# Figure height grows with the number of models so rows are not squeezed.
fig_height = 0.5 + 0.4 * len(metrics_df)
fig, ax = plt.subplots(figsize=(6, fig_height))
ax.axis('off')

# Values are rounded to four decimals for display only; metrics_df itself
# is left untouched.
table = ax.table(
    cellText=np.round(metrics_df.values, 4),
    rowLabels=metrics_df.index,
    colLabels=metrics_df.columns,
    loc='center',
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.2)

plt.title('Model Comparison')
plt.show()

In [None]:
# Diagnostics per best model
# Rank by an explicitly named, higher-is-better metric rather than by column
# position, so adding or reordering metric columns cannot change the pick.
selection_metric = 'Acc' if task == 'classification' else 'R2'
best_name = metrics_df[selection_metric].idxmax()
best_model = dict(candidates)[best_name]
# Refit so this cell does not rely on the fitted state left by earlier cells.
best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)

if task == 'classification':
    cm = confusion_matrix(y_test, pred)
    print('Best model:', best_name, '\nConfusion matrix:\n', cm)
    # ROC/PR optional if predict_proba exists and binary
else:
    # Residuals (actual minus predicted) against predictions; the dashed
    # line marks zero error.
    plt.scatter(pred, y_test - pred, s=18)
    plt.axhline(0, linestyle='--')
    plt.xlabel('Predicted')
    plt.ylabel('Residual')
    plt.title(f'Residuals — {best_name}')
    plt.show()

**TODOs:**
- Add one robustness check (a bootstrap confidence interval or repeated train/test splits), then write a 6–10 bullet executive summary of the results.