# Assignment 3 — Decision Tree with Cost-Complexity Pruning (Wine)
*Prepared:* 2025-10-11

**Goal:** Train a multiclass decision tree, study overfitting, and select `ccp_alpha` via the cost-complexity pruning path.

**Dataset:** `sklearn.datasets.load_wine()`

In [None]:
# Setup
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

# One shared seed, reused by the split, the trees and the CV folds below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Default size (in inches) for every figure that does not set its own.
plt.rcParams.update({'figure.figsize': (9, 5)})

In [None]:
# Load the Wine dataset and hold out 20% of it as a test set.
data = load_wine()
X = data.data
y = data.target
feature_names = data.feature_names

# Stratify on the labels so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [None]:
# Baseline: a tree with default settings (no pruning), fitted on the
# training split and scored on both splits.
tree0 = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
y_pred = tree0.predict(X_test)
print(
    'Baseline Acc (train/test):',
    tree0.score(X_train, y_train),
    tree0.score(X_test, y_test),
)

# Draw only the top levels of the tree (max_depth=3 limits the drawing,
# not the fitted model).
plt.figure(figsize=(12, 6))
plot_tree(
    tree0,
    max_depth=3,
    filled=True,
    feature_names=feature_names,
    class_names=list(map(str, np.unique(y))),
)
plt.title('Decision Tree (truncated)')
plt.show()

In [None]:
# Cost-complexity pruning path of the baseline tree, computed on the
# training split; only the alpha values are pulled out for later use.
path = tree0.cost_complexity_pruning_path(X=X_train, y=y_train)
ccp_alphas = path.ccp_alphas

# Simple CV over ccp_alpha
def cv_for_alpha(ccp_alpha, X, y):
    """Return the mean 5-fold stratified CV accuracy for one ``ccp_alpha``.

    A fresh ``DecisionTreeClassifier`` is fitted on the training part of
    each fold and scored on the held-out part; the fold scores are averaged.

    Parameters
    ----------
    ccp_alpha : pruning strength passed to ``DecisionTreeClassifier``.
    X, y : features and labels. Both are indexed with the integer index
        arrays produced by the splitter (``X[idx]``), so they are assumed
        to be NumPy arrays -- TODO confirm if other containers are passed.

    Returns
    -------
    The mean of the per-fold scores, as computed by ``np.mean``.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    def fold_accuracy(train_idx, valid_idx):
        # Fit on the training indices, score on the validation indices.
        model = DecisionTreeClassifier(random_state=RANDOM_STATE, ccp_alpha=ccp_alpha)
        model.fit(X[train_idx], y[train_idx])
        return model.score(X[valid_idx], y[valid_idx])

    fold_scores = [
        fold_accuracy(train_idx, valid_idx)
        for train_idx, valid_idx in splitter.split(X, y)
    ]
    return np.mean(fold_scores)

# Candidate alphas: use the effective alphas from the pruning path itself
# rather than an evenly spaced grid. The pruned tree only changes at these
# values, so a linspace grid wastes evaluations on duplicate trees and can
# skip distinct subtrees that sit close together.
# - clip: floating-point error can produce slightly negative alphas, and
#   ccp_alpha must be non-negative.
# - unique: removes duplicates and guarantees ascending order.
alphas = np.unique(np.clip(ccp_alphas, 0.0, None))
if alphas.size > 1:
    # The largest alpha prunes the tree down to its root node (a trivial
    # classifier), so it is not a useful candidate.
    alphas = alphas[:-1]

cv_scores = [cv_for_alpha(a, X_train, y_train) for a in alphas]
plt.plot(alphas, cv_scores, marker='o'); plt.xlabel('ccp_alpha'); plt.ylabel('CV accuracy'); plt.title('Pruning Path'); plt.show()

# Break ties in favour of the LARGEST alpha, i.e. the most heavily pruned
# (simplest) tree that reaches the best CV accuracy. `alphas` is ascending,
# so that is the last index holding the maximum; searching the reversed
# list with argmax finds it.
best_idx = len(cv_scores) - 1 - int(np.argmax(cv_scores[::-1]))
best_alpha = alphas[best_idx]
print('Best ccp_alpha:', best_alpha)

In [None]:
# Refit on the full training split with the selected alpha, then report
# accuracy on the training and test splits.
tree_best = DecisionTreeClassifier(
    ccp_alpha=best_alpha,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)
print(
    'Acc train/test:',
    tree_best.score(X_train, y_train),
    tree_best.score(X_test, y_test),
)

# Feature importances, sorted ascending so the largest bar is drawn on top.
imp = pd.Series(tree_best.feature_importances_, index=feature_names).sort_values()
imp.plot.barh()
plt.title('Feature Importances')
plt.show()

# Confusion matrix of the pruned tree on the test split.
cm = confusion_matrix(y_test, tree_best.predict(X_test))
print('Confusion matrix:\n', cm)

**TODOs:**
- Write one paragraph on pruning’s effect on variance, and interpret the top-3 features.