In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# 1. Load dataset and hold out a portion that is never used for fitting
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 2. Train a large (unpruned) regression tree
reg = DecisionTreeRegressor(random_state=0)
reg.fit(X_train, y_train)

# 3. Get pruning path: the effective alphas at which subtrees are pruned and
#    the total leaf impurity of the tree that remains at each of them
path = reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

print("Number of alphas:", len(ccp_alphas))

# Floating-point round-off in the path can leave tiny negative alphas, which
# DecisionTreeRegressor rejects as an invalid ccp_alpha; clamp them to zero.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)

# The final alpha prunes the tree down to its root node alone. That trivial
# model carries no information and stretches the x-axis of every plot, so it
# is dropped (alphas and impurities stay aligned).
if len(ccp_alphas) > 1:
    ccp_alphas, impurities = ccp_alphas[:-1], impurities[:-1]

# The full path has one entry per pruning step (thousands for this dataset).
# Fitting one tree per entry is wasteful because neighbouring alphas give
# almost identical trees, so keep an evenly spaced subset of the path.
MAX_ALPHAS = 200  # upper bound on the number of trees trained downstream
if len(ccp_alphas) > MAX_ALPHAS:
    keep = np.unique(
        np.linspace(0, len(ccp_alphas) - 1, MAX_ALPHAS).round().astype(int)
    )
    ccp_alphas, impurities = ccp_alphas[keep], impurities[keep]

print("Alphas kept for evaluation:", len(ccp_alphas))

# 4. Train a sequence of trees with different alphas
regs = []
for ccp_alpha in ccp_alphas:
    r = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    r.fit(X_train, y_train)
    regs.append(r)

# 5. Evaluate training and validation scores
# Picking alpha by its test score and then reporting that same score gives an
# optimistically biased estimate. The held-out data is therefore split in two:
# one half selects alpha (validation), the other is scored exactly once at the
# end. No tree has seen either half, because all were fitted on X_train only.
X_val, X_holdout, y_val, y_holdout = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0
)
train_scores = [r.score(X_train, y_train) for r in regs]
val_scores = [r.score(X_val, y_val) for r in regs]

# 6. Plot accuracy vs alpha
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(ccp_alphas, train_scores, marker="o", label="Train score", drawstyle="steps-post")
ax.plot(ccp_alphas, val_scores, marker="o", label="Validation score", drawstyle="steps-post")
ax.set_xlabel("ccp_alpha")
ax.set_ylabel("R^2 score")
ax.set_title("Cost-Complexity Pruning (DecisionTreeRegressor)")
ax.legend()
plt.show()

# 7. Plot cost-complexity curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(ccp_alphas, impurities, marker="o", drawstyle="steps-post")
ax.set_xlabel("ccp_alpha")
ax.set_ylabel("Total Impurity of Leaves")
ax.set_title("Cost-Complexity Pruning Path")
plt.show()

# 8. Visualize one of the pruned trees
best_alpha_idx = int(np.argmax(val_scores))  # alpha chosen on validation data only
best_alpha = ccp_alphas[best_alpha_idx]
best_reg = regs[best_alpha_idx]

# Unbiased estimate: the selected tree is scored once on the untouched half.
test_score = best_reg.score(X_holdout, y_holdout)

print(
    f"Best alpha = {best_alpha:.5f}, "
    f"Validation R^2 = {val_scores[best_alpha_idx]:.3f}, "
    f"Test R^2 = {test_score:.3f}"
)

fig, ax = plt.subplots(figsize=(16, 8))
plot_tree(
    best_reg,
    filled=True,
    max_depth=3,   # just for readability
    feature_names=fetch_california_housing().feature_names,
    ax=ax,
)
ax.set_title(f"Pruned Decision Tree Regressor (ccp_alpha={best_alpha:.5f})")
plt.show()


Number of alphas: 13935


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

# 1. Load dataset and hold out a portion that is never used for fitting
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 2. Train a large (unpruned) regression tree
reg = DecisionTreeRegressor(random_state=0)
reg.fit(X_train, y_train)

# 3. Get pruning path: the effective alphas at which subtrees are pruned and
#    the total leaf impurity of the tree that remains at each of them
path = reg.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

print("Number of alphas:", len(ccp_alphas))

# Floating-point round-off in the path can leave tiny negative alphas, which
# DecisionTreeRegressor rejects as an invalid ccp_alpha; clamp them to zero.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)

# The final alpha prunes the tree down to its root node alone. That trivial
# model carries no information and stretches the x-axis of every plot, so it
# is dropped (alphas and impurities stay aligned).
if len(ccp_alphas) > 1:
    ccp_alphas, impurities = ccp_alphas[:-1], impurities[:-1]

# Even with every core busy, one tree per pruning step (thousands here) is
# wasteful: neighbouring alphas give almost identical trees. Keep an evenly
# spaced subset of the path instead.
MAX_ALPHAS = 200  # upper bound on the number of trees trained downstream
if len(ccp_alphas) > MAX_ALPHAS:
    keep = np.unique(
        np.linspace(0, len(ccp_alphas) - 1, MAX_ALPHAS).round().astype(int)
    )
    ccp_alphas, impurities = ccp_alphas[keep], impurities[keep]

print("Alphas kept for evaluation:", len(ccp_alphas))

# 4. Train trees in parallel
def train_tree(ccp_alpha, X=None, y=None, random_state=0):
    """Fit one cost-complexity-pruned regression tree.

    Parameters
    ----------
    ccp_alpha : float
        Pruning strength forwarded to ``DecisionTreeRegressor``.
    X, y : array-like, optional
        Training features and targets. When both are omitted, the notebook's
        module-level ``X_train`` / ``y_train`` are used, which keeps the
        one-argument call ``train_tree(alpha)`` working unchanged.
    random_state : int, default=0
        Seed forwarded to ``DecisionTreeRegressor``.

    Returns
    -------
    DecisionTreeRegressor
        The fitted estimator.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    # A lone X or y would silently be paired with the global counterpart.
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together or both omitted")
    if X is None:
        X, y = X_train, y_train
    r = DecisionTreeRegressor(random_state=random_state, ccp_alpha=ccp_alpha)
    r.fit(X, y)
    return r

# One fitted tree per alpha, trained across worker processes.
regs = Parallel(n_jobs=-1)(delayed(train_tree)(alpha) for alpha in ccp_alphas)
# n_jobs=-1 → use all CPU cores

# 5. Evaluate training and validation scores
# Picking alpha by its test score and then reporting that same score gives an
# optimistically biased estimate. The held-out data is therefore split in two:
# one half selects alpha (validation), the other is scored exactly once at the
# end. No tree has seen either half of the held-out data during fitting.
X_val, X_holdout, y_val, y_holdout = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0
)
train_scores = [r.score(X_train, y_train) for r in regs]
val_scores = [r.score(X_val, y_val) for r in regs]

# 6. Plot accuracy vs alpha
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(ccp_alphas, train_scores, marker="o", label="Train score", drawstyle="steps-post")
ax.plot(ccp_alphas, val_scores, marker="o", label="Validation score", drawstyle="steps-post")
ax.set_xlabel("ccp_alpha")
ax.set_ylabel("R^2 score")
ax.set_title("Cost-Complexity Pruning (DecisionTreeRegressor)")
ax.legend()
plt.show()

# 7. Plot cost-complexity curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(ccp_alphas, impurities, marker="o", drawstyle="steps-post")
ax.set_xlabel("ccp_alpha")
ax.set_ylabel("Total Impurity of Leaves")
ax.set_title("Cost-Complexity Pruning Path")
plt.show()

# 8. Visualize one of the pruned trees
best_alpha_idx = int(np.argmax(val_scores))  # alpha chosen on validation data only
best_alpha = ccp_alphas[best_alpha_idx]
best_reg = regs[best_alpha_idx]

# Unbiased estimate: the selected tree is scored once on the untouched half.
test_score = best_reg.score(X_holdout, y_holdout)

print(
    f"Best alpha = {best_alpha:.5f}, "
    f"Validation R^2 = {val_scores[best_alpha_idx]:.3f}, "
    f"Test R^2 = {test_score:.3f}"
)

fig, ax = plt.subplots(figsize=(16, 8))
plot_tree(
    best_reg,
    filled=True,
    max_depth=3,   # just for readability
    feature_names=fetch_california_housing().feature_names,
    ax=ax,
)
ax.set_title(f"Pruned Decision Tree Regressor (ccp_alpha={best_alpha:.5f})")
plt.show()
