# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.metrics import (mean_squared_error, r2_score, 
                            accuracy_score, confusion_matrix, ConfusionMatrixDisplay)
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
                             AdaBoostRegressor, AdaBoostClassifier)
from sklearn.datasets import fetch_california_housing, load_breast_cancer

# ==============================================================================
# REGRESSION PROBLEM (California Housing Dataset)
# ==============================================================================

# Fetch the California Housing data and unpack features, target and column names
housing = fetch_california_housing()
X_reg, y_reg = housing.data, housing.target
feature_names_reg = housing.feature_names

# a. Hold out 30% of the samples for testing (seeded so the split is repeatable)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.3,
    random_state=42,
)

# b. Baseline regressor: default hyperparameters, only the seed is set
base_tree_reg = DecisionTreeRegressor(random_state=42)
base_tree_reg.fit(X=X_train_reg, y=y_train_reg)

# c. Check performance
def evaluate_regression(model, X_train, X_test, y_train, y_test):
    """Score a fitted regressor on the training and test splits.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        dict with the keys ``'train_mse'``, ``'test_mse'``, ``'train_r2'``
        and ``'test_r2'``.
    """
    # Predict once per split; MSE and R² both reuse the same predictions
    # instead of re-running the model for every metric.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    return {
        'train_mse': mean_squared_error(y_train, train_pred),
        'test_mse': mean_squared_error(y_test, test_pred),
        'train_r2': r2_score(y_train, train_pred),
        'test_r2': r2_score(y_test, test_pred),
    }

base_reg_perf = evaluate_regression(base_tree_reg, X_train_reg, X_test_reg, y_train_reg, y_test_reg)

# d. Cost Complexity Pruning
path_reg = base_tree_reg.cost_complexity_pruning_path(X_train_reg, y_train_reg)
ccp_alphas_reg = path_reg.ccp_alphas[:-1]  # Remove maximum alpha

# Sanitise the candidates before handing them to `ccp_alpha`:
#  - floating-point round-off in the path can yield tiny negative alphas,
#    which the tree estimators reject (ccp_alpha must be non-negative);
#  - duplicated alphas would be cross-validated more than once for no gain.
ccp_alphas_reg = np.unique(np.clip(ccp_alphas_reg, 0.0, None))
if ccp_alphas_reg.size == 0:
    # The path held a single alpha; keep "no pruning" as the only candidate
    # so the search (and np.argmin) below still has something to work with.
    ccp_alphas_reg = np.array([0.0])

# Find optimal alpha using cross-validation.
# NOTE: one 5-fold CV run per candidate alpha; this can be slow when the
# pruning path is long.
scores_reg = []
for alpha in ccp_alphas_reg:
    tree_reg = DecisionTreeRegressor(ccp_alpha=alpha, random_state=42)
    score = np.mean(cross_val_score(tree_reg, X_train_reg, y_train_reg, cv=5,
                                   scoring='neg_mean_squared_error'))
    # The scorer returns negated MSE; flip the sign to store plain MSE
    scores_reg.append(-score)

# Candidate with the lowest cross-validated MSE
optimal_alpha_reg = ccp_alphas_reg[np.argmin(scores_reg)]

# Build pruned tree
pruned_tree_reg = DecisionTreeRegressor(ccp_alpha=optimal_alpha_reg, random_state=42)
pruned_tree_reg.fit(X_train_reg, y_train_reg)
pruned_reg_perf = evaluate_regression(pruned_tree_reg, X_train_reg, X_test_reg, y_train_reg, y_test_reg)

# e. Random Forest: 100 trees, seeded
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100)
rf_reg.fit(X=X_train_reg, y=y_train_reg)
rf_reg_perf = evaluate_regression(
    rf_reg, X_train_reg, X_test_reg, y_train_reg, y_test_reg
)

# f. AdaBoost over depth-1 trees (decision stumps): 200 rounds, learning rate 0.1
ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=1),
    random_state=42,
    learning_rate=0.1,
    n_estimators=200,
)
ada_reg.fit(X=X_train_reg, y=y_train_reg)
ada_reg_perf = evaluate_regression(
    ada_reg, X_train_reg, X_test_reg, y_train_reg, y_test_reg
)

# ==============================================================================
# CLASSIFICATION PROBLEM (Breast Cancer Dataset)
# ==============================================================================

# Load the Breast Cancer data and unpack features, labels and column names
cancer = load_breast_cancer()
X_clf, y_clf = cancer.data, cancer.target
feature_names_clf = cancer.feature_names

# a. Hold out 30% of the samples for testing (seeded so the split is repeatable)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    test_size=0.3,
    random_state=42,
)

# b. Baseline classifier: default hyperparameters, only the seed is set
base_tree_clf = DecisionTreeClassifier(random_state=42)
base_tree_clf.fit(X=X_train_clf, y=y_train_clf)

# c. Check performance
def evaluate_classification(model, X_train, X_test, y_train, y_test):
    """Return the accuracy of a fitted classifier on both data splits.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict with the keys ``'train_acc'`` and ``'test_acc'``.
    """
    # (result key, features, true labels) for each split, train first
    splits = (
        ('train_acc', X_train, y_train),
        ('test_acc', X_test, y_test),
    )
    return {
        key: accuracy_score(labels, model.predict(features))
        for key, features, labels in splits
    }

base_clf_perf = evaluate_classification(base_tree_clf, X_train_clf, X_test_clf, y_train_clf, y_test_clf)

# d. Cost Complexity Pruning
path_clf = base_tree_clf.cost_complexity_pruning_path(X_train_clf, y_train_clf)
ccp_alphas_clf = path_clf.ccp_alphas[:-1]  # Remove maximum alpha

# Sanitise the candidates before handing them to `ccp_alpha`:
#  - floating-point round-off in the path can yield tiny negative alphas,
#    which the tree estimators reject (ccp_alpha must be non-negative);
#  - duplicated alphas would be cross-validated more than once for no gain.
ccp_alphas_clf = np.unique(np.clip(ccp_alphas_clf, 0.0, None))
if ccp_alphas_clf.size == 0:
    # The path held a single alpha; keep "no pruning" as the only candidate
    # so the search (and np.argmin) below still has something to work with.
    ccp_alphas_clf = np.array([0.0])

# Find optimal alpha using cross-validation (one 5-fold run per candidate)
scores_clf = []
for alpha in ccp_alphas_clf:
    tree_clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    score = np.mean(cross_val_score(tree_clf, X_train_clf, y_train_clf, cv=5, scoring='accuracy'))
    scores_clf.append(1 - score)  # Convert to error rate

# Candidate with the lowest cross-validated error rate
optimal_alpha_clf = ccp_alphas_clf[np.argmin(scores_clf)]

# Build pruned tree
pruned_tree_clf = DecisionTreeClassifier(ccp_alpha=optimal_alpha_clf, random_state=42)
pruned_tree_clf.fit(X_train_clf, y_train_clf)
pruned_clf_perf = evaluate_classification(pruned_tree_clf, X_train_clf, X_test_clf, y_train_clf, y_test_clf)

# e. Random Forest: 100 trees, seeded
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X=X_train_clf, y=y_train_clf)
rf_clf_perf = evaluate_classification(
    rf_clf, X_train_clf, X_test_clf, y_train_clf, y_test_clf
)

# f. AdaBoost over depth-1 trees (decision stumps): 200 rounds, learning rate 0.1
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    random_state=42,
    learning_rate=0.1,
    n_estimators=200,
)
ada_clf.fit(X=X_train_clf, y=y_train_clf)
ada_clf_perf = evaluate_classification(
    ada_clf, X_train_clf, X_test_clf, y_train_clf, y_test_clf
)

# ==============================================================================
# Visualization and Results
# ==============================================================================

# One 2x2 figure: regression panels on the top row, classification panels on
# the bottom row. Explicit Axes objects are used throughout so that every
# panel is drawn on this figure regardless of which axes is "current".
fig, ((ax_reg_bar, ax_reg_ccp), (ax_clf_bar, ax_clf_cm)) = plt.subplots(
    2, 2, figsize=(18, 12))

# Regression Results: test MSE per model
models_reg = ['Base Tree', 'Pruned Tree', 'Random Forest', 'AdaBoost']
mse_test_reg = [base_reg_perf['test_mse'], pruned_reg_perf['test_mse'],
               rf_reg_perf['test_mse'], ada_reg_perf['test_mse']]
ax_reg_bar.bar(models_reg, mse_test_reg)
ax_reg_bar.set_title('Regression Model Comparison (Test MSE)')
ax_reg_bar.set_ylabel('MSE')

# Cross-validated MSE for each candidate alpha; the red dot marks the minimum
ax_reg_ccp.plot(ccp_alphas_reg, scores_reg)
ax_reg_ccp.scatter(optimal_alpha_reg, np.min(scores_reg), c='red')
ax_reg_ccp.set_title('Regression: Cost Complexity Pruning')
ax_reg_ccp.set_xlabel('alpha')
ax_reg_ccp.set_ylabel('Cross-Validated MSE')

# Classification Results: test accuracy per model
models_clf = ['Base Tree', 'Pruned Tree', 'Random Forest', 'AdaBoost']
acc_test_clf = [base_clf_perf['test_acc'], pruned_clf_perf['test_acc'],
               rf_clf_perf['test_acc'], ada_clf_perf['test_acc']]
ax_clf_bar.bar(models_clf, acc_test_clf)
ax_clf_bar.set_title('Classification Model Comparison (Test Accuracy)')
ax_clf_bar.set_ylim(0.8, 1.0)
ax_clf_bar.set_ylabel('Accuracy')

# Pass the target axes explicitly: without `ax`, from_estimator creates a
# brand-new figure, which would leave this grid cell empty and send the
# title and tight_layout calls to the wrong figure.
ConfusionMatrixDisplay.from_estimator(ada_clf, X_test_clf, y_test_clf, ax=ax_clf_cm)
ax_clf_cm.set_title('AdaBoost Confusion Matrix')

fig.tight_layout()
plt.show()

# Report the headline metrics, one line per model.
# Each group is emitted with a single print call; sep="\n" places every
# entry on its own line.
print(
    "Regression Results:",
    f"Base Tree - Train R²: {base_reg_perf['train_r2']:.3f}, Test R²: {base_reg_perf['test_r2']:.3f}",
    f"Pruned Tree - Test R²: {pruned_reg_perf['test_r2']:.3f}",
    f"Random Forest - Test R²: {rf_reg_perf['test_r2']:.3f}",
    # The trailing "\n" leaves a blank line before the classification section
    f"AdaBoost - Test R²: {ada_reg_perf['test_r2']:.3f}\n",
    sep="\n",
)

print(
    "Classification Results:",
    f"Base Tree - Test Accuracy: {base_clf_perf['test_acc']:.3f}",
    f"Pruned Tree - Test Accuracy: {pruned_clf_perf['test_acc']:.3f}",
    f"Random Forest - Test Accuracy: {rf_clf_perf['test_acc']:.3f}",
    f"AdaBoost - Test Accuracy: {ada_clf_perf['test_acc']:.3f}",
    sep="\n",
)