In [None]:
#@title Imports & setup (run this first)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer, fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score

---
# Classification Trees

## **Example**: Binary Classification with DT (Banknote Authentication)

<img src="https://www.neuraldesigner.com/images/banknote-authentication.webp" height=200>

<img src="https://ars.els-cdn.com/content/image/1-s2.0-S0925231213003202-gr6.jpg" height=200>

[Image 1 Source](https://www.neuraldesigner.com/blog/banknote-authentication/),
[Image 2 Source](https://www.sciencedirect.com/science/article/abs/pii/S0925231213003202)

In [None]:
#@title **1) Load the dataset**

# Read the banknote measurements from the CSV expected next to this notebook.
bank_data = pd.read_csv(filepath_or_buffer='banknotes.csv')

# Show the first five rows as a quick sanity check of the columns.
bank_data.head(5)

In [None]:
# Separate the label column from the feature columns.
y = bank_data['class']
X = bank_data.drop(columns=['class'])

# Report the feature-matrix dimensions, then preview its first rows.
print("Shape:", X.shape)
X.head()

In [None]:
#@title **2) Split the dataset into train/test datasets**

# Hold out 30% of the rows; that portion is reserved for the **final** evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

# Display the (rows, columns) of both partitions.
(X_train.shape, X_test.shape)

In [None]:
#@title **3) Train a Baseline Decision Tree (default settings)**

# Reference model: a tree with every hyperparameter left at its default
# (only the seed is pinned), fitted on the training split.
dt_baseline = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred_test = dt_baseline.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)

print(f"Baseline Test Accuracy: {acc_test:.4f}")
print("\nClassification report (test):\n")

# NOTE(review): target_names assumes label 0 is "Fake" and label 1 is
# "Genuine" — confirm against the encoding used in banknotes.csv.
report = classification_report(y_test, y_pred_test, target_names=["Fake","Genuine"])
print(report)

In [None]:
# Confusion matrix of the baseline tree on the held-out test rows
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_test)

# Label ticks and axes so the figure can be read on its own, and print the
# cell counts as plain integers (fmt="d").
# NOTE(review): the tick labels assume label 0 is "Fake" and label 1 is
# "Genuine", matching the target_names passed to classification_report —
# confirm against the CSV's encoding.
classes = ["Fake", "Genuine"]
sns.heatmap(cm, annot=True, fmt="d", xticklabels=classes, yticklabels=classes)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
#@title **4) Print the Decision Tree**
# Wide, short canvas so the tree's levels have room to spread horizontally.
fig = plt.figure(figsize=plt.figaspect(0.35))
ax = fig.add_subplot(1, 1, 1)

# feature_names is passed as a plain list: some scikit-learn releases
# validate this argument strictly and reject a pandas Index.
plot_tree(
    dt_baseline,
    filled=True,
    class_names=["Fake", "Genuine"],
    feature_names=list(X.columns),
    ax=ax,
    fontsize=6,
)
plt.show()

In [None]:
#@title **5) Feature importance**

# Pair each importance score with its column name and rank from most to
# least important.
importances = (
    pd.Series(dt_baseline.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
importances


---
# **Example 2** - Cancer Classification Using Decision Trees

Let's try to use DTs to diagnose (breast) cancer patients. Further, let us investigate how various settings impact the performance.

- Load and inspect a real medical dataset.
- Build a baseline Decision Tree classifier (with default settings).
- Evaluate performance (accuracy, confusion matrix, precision/recall/F1).
- Explore how different hyperparameters affect performance:
  - `max_depth` (integer or `None`)
  - `min_samples_split` (integer or fraction)
  - `min_samples_leaf` (integer or fraction)
  - `max_features` (integer, fraction, `"sqrt"`, `"log2"`, or `None`)  

![What does breast cancer look like on mammography](https://healthimaging.com/sites/default/files/styles/gallery/public/2022-09/Series%20on%20annual%20mammograms%20showing%20cancer%20formation_RSNA.jpg.webp?itok=7rAqFvmS)

[image source](https://healthimaging.com/topics/medical-imaging/womens-imaging/breast-imaging/photo-gallery-what-does-breast-cancer-look-mammography)

In [None]:
#@title  1) Load & explore the dataset
# Fetch scikit-learn's bundled breast-cancer dataset and wrap it in pandas
# objects so the columns carry their feature names.
data = load_breast_cancer()
X = pd.DataFrame(
    data.data,
    columns=data.feature_names,
)
y = pd.Series(
    data.target,
    name="target",
)

# Report the feature-matrix dimensions, then preview its first rows.
print("Shape:", X.shape)
X.head()


In [None]:
# Share of each diagnosis in the full dataset, with the numeric labels
# replaced by readable names (0 -> malignant, 1 -> benign).
(
    y.value_counts(normalize=True)
     .rename(index={0:"malignant", 1:"benign"})
)

In [None]:
#@title 2) Create a train/test split

# Hold out 30% of the rows; that portion is reserved for the **final** evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

# Display the (rows, columns) of both partitions.
(X_train.shape, X_test.shape)


In [None]:
#@title 3) Baseline Decision Tree (default settings)

# Reference model: a tree with every hyperparameter left at its default
# (only the seed is pinned), fitted on the training split.
dt_baseline = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred_test = dt_baseline.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)

print(f"Baseline Test Accuracy: {acc_test:.4f}")
print("\nClassification report (test):\n")

# Label order follows the dataset's encoding used above: 0 = malignant, 1 = benign.
report = classification_report(y_test, y_pred_test, target_names=["malignant","benign"])
print(report)

In [None]:

# Confusion matrix of the baseline tree on the held-out test rows
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_test)

# Plot confusion matrix.
# fmt="d" prints the integer counts verbatim; seaborn's default ".2g" format
# would show counts of 100 or more in scientific notation (e.g. "1e+02").
classes = ["malignant","benign"]
sns.heatmap(cm, annot=True, fmt="d", xticklabels=classes, yticklabels=classes)

# Set the axis labels
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Candidate depths. None means: expand until all leaves are pure or the
# min_samples constraints are met.
depth_values = [None, *range(1, 21)]

# Mean 5-fold cross-validated accuracy on the training split for each depth.
cv_means = [
    cross_val_score(
        DecisionTreeClassifier(max_depth=depth, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for depth in depth_values
]

# Plot accuracy against the candidates (as strings, so None gets its own tick).
fig, ax = plt.subplots()
ax.plot(list(map(str, depth_values)), cv_means, marker="o")
ax.set_title("Cross-validated Accuracy vs max_depth")
ax.set_xlabel("max_depth")
ax.set_ylabel("Mean CV Accuracy")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
plt.show()

# argmax returns the first maximum, so ties resolve to the earliest candidate.
best_idx = int(np.argmax(cv_means))
best_depth = depth_values[best_idx]
print("Best max_depth by CV:", best_depth, "with mean accuracy:", f"{cv_means[best_idx]:.4f}")

In [None]:
# Candidate split thresholds: integers are sample counts, floats are fractions.
split_values = [2, 5, 10, 20, 0.01, 0.05, 0.1, 0.2]

# Mean 5-fold cross-validated accuracy on the training split for each threshold.
cv_means_split = [
    cross_val_score(
        DecisionTreeClassifier(min_samples_split=threshold, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for threshold in split_values
]

# Plot accuracy against the candidates (as strings, so they are evenly spaced).
fig, ax = plt.subplots()
ax.plot(list(map(str, split_values)), cv_means_split, marker="o")
ax.set_title("Cross-validated Accuracy vs min_samples_split")
ax.set_xlabel("min_samples_split")
ax.set_ylabel("Mean CV Accuracy")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
plt.show()

# argmax returns the first maximum, so ties resolve to the earliest candidate.
best_idx = int(np.argmax(cv_means_split))
print("Best min_samples_split by CV:", split_values[best_idx], "with mean accuracy:", f"{cv_means_split[best_idx]:.4f}")

In [None]:

# Candidate leaf sizes: integers are sample counts, floats are fractions.
leaf_values = [1, 2, 5, 10, 20, 0.01, 0.02, 0.05, 0.1]

# Mean 5-fold cross-validated accuracy on the training split for each leaf size.
cv_means_leaf = [
    cross_val_score(
        DecisionTreeClassifier(min_samples_leaf=leaf_size, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for leaf_size in leaf_values
]

# Plot accuracy against the candidates (as strings, so they are evenly spaced).
fig, ax = plt.subplots()
ax.plot(list(map(str, leaf_values)), cv_means_leaf, marker="o")
ax.set_title("Cross-validated Accuracy vs min_samples_leaf")
ax.set_xlabel("min_samples_leaf")
ax.set_ylabel("Mean CV Accuracy")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
plt.show()

# argmax returns the first maximum, so ties resolve to the earliest candidate.
best_idx = int(np.argmax(cv_means_leaf))
print("Best min_samples_leaf by CV:", leaf_values[best_idx], "with mean accuracy:", f"{cv_means_leaf[best_idx]:.4f}")


In [None]:

# Candidate feature budgets per split: None, a named rule, an integer count,
# or a fraction.
feature_settings = [None, "sqrt", "log2", 5, 10, 0.5, 0.8]

# Mean 5-fold cross-validated accuracy on the training split for each setting.
cv_means_feats = [
    cross_val_score(
        DecisionTreeClassifier(max_features=setting, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for setting in feature_settings
]

# Plot accuracy against the candidates (as strings, so mixed types share one axis).
fig, ax = plt.subplots()
ax.plot(list(map(str, feature_settings)), cv_means_feats, marker="o")
ax.set_title("Cross-validated Accuracy vs max_features")
ax.set_xlabel("max_features")
ax.set_ylabel("Mean CV Accuracy")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)
plt.show()

# argmax returns the first maximum, so ties resolve to the earliest candidate.
best_idx = int(np.argmax(cv_means_feats))
print("Best max_features by CV:", feature_settings[best_idx], "with mean accuracy:", f"{cv_means_feats[best_idx]:.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for an exhaustive grid search over the decision-tree
# hyperparameters (the same candidate lists as the individual sweeps, plus
# the split criterion and a wider depth range).
tree_para = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 30)),
    min_samples_leaf=[1, 2, 5, 10, 20, 0.01, 0.02, 0.05, 0.1],
    max_features=[None, "sqrt", "log2", 5, 10, 0.5, 0.8],
    min_samples_split=[2, 5, 10, 20, 0.01, 0.05, 0.1, 0.2],
)

# The full grid has 2 * 27 * 9 * 7 * 8 = 27,216 combinations, i.e. 136,080
# model fits with cv=5 — presumably why the search is left disabled.
# Uncomment the lines below to run it.
# clf = GridSearchCV(estimator=DecisionTreeClassifier(),
#                    param_grid=tree_para,
#                    cv=5,
#                    scoring='accuracy')
# clf.fit(X_train, y_train)
# print(clf.best_params_)

In [None]:
# 👉 TODO: Replace these with the values you consider best from the sweeps above
best_params = {'criterion': 'entropy', 'max_depth': 9, 'max_features': 0.5, 'min_samples_leaf': 5, 'min_samples_split': 10}

# Train the tuned tree on the training split (seed pinned for reproducibility).
dt_final = DecisionTreeClassifier(random_state=42, **best_params).fit(X_train, y_train)

# Evaluate once on the held-out test rows.
y_pred_test_final = dt_final.predict(X_test)
acc_test_final = accuracy_score(y_test, y_pred_test_final)
print("Final Test Accuracy:", f"{acc_test_final:.4f}")
print("\nClassification report (test):\n")
print(classification_report(y_test, y_pred_test_final, target_names=["malignant","benign"]))

# Confusion matrix for the final model (rows = true label, columns = predicted).
cm_final = confusion_matrix(y_test, y_pred_test_final)

fig, ax = plt.subplots()
image = ax.imshow(cm_final, interpolation='nearest')
ax.set_title("Final Model: Confusion Matrix (Test)")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
fig.colorbar(image, ax=ax)

class_labels = ["malignant","benign"]
ax.set_xticks([0,1])
ax.set_xticklabels(class_labels)
ax.set_yticks([0,1])
ax.set_yticklabels(class_labels)

# Write each count in the centre of its cell; x is the column, y is the row.
for (row, col), count in np.ndenumerate(cm_final):
    ax.text(col, row, count, ha="center", va="center")
plt.show()

In [None]:
# Rank the final tree's feature importances from most to least important.
importances = (
    pd.Series(dt_final.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features.
fig, ax = plt.subplots(figsize=(8, 10))
importances.head(20).plot.barh(ax=ax)
ax.set_title("Top 20 Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

importances.iloc[:10]


In [None]:
# Wide, short canvas so the tree's levels have room to spread horizontally.
fig = plt.figure(figsize=plt.figaspect(0.35))
ax = fig.add_subplot(1, 1, 1)

# feature_names is passed as a plain list: some scikit-learn releases
# validate this argument strictly and reject a NumPy array.
plot_tree(
    dt_final,
    filled=True,
    class_names=["malignant", "benign"],
    feature_names=list(data.feature_names),
    ax=ax,
    fontsize=6,
)
plt.show()

In [None]:
# Tabulate the final tree's feature importances, most important first.
importances = dt_final.feature_importances_
names = load_breast_cancer()['feature_names']

feature_importance = (
    pd.DataFrame({'Feature': names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
    # drop=True discards the pre-sort row labels; without it reset_index()
    # inserts them as a stray "index" column in the displayed table.
    .reset_index(drop=True)
)
feature_importance[:10]

## Random Forests
When used for classification, the trees "vote" when predicting. Use $\texttt{RandomForestClassifier}$. [Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

When used for regression, the mean of the individual trees' predictions is used. Use $\texttt{RandomForestRegressor}$. [Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

The parameters of the random forest are almost like those of a decision tree (after all, it is just multiple decision trees). The main new things are:

1. **n_estimators**: The number of trees in the forest. An integer. Any sufficiently large value is good.
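1. **max_features**: The number of features to consider when looking for the best split. A single decision tree has this setting too, but it matters more in a forest: giving each split a random subset of features makes the trees differ from one another.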

In [None]:
from sklearn import ensemble

# Initialize a forest with default hyperparameters. The seed is pinned so the
# reported accuracy is the same on every run, like the decision trees above.
rf = ensemble.RandomForestClassifier(random_state=42)

# Fit on the training split
rf.fit(X_train, y_train)

# Predict the held-out test rows
y_test_hat = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_test_hat)
print(
    f'''RF with default settings achieved {round(accuracy * 100, 1)}% accuracy.'''
)

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Base models (note: LR and SVM are scaled inside a pipeline, the tree is not).
lr = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=200, class_weight="balanced"))
])
# probability=True makes SVC shuffle the data internally to calibrate its
# probability estimates, so a seed is needed for the soft-voting scores to be
# reproducible from run to run.
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(kernel="rbf", C=1.0, probability=True,
                  class_weight="balanced", random_state=42))
])
dt = DecisionTreeClassifier(max_depth=5, random_state=42, class_weight="balanced")

# Soft voting (recommended when probabilities are available)
soft_voter = VotingClassifier(
    estimators=[("lr", lr), ("svm", svm), ("dt", dt)],
    voting="soft",  # average the predicted class probabilities
    weights=[1, 1, 1]  # you can tune these
)

soft_voter.fit(X_train, y_train)
y_pred = soft_voter.predict(X_test)
# f1_score uses its default positive label (1), which is "benign" in this
# dataset's encoding (0 = malignant, 1 = benign).
print("Soft Voting - Acc:", accuracy_score(y_test, y_pred), "F1:", f1_score(y_test, y_pred))

# Hard voting (majority vote on the predicted class labels)
hard_voter = VotingClassifier(
    estimators=[("lr", lr), ("svm", svm), ("dt", dt)],
    voting="hard"
)
hard_voter.fit(X_train, y_train)
y_pred = hard_voter.predict(X_test)
print("Hard Voting - Acc:", accuracy_score(y_test, y_pred), "F1:", f1_score(y_test, y_pred))

# Regression Task

In [None]:
# Path to the housing data. It gets its own name so that `data` — the Bunch
# loaded for the breast-cancer example — is not rebound to a string, which
# would break re-running the earlier cell that reads data.feature_names.
housing_csv = 'HousingData.csv'

# Rows containing any missing value are discarded.
raw_df = pd.read_csv(housing_csv).dropna()

# Work on a copy so raw_df itself is left untouched.
df_copy = raw_df.copy()

# Separate the target variable (y) and features (X)
y = df_copy['MEDV']  # MEDV is the regression target
X = df_copy.drop(columns=['MEDV'])  # every remaining column is a predictor

# We use `train_test_split` to split our data into a train and a test set
# (20% held out, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Reference regressor: an unconstrained tree (seed pinned), fitted on the
# training split and scored on the held-out rows.
dt_base = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_te_pred = dt_base.predict(X_test)

test_mse = mean_squared_error(y_test, y_te_pred)

print("Baseline Decision Tree Regressor")
print("-" * 32)
print("Test MSE: %.3f" % test_mse)

In [None]:
# Initialize a forest with default hyperparameters. The seed is pinned so the
# reported MSE is the same on every run, like the baseline tree regressor.
# (`ensemble` is the sklearn module imported in the Random Forest classifier cell.)
rf = ensemble.RandomForestRegressor(random_state=42)

# Fit on the training split
rf.fit(X_train, y_train)

# Predict the held-out test rows
y_test_hat = rf.predict(X_test)

mse = mean_squared_error(y_test, y_test_hat)
print(f'''RF with default settings achieved {round(mse, 3)} MSE.''')

In [None]:
# Candidate values for the two forest hyperparameters being compared.
feature_settings = [None, "sqrt", "log2", 5, 10, 0.5, 0.6, 0.7, 0.8]
estimators = [ 120, 180, 240, 300]

# NOTE: each configuration is scored on the test set. Choosing the "best" row
# from this printout therefore gives an optimistic test MSE; for model
# selection, prefer cross-validation on the training split.
for f in feature_settings:
    for n in estimators:
        # Initialize. The seed is pinned so that differences between rows come
        # from the hyperparameters rather than from run-to-run randomness.
        rf = ensemble.RandomForestRegressor(max_features=f, n_estimators=n,
                                            random_state=42)

        # Fit on the training split
        rf.fit(X_train, y_train)

        # Predict the held-out test rows
        y_test_hat = rf.predict(X_test)

        mse = mean_squared_error(y_test, y_test_hat)
        print(f'''RF with max_features={f} and n_estimators={n} achieved {round(mse, 3)} MSE.''')