In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset
boston = pd.read_csv('boston_house_prices.csv')
X = boston.drop('MEDV', axis=1)
y = boston['MEDV']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=42)

# Fit the model on the training data
regressor.fit(X_train, y_train)

# Predict on the test data
y_pred = regressor.predict(X_test)

# Calculate Mean Squared Error (MSE) as the evaluation metric
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 10.416078431372549


### Show that when the error function is squared error that the expected value at any leaf is the mean.

In [2]:
# Function to calculate the mean of values in a leaf node
def leaf_mean(node):
    return np.mean(y[regressor.apply(X) == node])

# Get the leaf nodes in the tree
leaf_nodes = np.unique(regressor.apply(X))

# Calculate the mean for each leaf node
means_at_leaves = [leaf_mean(node) for node in leaf_nodes]

# Print the means at each leaf
for i, node in enumerate(leaf_nodes):
    print(f"Leaf Node {node}: Mean = {means_at_leaves[i]}")

Leaf Node 4: Mean = 50.0
Leaf Node 5: Mean = 27.9
Leaf Node 10: Mean = 11.9
Leaf Node 12: Mean = 17.1
Leaf Node 13: Mean = 20.15
Leaf Node 16: Mean = 15.3
Leaf Node 22: Mean = 22.6
Leaf Node 23: Mean = 23.4
Leaf Node 24: Mean = 24.3
Leaf Node 29: Mean = 23.8
Leaf Node 31: Mean = 23.25
Leaf Node 33: Mean = 22.9
Leaf Node 35: Mean = 22.8
Leaf Node 36: Mean = 22.7
Leaf Node 37: Mean = 20.299999999999997
Leaf Node 39: Mean = 19.1
Leaf Node 40: Mean = 19.6
Leaf Node 43: Mean = 19.4
Leaf Node 44: Mean = 19.7
Leaf Node 45: Mean = 20.1
Leaf Node 48: Mean = 24.0
Leaf Node 49: Mean = 24.3
Leaf Node 50: Mean = 26.4
Leaf Node 53: Mean = 19.3
Leaf Node 54: Mean = 19.3
Leaf Node 55: Mean = 19.2
Leaf Node 60: Mean = 18.2
Leaf Node 61: Mean = 18.966666666666665
Leaf Node 62: Mean = 16.1
Leaf Node 69: Mean = 22.6
Leaf Node 70: Mean = 23.1
Leaf Node 72: Mean = 22.4
Leaf Node 73: Mean = 21.7
Leaf Node 74: Mean = 24.7
Leaf Node 77: Mean = 20.9
Leaf Node 80: Mean = 20.65
Leaf Node 82: Mean = 21.04999999999