In [None]:
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import numpy as np
# make_classification would also work, but the split criterion would then need a class-impurity measure (e.g. entropy) instead of MSE

In [None]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset,
# and therefore the same tree and plots, on every run.
SEED = 0
X, Y = make_regression(n_features=2, noise=0.25, random_state=SEED)
# Min-max scale the targets into [0, 1].
Y = (Y - Y.min()) / (Y.max() - Y.min())

In [None]:
# Scatter of the two features, colored by the (scaled) target value.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X[:, 0], X[:, 1], c=Y, cmap='viridis')
fig.colorbar(points, ax=ax, label='Target Variable (Y)')
ax.set_title('Scatter plot colored by target variable (Y)')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()

In [None]:
# Sanity check: X and Y should agree on the number of samples, and X should
# have the 2 feature columns requested from make_regression.
X.shape, Y.shape

In [None]:
def compute_mse(ypred, ys):
    """Return the mean squared error between predictions and targets.

    Args:
        ypred: Predicted value(s); a scalar is broadcast against ``ys``.
        ys: Array of true target values.

    Returns:
        The mean of the element-wise squared differences.
    """
    residuals = ypred - ys
    return np.mean(np.square(residuals))

def get_split(X, Y):
    """Find the single-feature threshold that best partitions the targets.

    Every feature column of ``X`` is scanned, and each distinct value except
    the largest is tried as a threshold: rows with ``feature > value`` form one
    group and rows with ``feature <= value`` form the other. Each group is
    predicted by its mean, and the candidate is scored by the summed squared
    error of both groups (each group's MSE weighted by its size), so a split
    cannot look good merely by isolating one or two samples.

    Args:
        X: 2-D array; rows are samples, columns are features.
        Y: 1-D array of targets aligned with the rows of ``X``.

    Returns:
        ``(feature_index, threshold)`` for the lowest-error split, or ``None``
        when no threshold separates the rows into two non-empty groups
        (fewer than two samples, or every feature is constant).
    """
    best_split = None
    best_sse = np.inf
    for feature in range(X.shape[-1]):
        column = X[:, feature]
        # Distinct sorted values. The maximum is dropped because "> max" is
        # always empty; the minimum is kept because "<= min" is not.
        thresholds = np.unique(column)[:-1]
        for value in thresholds:
            group1outs = Y[column > value]
            group2outs = Y[column <= value]
            if group1outs.size == 0 or group2outs.size == 0:
                # Defensive: never score a split that leaves one side empty.
                continue
            # Sum of squared errors around each group's mean prediction.
            sse1 = np.sum((group1outs - np.mean(group1outs)) ** 2)
            sse2 = np.sum((group2outs - np.mean(group2outs)) ** 2)
            total_sse = sse1 + sse2
            if total_sse < best_sse:
                best_split = (feature, value)
                best_sse = total_sse
    return best_split
    
def build_tree(X, Y, max_depth=10, current_depth=0):
    """Recursively grow a regression tree as nested dicts.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        Y: Targets aligned with the rows of ``X``.
        max_depth: Number of split levels allowed; no node is created at
            ``current_depth >= max_depth``.
        current_depth: Depth of the node being built (0 for the root).

    Returns:
        ``None`` when the depth limit is reached, ``X`` is empty, or
        ``get_split`` finds no split. Otherwise a dict with keys ``'split'``
        (the ``(feature, threshold)`` pair), ``'left'`` (subtree for rows with
        ``feature > threshold``) and ``'right'`` (subtree for rows with
        ``feature <= threshold``).
    """
    depth_exhausted = current_depth >= max_depth
    if depth_exhausted or len(X) == 0:
        return None

    split = get_split(X, Y)
    if not split:
        return None

    feature, threshold = split
    goes_left = X[:, feature] > threshold
    goes_right = X[:, feature] <= threshold
    next_depth = current_depth + 1

    left_tree = build_tree(X[goes_left], Y[goes_left], max_depth, next_depth)
    right_tree = build_tree(X[goes_right], Y[goes_right], max_depth, next_depth)

    return {'split': split, 'left': left_tree, 'right': right_tree}

In [None]:
# Fit a tree limited to three levels of splits (node depths 0, 1 and 2);
# build_tree returns None for anything at depth >= max_depth.
splits = build_tree(X, Y, max_depth=3)

In [None]:
depth_c = {0: 'red', 1: 'blue', 2: 'green'}
def plot_decision_boundary(tree, depth=0):
    """Draw every split of ``tree`` as a dashed line on the current axes.

    A split on feature 0 is drawn as a vertical line and a split on feature 1
    as a horizontal line; other feature indices are not drawn. The line color
    is looked up in ``depth_c`` by node depth (``None`` for depths not listed
    there).

    Args:
        tree: Nested dict with keys ``'split'``, ``'left'`` and ``'right'``,
            or ``None`` for an absent subtree.
        depth: Depth assigned to ``tree`` itself.
    """
    # Explicit stack; children are pushed right-then-left so nodes are drawn
    # in pre-order (node, left subtree, right subtree).
    pending = [(tree, depth)]
    while pending:
        node, level = pending.pop()
        if node is None:
            continue

        axis_index, threshold = node['split']
        line_color = depth_c.get(level)
        if axis_index == 0:
            plt.axvline(x=threshold, color=line_color, linestyle='--')
        elif axis_index == 1:
            plt.axhline(y=threshold, color=line_color, linestyle='--')

        pending.append((node['right'], level + 1))
        pending.append((node['left'], level + 1))

In [None]:
# Same target-colored scatter, now overlaid with the learned split lines.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X[:, 0], X[:, 1], c=Y, cmap='viridis')
plot_decision_boundary(splits)
fig.colorbar(points, ax=ax, label='Target Variable (Y)')
ax.set_title('Scatter plot colored by target variable (Y)')
ax.set_xlabel('0')
ax.set_ylabel('1')
plt.show()
# Color legend for the split lines: tree depth -> line color.
print(depth_c)

In [None]:
# Show the fitted tree: each node is a dict with 'split' = (feature, threshold),
# 'left' (rows with feature > threshold) and 'right' (rows with feature <= threshold);
# None marks where the tree stops.
splits

In [None]:
# right is <=
# left is >

def inference(x_new, X, Y, splits):
    """Predict the target for ``x_new`` by routing the data down the tree.

    At every node the reference data ``(X, Y)`` is cut with the node's split,
    keeping the side ``x_new`` falls on (``<=`` threshold follows ``'right'``,
    ``>`` threshold follows ``'left'``). When the subtree on that side is
    ``None`` the mean of the remaining targets is returned, so every split in
    the tree, including the deepest ones, contributes to the prediction.

    Args:
        x_new: 1-D feature vector to predict for.
        X: 2-D array of reference samples (rows) by features (columns).
        Y: Targets aligned with the rows of ``X``.
        splits: Tree node dict with keys ``'split'``, ``'left'`` and
            ``'right'``, or ``None`` for an absent subtree.

    Returns:
        The mean of ``Y`` over the reference rows that reach the same region
        of the tree as ``x_new``.
    """
    # An absent subtree means no further split applies: average what is left.
    if splits is None:
        return np.mean(Y)

    split_f, split_v = splits['split']
    if x_new[split_f] <= split_v:
        mask = X[:, split_f] <= split_v
        subtree = splits['right']
    else:
        mask = X[:, split_f] > split_v
        subtree = splits['left']

    if not mask.any():
        # No reference rows on this side (possible when X/Y are not the data
        # the tree was built from); fall back to the current node's mean.
        return np.mean(Y)

    return inference(x_new, X[mask], Y[mask], subtree)

In [None]:
# Per-sample squared error of the tree's prediction on the training data
# itself (each row of X is predicted using the same X, Y as reference).
mse_acc = []
for x,y in zip(X, Y):
    out = inference(x, X, Y, splits)
    # Squared difference between the predicted and the true target.
    mse_acc.append((out-y)**2)

In [None]:
# Scatter of the two features colored by each sample's squared prediction
# error, overlaid with the learned split lines.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=mse_acc, cmap='viridis')
plot_decision_boundary(splits)
# The colors encode mse_acc, not Y, so label the colorbar and title accordingly.
plt.colorbar(label='Squared error')
plt.title('Scatter plot colored by squared prediction error')
plt.xlabel('0')
plt.ylabel('1')
plt.show()
# Color legend for the split lines: tree depth -> line color.
print(depth_c)