In [7]:
# reading the data
import pandas as pd
import numpy as np

data_path = "./data/House_Price.csv"
df = pd.read_csv(data_path, header=0)
# removing columns with NaN values (waterbody and n_hos_beds)
df.dropna(axis=1, inplace=True)
# removing bus_ter column (it has the same value for each row)
df.drop(["bus_ter"], axis=1, inplace=True)
# encoding airport values
airport_mapping = {'YES': 1, 'NO': 0}
df['airport'] = df['airport'].map(airport_mapping)
df.head()

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hot_rooms,rainfall,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,1,11.192,23,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,0,12.1728,42,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,0,101.12,38,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,1,11.2672,45,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,0,11.2896,55,0.039474


In [8]:
# regression tree functions
import sys

# class for tree nodes
class Node:
    def __init__(self, atribute=None, threshold=None, value=None, left=None, right=None):
        self.atribute = atribute # which atribute (index) are we splitting on
        self.threshold = threshold  # threshold value for the split
        self.value = value  # value to predict at this node (for leaf nodes)
        self.left = left  # left subtree
        self.right = right  # right subtree

# computes RSS for column of y values and y prediction (mean)
def compute_RSS(y_col: np.array, y_pred: float):
    rss = 0
    for y in y_col:
        rss += (y-y_pred)**2
    return rss

# finds best atribute and threshold for a split
def find_best_atr_and_threshold(data: np.array):
    # this assumes the order of the atributes is the same as the order of data columns
    # this also assumes the target is the last atribute
    min_RSS = sys.float_info.max
    best_atribute = None
    best_threshold = -1.0
    
    for atr_index in range(data.shape[1]-1):
        min_RSS_for_atribute = sys.float_info.max
        best_threshold_for_atribute = -1.0
        # Sorting the data by the curent column
        sorted_indices = np.argsort(data[:, atr_index])
        data = data[sorted_indices]

        for i in range(data.shape[0]-1): # TODO if column has only 0s and 1s this is a waste of time
            # splitting the data into two and calculating RSS for each
            data1 = data[:i+1, :]
            data2 = data[i+1:, :]
            cur_RSS = compute_RSS(data1[:, -1], data1[:, -1].mean()) + compute_RSS(data2[:, -1], data2[:, -1].mean())
            if cur_RSS < min_RSS_for_atribute:
                min_RSS_for_atribute = cur_RSS
                best_threshold_for_atribute = data[i, atr_index]
        if min_RSS_for_atribute < min_RSS:
            min_RSS = min_RSS_for_atribute
            best_threshold = best_threshold_for_atribute
            best_atribute = atr_index
    return best_atribute, best_threshold

# builds the regression tree via recursion (it's called from fit_regression_tree function)
def fit_regression_tree_recursion(data: np.array, min_instances, max_depth, cur_depth=0): 
    if cur_depth > max_depth:
        # create leaf node and return it
        if len(data[:,-1]) <= 0: # if data is empty set value to 0
            avg_y = 0
            return Node(None, None, value=avg_y, left=None, right=None)
        avg_y = np.mean(data[:, -1]) # calculate the mean value of all y 
        return Node(None, None, value=avg_y, left=None, right=None)
    if data.shape[0] <= min_instances:
        # create leaf node and return it
        if len(data[:,-1]) <= 0: # if data is empty set value to 0
            avg_y = 0
            return Node(None, None, value=avg_y, left=None, right=None)
        avg_y = np.mean(data[:, -1]) # calculate the mean value of all y 
        return Node(None, None, value=avg_y, left=None, right=None)
    # get the best current atribute and best threshold
    best_atr, best_threshold = find_best_atr_and_threshold(data)
    # sort the data for best atribute
    sorted_indices = np.argsort(data[:, best_atr])
    data = data[sorted_indices]
    # split the data on the threshold
    i = np.where(data[:, best_atr] == best_threshold)[0][-1]
    data_left = data[:i+1, :]
    data_right = data[i+1:, :]
    # recursively build left and right subtree and append them to current node
    node_left = fit_regression_tree_recursion(data_left, min_instances, max_depth, cur_depth = cur_depth + 1)
    node_right = fit_regression_tree_recursion(data_right, min_instances, max_depth, cur_depth = cur_depth + 1)
    return Node(atribute = best_atr, threshold=best_threshold, value=None, left=node_left, right=node_right)

# prints the tree (used for testing)
def print_tree(node, depth=0, prefix="Root: "):
    if node is not None:
        print("  " * depth + prefix + f"{node.atribute} {node.threshold} {node.value}")
        if node.left or node.right:
            print_tree(node.left, depth + 1, "L-> ")
            print_tree(node.right, depth + 1, "R-> ")

# fit the  regression tree and returns the root node of the regression tree
def fit_regression_tree(X: np.array, y: np.array, min_instances = 39, max_depth=10):
    data = np.hstack((X, y.reshape(-1, 1))) # adding y to X as the last column of data
    return fit_regression_tree_recursion(data, min_instances, max_depth) # returns the root node of the regression tree

# gets the predicted value from a tree via recursion (called from predict_many)
def predict_recursion(node: Node, x_row: np.array):
    if node.value is not None: # we have reached the leaf, returning the value stored in leaf
        return node.value
    x_val = x_row[node.atribute]
    if x_val <= node.threshold: # value is smaller or equal to threshold, we go to the left subtree
        return predict_recursion(node.left, x_row)
    else: # value is higher than threshold, we go to the left subtree
        return predict_recursion(node.right, x_row)

# gets the X df (without  the target) and  computes y predictions (vector)
def predict_many(root: Node, X: np.array):
    y_pred = np.zeros(X.shape[0])
    for i, x_row in enumerate(X):
        y_pred[i] = predict_recursion(root, x_row)
    return y_pred

In [9]:
# defining my regression tree class to make things a bit less messy
class MyRegressionTree:
    def __init__(self, min_instances=39, max_depth=10):
        self.root = None
        self.min_instances = min_instances
        self.max_depth = max_depth

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)
        if isinstance(y, pd.Series):
            y = np.array(y)
        self.root = fit_regression_tree(X, y, self.min_instances, self.max_depth)

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)
        return predict_many(self.root, X)
    
    def display_tree(self):
        print_tree(self.root)

In [10]:
# reusing my cross validation from v2
import math 
from sklearn.metrics import r2_score

def k_fold_split(data, k): # returns k folds of the data
    data = np.array(data)
    fold_size = math.ceil(len(data) / k)
    folds = [data[i:i + fold_size] for i in range(0, len(data), fold_size)]
    return folds

def k_fold_cross_validation(model, X, y, k=2):
    if isinstance(X, pd.DataFrame):
        X = np.array(X)
    if isinstance(y, pd.Series):
        y = np.array(y)
    data = np.hstack((X, y.reshape(-1, 1))) # adding y to X as the last column of data
    
    folds = k_fold_split(data, k) # splitting the data into k folds
    scores = []

    for i in range(k):
        # using one of the folds in each iteration for testing, others for training the model
        test_set = folds[i]
        train_set = np.concatenate([folds[j] for j in range(k) if j != i])

        # fitting the model with train set
        model.fit(train_set[:,:-1], train_set[:,-1])
        score = r2_score(test_set[:,-1], model.predict(test_set[:,:-1]))
        scores.append(score)
    return np.mean(scores) # returns the mean of R2

In [11]:
df = df.sample(frac=1.0, random_state=0) # shuffling the data now (I'll be using the same shuffle for my regression tree and sklearn's)
y = df["price"]
X = df.drop("price", axis=1)

In [12]:
# Testing my regression tree and sklearn's regression tree with cross validation
my_reg_tree = MyRegressionTree() # default min_instances = 39 and max_depth = 10
my_reg_tree_cv_score = k_fold_cross_validation(my_reg_tree, X, y, k=5)
print(f"CV score for my regression tree: {my_reg_tree_cv_score}")
from sklearn.tree import DecisionTreeRegressor
sklearn_regression_tree = DecisionTreeRegressor()
sklearn_reg_tree_cv_score = k_fold_cross_validation(sklearn_regression_tree, X, y, k=5)
print(f"CV score for my regression tree: {sklearn_reg_tree_cv_score}")

CV score for my regression tree: 0.7494889606746485
CV score for my regression tree: 0.678187308834542


In [None]:
# TODO MAYBE DO HYPERPARAMETER TESTING