In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the airfoil noise dataset from a CSV in the working directory.
data = pd.read_csv("airfoil_noise_data.csv")
# Preview the first five rows; the displayed output shows columns x0..x4 and y.
data.head(5)

Unnamed: 0,x0,x1,x2,x3,x4,y
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [3]:
from sklearn.model_selection import train_test_split

# Every column except the last is treated as a feature; the last column is the
# target, reshaped to a single-column 2-D array.
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=41,
)

In [4]:
def build_node(feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
    """Create a tree node represented as a plain dictionary.

    Every argument is stored unchanged under a key of the same name, so the
    returned dict always has the keys ``feature_index``, ``threshold``,
    ``left``, ``right``, ``var_red`` and ``value``. Arguments that are not
    supplied are stored as ``None``.

    Returns:
        dict: the node.
    """
    node = dict(
        feature_index=feature_index,
        threshold=threshold,
        left=left,
        right=right,
        var_red=var_red,
        value=value,
    )
    return node

In [5]:
def build_decision_tree_regressor(min_samples_split=2, max_depth=2):
    """Create the dictionary that holds a regressor's settings and its tree.

    Args:
        min_samples_split: stored under ``"min_samples_split"``.
        max_depth: stored under ``"max_depth"``.

    Returns:
        dict: ``{"root": None, "min_samples_split": ..., "max_depth": ...}``.
        ``"root"`` starts out as ``None``.
    """
    regressor = {"root": None}
    regressor["min_samples_split"] = min_samples_split
    regressor["max_depth"] = max_depth
    return regressor

In [6]:
def build_tree(dataset, decision_tree, curr_depth=0):
    """Recursively build a regression tree from ``dataset``.

    Args:
        dataset: 2-D array whose last column is the target and whose other
            columns are the features.
        decision_tree: dict providing ``"min_samples_split"`` and
            ``"max_depth"``.
        curr_depth: depth of the node being built; the initial call uses 0.

    Returns:
        dict: a node created by ``build_node``. It is a decision node when a
        split with positive variance reduction exists and the stopping
        conditions allow it; otherwise it is a leaf whose ``"value"`` comes
        from ``calculate_leaf_value``.
    """
    X, Y = dataset[:, :-1], dataset[:, -1]
    num_samples, num_features = np.shape(X)

    # A split is attempted while curr_depth <= max_depth (inclusive), so with
    # the root at depth 0 decision nodes can appear at depths 0..max_depth.
    if num_samples >= decision_tree["min_samples_split"] and curr_depth <= decision_tree["max_depth"]:
        best_split = get_best_split(dataset, num_samples, num_features, decision_tree)
        # get_best_split returns an empty dict when no threshold produces two
        # non-empty sides (e.g. all rows share the same feature values).
        # Treat that as "no gain" and fall through to a leaf instead of
        # raising KeyError on the missing "var_red" key.
        if best_split.get("var_red", 0) > 0:
            left_subtree = build_tree(best_split["dataset_left"], decision_tree, curr_depth + 1)
            right_subtree = build_tree(best_split["dataset_right"], decision_tree, curr_depth + 1)
            return build_node(
                best_split["feature_index"],
                best_split["threshold"],
                left_subtree,
                right_subtree,
                best_split["var_red"],
            )

    leaf_value = calculate_leaf_value(Y)
    return build_node(value=leaf_value)

In [8]:
def get_best_split(dataset, num_samples, num_features, decision_tree):
    """Search every feature/threshold pair for the largest variance reduction.

    Args:
        dataset: 2-D array whose last column is the target.
        num_samples: accepted for interface compatibility; not used here.
        num_features: number of leading columns to consider as features.
        decision_tree: accepted for interface compatibility; not used here.

    Returns:
        dict: with keys ``"feature_index"``, ``"threshold"``,
        ``"dataset_left"``, ``"dataset_right"`` and ``"var_red"`` describing
        the best candidate, or an empty dict if no candidate left both sides
        non-empty.
    """
    best = {}
    best_gain = float("-inf")
    parent_targets = dataset[:, -1]

    for column in range(num_features):
        # Each distinct observed value of the feature is tried as a threshold.
        for cut in np.unique(dataset[:, column]):
            rows_left, rows_right = split(dataset, column, cut)
            if len(rows_left) == 0 or len(rows_right) == 0:
                continue  # a one-sided partition is not a split

            gain = variance_reduction(parent_targets, rows_left[:, -1], rows_right[:, -1])
            if gain > best_gain:
                best_gain = gain
                best.update(
                    {
                        "feature_index": column,
                        "threshold": cut,
                        "dataset_left": rows_left,
                        "dataset_right": rows_right,
                        "var_red": gain,
                    }
                )
    return best

In [9]:
def split(dataset, feature_index, threshold):
    """Partition the rows of ``dataset`` on one feature.

    Args:
        dataset: 2-D array-like of rows.
        feature_index: column to compare against ``threshold``.
        threshold: rows with ``value <= threshold`` go left, rows with
            ``value > threshold`` go right.

    Returns:
        tuple: ``(dataset_left, dataset_right)`` as 2-D arrays. An empty side
        keeps the column count, i.e. has shape ``(0, n_columns)``.
    """
    dataset = np.asarray(dataset)
    feature_column = dataset[:, feature_index]
    # Boolean-mask indexing replaces the per-row Python loop. The two masks
    # are computed independently (rather than using ~mask) so that rows whose
    # value compares False both ways, such as NaN, land on neither side,
    # exactly as with the row-wise comparisons.
    dataset_left = dataset[feature_column <= threshold]
    dataset_right = dataset[feature_column > threshold]
    return dataset_left, dataset_right

In [10]:
def variance_reduction(parent, l_child, r_child):
    """Return how much a split lowers the variance of the target values.

    The result is ``var(parent)`` minus the size-weighted sum of the two
    children's variances, where each child's weight is its length divided by
    the parent's length.

    Args:
        parent: target values before the split.
        l_child: target values on the left side.
        r_child: target values on the right side.

    Returns:
        The variance reduction as a float.
    """
    parent_size = len(parent)
    left_share = len(l_child) / parent_size
    right_share = len(r_child) / parent_size
    children_variance = left_share * np.var(l_child) + right_share * np.var(r_child)
    return np.var(parent) - children_variance

In [11]:

def calculate_leaf_value(Y):
    """Return the arithmetic mean of ``Y``, used as a leaf's prediction."""
    return np.mean(Y)

In [12]:
def print_tree(tree=None, indent=" "):
    """Print a text rendering of a tree of node dictionaries.

    A leaf (a node whose ``"value"`` is not ``None``) is printed as its value.
    A decision node is printed as ``X_<feature_index> <= <threshold> ?
    <var_red>`` followed by its left and right subtrees, each introduced by
    an indented ``left:`` / ``right:`` label on the same line.

    Args:
        tree: node dict to print; nothing is printed for a falsy value.
        indent: prefix for this node's child labels. It is doubled on each
            level of recursion.
    """
    if not tree:
        return

    leaf_value = tree["value"]
    if leaf_value is not None:
        print(leaf_value)
        return

    print("X_" + str(tree["feature_index"]), "<=", tree["threshold"], "?", tree["var_red"])
    deeper_indent = indent + indent
    for label_format, child_key in (("%sleft:", "left"), ("%sright:", "right")):
        # end="" keeps the child's first line on the same line as its label.
        print(label_format % (indent), end="")
        print_tree(tree[child_key], deeper_indent)

In [13]:
def fit(decision_tree, X, Y):
    """Train the regressor in place by building its tree from ``X`` and ``Y``.

    Args:
        decision_tree: regressor dict; its ``"root"`` entry is overwritten
            with the tree returned by ``build_tree``.
        X: 2-D array of features, one row per sample.
        Y: targets, either as an ``(n, 1)`` column or as a 1-D vector of
            length ``n``.

    Returns:
        None. The result is stored in ``decision_tree["root"]``.
    """
    # column_stack gives the same result as concatenate(axis=1) for 2-D
    # inputs, and additionally turns a 1-D target vector into a column, so
    # callers do not have to reshape Y themselves.
    dataset = np.column_stack((X, Y))
    decision_tree["root"] = build_tree(dataset, decision_tree)

In [14]:
def make_prediction(x, tree):
    """Return the leaf value that the sample ``x`` reaches in ``tree``.

    Starting at ``tree``, the walk goes to the ``"left"`` child when
    ``x[feature_index] <= threshold`` and to the ``"right"`` child otherwise,
    until it reaches a node whose ``"value"`` is not ``None``.

    Args:
        x: indexable sample of feature values.
        tree: node dict to start from.

    Returns:
        The ``"value"`` stored in the reached leaf.
    """
    node = tree
    while node["value"] is None:
        goes_left = x[node["feature_index"]] <= node["threshold"]
        node = node["left"] if goes_left else node["right"]
    return node["value"]

In [15]:

def predict(decision_tree, X):
    """Predict a value for every sample in ``X``.

    Args:
        decision_tree: regressor dict whose ``"root"`` holds the tree.
        X: iterable of samples.

    Returns:
        list: one prediction per sample, in the order the samples appear.
    """
    predictions = []
    for sample in X:
        predictions.append(make_prediction(sample, decision_tree["root"]))
    return predictions

In [16]:
# Example usage: configure a regressor, train it on the training split, and
# print the learned tree.
# build_tree keeps splitting while curr_depth <= max_depth (root is depth 0),
# so max_depth=3 allows decision nodes at depths 0 through 3.
regressor = build_decision_tree_regressor(min_samples_split=3, max_depth=3)
fit(regressor, X_train, Y_train)
print_tree(regressor["root"])


X_0 <= 3150.0 ? 7.132048702017748
 left:X_4 <= 0.0337792 ? 3.590330569067664
  left:X_3 <= 55.5 ? 1.17898999813184
    left:X_4 <= 0.00251435 ? 1.614396721819876
        left:128.9919833333333
        right:125.90953579676673
    right:X_1 <= 15.4 ? 2.2342245360792994
        left:129.39160280373832
        right:123.80422222222222
  right:X_0 <= 1250.0 ? 9.970884020498868
    left:X_4 <= 0.0483159 ? 6.35527515982486
        left:124.38024528301887
        right:118.30039999999998
    right:X_3 <= 39.6 ? 5.036286657241031
        left:113.58091666666667
        right:118.07284615384616
 right:X_4 <= 0.00146332 ? 29.08299210506528
  left:X_0 <= 8000.0 ? 11.886497073996964
    left:X_2 <= 0.0508 ? 7.608945827689519
        left:134.04247500000002
        right:127.33581818181818
    right:X_4 <= 0.00076193 ? 10.6229193224008
        left:128.94078571428574
        right:122.40768750000001
  right:X_4 <= 0.0229028 ? 5.638575922510643
    left:X_0 <= 6300.0 ? 5.985051045988914
        left

In [17]:
#002