In [16]:
import pandas as pd
import random
import numpy as np

In [17]:
# Seed Python's built-in RNG so any later use of `random` is repeatable.
random.seed(6)

# Column names for the dataset (supplied here because the file is read with header=None)
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# Read the whitespace-delimited data file; car names are wrapped in double quotes.
# The UCI auto-mpg file marks missing horsepower readings with '?'. Without
# na_values that column is parsed as strings and numeric comparisons such as
# `data['horsepower'] < split_value` fail. Mapping '?' to NaN keeps it numeric.
# NOTE(review): confirm the local copy of auto-mpg.data uses '?' this way.
data = pd.read_csv(
    'auto-mpg.data',
    sep=r'\s+',
    header=None,
    names=columns,
    quotechar='"',
    na_values='?',
)

# Features are every column except the target; the target is 'mpg'.
X_train = data.drop(columns='mpg')
y_train = data["mpg"]

In [18]:
# Evaluate one candidate split: feature `feature_name` at threshold `split_value`.
feature_name = 'cylinders'
split_value = 2

# Rows strictly below the threshold go left; rows at or above it go right.
left_df = data.loc[data[feature_name] < split_value]
right_df = data.loc[data[feature_name] >= split_value]

# Each side predicts the mean target value of its own rows
# (the mean of an empty side is NaN).
left_target_mean = left_df["mpg"].mean()
right_target_mean = right_df["mpg"].mean()

# Constant prediction vectors, one entry per row on each side
left_mean_array = np.full(len(left_df), left_target_mean)
right_mean_array = np.full(len(right_df), right_target_mean)

# Per-side SSR (Sum of Squared Residuals) between actual and predicted mpg
left_residuals = left_df["mpg"] - left_mean_array
right_residuals = right_df["mpg"] - right_mean_array
left_ssr = np.sum(np.square(left_residuals))
right_ssr = np.sum(np.square(right_residuals))

# The score of the split is the sum over both sides.
total_ssr = left_ssr + right_ssr
tuple_split_val_ssr = (split_value, total_ssr, left_target_mean, right_target_mean)


In [21]:
# Inspect the SSR of the left partition for the current split.
left_ssr

np.float64(0.0)

In [20]:
# Inspect the left partition's mean and prediction vector. An empty partition
# gives a NaN mean and an empty array.
print(left_target_mean)
print(left_mean_array)


nan
[]


In [22]:
# Inspect the combined SSR (left + right) for the current split.
total_ssr

np.float64(24252.575477386938)

In [23]:
# Inspect the mean mpg of the rows at or above the split value.
right_target_mean

np.float64(23.514572864321607)

In [24]:
# Inspect the SSR of the right partition; it equals total_ssr whenever the
# left partition contributes 0.
right_ssr

np.float64(24252.575477386938)

In [34]:
# Hand-written sample of the per-feature candidate splits: feature name -> list of
# {'Split value', 'Total SSR'} records, used to exercise the best-split selection.
dict_ssrs_per_feature = {
    'cylinders': [
        {'Split value': np.float64(3.0), 'Total SSR': np.float64(11334.585929648241)},
        {'Split value': np.float64(2.0), 'Total SSR': np.float64(113.585929648241)},
    ],
}

In [51]:
# Step 1: for every feature keep only the candidate split with the smallest total SSR.
# Result shape: feature name -> {"Best split": <value>, "Min SSR": <ssr>}
dict_best_values_per_feature = {}

for feature_name, dictionary_list in dict_ssrs_per_feature.items():
    # A feature with no candidate splits cannot be chosen; skip it rather
    # than failing on an empty list.
    if not dictionary_list:
        continue

    # min() returns the first minimal record, matching a strict "<" scan.
    best_candidate = min(dictionary_list, key=lambda candidate: candidate["Total SSR"])
    min_ssr = best_candidate["Total SSR"]
    best_split_value = best_candidate["Split value"]

    # Add feature name and best split values
    dict_best_values_per_feature[feature_name] = {"Best split": best_split_value, "Min SSR": min_ssr}

print(dict_best_values_per_feature)
print(dict_best_values_per_feature['cylinders']['Min SSR'])
print(dict_best_values_per_feature[next(iter(dict_best_values_per_feature))])

# Step 2: find the min SSR globally over every feature.
if not dict_best_values_per_feature:
    raise ValueError("no candidate splits available for any feature")

# best_feature is the feature *name*. The per-feature summaries only carry the
# keys "Best split" and "Min SSR", so the global values are read from those.
best_feature = min(
    dict_best_values_per_feature,
    key=lambda name: dict_best_values_per_feature[name]["Min SSR"],
)
global_min_ssr = dict_best_values_per_feature[best_feature]["Min SSR"]
global_best_split_value = dict_best_values_per_feature[best_feature]["Best split"]


{'cylinders': {'Best split': np.float64(2.0), 'Min SSR': np.float64(113.585929648241)}}
113.585929648241
{'Best split': np.float64(2.0), 'Min SSR': np.float64(113.585929648241)}


In [41]:
# First key of dict_ssrs_per_feature (dicts iterate in insertion order).
next(iter(dict_ssrs_per_feature))

'cylinders'

In [37]:
# Inspect the current split value.
# NOTE(review): the displayed value is an np.float64, while the split cell
# assigns the plain int 2. This looks like leftover kernel state from a cell
# not shown here; confirm with Restart Kernel -> Run All.
split_value

np.float64(2.0)