In [1]:
import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from ipywidgets import interact

import warnings
warnings.filterwarnings('ignore')

import custom_gbm_lib

# Loading the data

In [2]:
# NOTE(review): load_boston is not shipped by recent scikit-learn releases —
# confirm the pinned version before re-running this notebook.
boston = load_boston()

# Feature matrix with the dataset's own column names.
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Target as a one-column frame.
y = pd.DataFrame(boston.target, columns=["Price"])

# X is the very same object as boston_df, so the Price column added below
# shows up in both names.
X = boston_df
X["Price"] = y


# Starting point
### If we were given the data and somebody asked us:
#### - What would your prediction be if you were given a new row ?

#### Our answer would be ....
#### the mean value of the prices we have already observed !

In [3]:
# Baseline guess: the average of all observed prices.
mean_value = y.loc[:, "Price"].mean()
mean_value

22.532806324110677

#### We can pose the problem in another way.
#### We are trying to find a single value that we would predict for each row by minimizing the distance between the single value and the target value (mean squared error across rows)

In [4]:
# Three ways of finding the single constant that best predicts every row.
# The loss used below is the square root of the mean squared error; the square
# root is monotonic, so it has the same minimizer as the plain MSE.

# Target prices as a tensor. Built once: it does not change between iterations.
tensor_y = torch.tensor(y.to_numpy())
learning_rate = 0.05

# 1) Minimize the loss by computing the gradient with autograd and updating the parameter manually
single_value = torch.tensor(0.0, requires_grad=True)
for _ in range(2000):
    loss = torch.sqrt(torch.mean(torch.pow(tensor_y - single_value, 2)))
    grad = autograd.grad([loss], [single_value])
    # Detach after the update so every iteration starts from a fresh leaf tensor
    # instead of chaining the autograd graph through all previous iterates.
    single_value = (single_value - learning_rate * grad[0]).detach().requires_grad_(True)
print(f"The parameter value (by using pytorch autograd) is : {single_value}")

# 2) Minimize the loss by using a pytorch optimizer
single_value = torch.tensor(0.0, requires_grad=True)
optimizer = optim.Adam([single_value], lr=learning_rate)
for _ in range(2000):
    loss = torch.sqrt(torch.mean(torch.pow(tensor_y - single_value, 2)))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f"The parameter value (by using a pytorch optimizer) is : {single_value}")

# 3) Closed form: a linear regression without intercept on a column of ones.
# scikit-learn is given NumPy arrays directly rather than relying on the
# implicit conversion of torch tensors.
targets = y.to_numpy()
model = LinearRegression(fit_intercept=False)
model.fit(np.ones_like(targets), targets)
print(f"The parameter value (by LinearRegression Model fitting) is {model.coef_[0][0]}")

The parameter value (by using pytorch autograd) is : 22.531627655029297
The parameter value (by using a pytorch optimizer) is : 22.53276824951172
The parameter value (by LinearRegression Model fitting) is 22.532806324110677


# Running the custom implementation

In [5]:
# Rebuild the feature frame from the raw dataset and re-attach the target column.
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=["Price"])
X = boston_df  # same object as boston_df
X["Price"] = y

# Hyper-parameters
NUM_CLASSIFIERS = 5
MAX_DEPTH = 2
GRADIENT_BOOST_LEARNING_RATE = 0.1
MINIMIZER_LEARNING_RATE = 0.05
MINIMIZER_TRAINING_EPOCHS = 1000

# Keyword settings forwarded to the custom booster.
booster_settings = dict(
    GRADIENT_BOOST_LEARNING_RATE=GRADIENT_BOOST_LEARNING_RATE,
    MINIMIZER_LEARNING_RATE=MINIMIZER_LEARNING_RATE,
    MINIMIZER_TRAINING_EPOCHS=MINIMIZER_TRAINING_EPOCHS,
)

# Fit the custom algorithm; fit returns a pair that is unpacked into the
# X-side and y-side result objects.
custom = custom_gbm_lib.PytorchBasedGenericGradientBoost(
    "regressor", NUM_CLASSIFIERS, MAX_DEPTH, **booster_settings
)
df_result = custom.fit(X, y.to_numpy())
df_result_X, df_result_y = df_result

In [6]:
feature_target_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'Price']

# Columns revealed at each walkthrough step, in addition to feature_target_columns.
_STEP_EXTRA_COLUMNS = {
    1: ["initial_prediction_x"],
    2: ["initial_prediction_x"],
    3: ["initial_prediction_x", "Gradients_0_1"],
    4: ["initial_prediction_x", "Gradients_0_1", "Clusters_1"],
    5: ["initial_prediction_x", "Gradients_0_1", "Clusters_1"],
    6: ["initial_prediction_x", "Gradients_0_1", "Clusters_1", "Leaf_Prediction_1"],
    7: ["initial_prediction_x", "Gradients_0_1", "Clusters_1", "Leaf_Prediction_1", "Linear_1"],
}

# Steps whose view is ordered by cluster id.
_STEPS_SORTED_BY_CLUSTER = (5, 6, 7)


def display_df(step=1):
    """Return the view of ``df_result_X`` that illustrates one walkthrough step.

    Parameters
    ----------
    step : int, default 1
        Walkthrough step. Steps 1-7 select a growing set of columns; any
        other value returns ``df_result_X`` itself, untouched.

    Returns
    -------
    pandas.DataFrame
        For steps 1-7 a new frame holding the feature/target columns plus the
        columns introduced up to that step. ``initial_prediction_x`` is the
        placeholder ``"???"`` at step 1 and a copy of ``Linear_0`` from step 2
        on. Steps 5-7 are sorted by ``Clusters_1``.

    Notes
    -----
    The view is built on a copy (``DataFrame.assign``), so the shared
    ``df_result_X`` is never modified and every step gives the same result
    regardless of which steps were displayed before it.
    """
    if step not in _STEP_EXTRA_COLUMNS:
        return df_result_X

    # Step 1 only asks the question; later steps show the actual initial prediction.
    initial_prediction = "???" if step == 1 else df_result_X.loc[:, "Linear_0"]
    view = df_result_X.assign(initial_prediction_x=initial_prediction)

    if step in _STEPS_SORTED_BY_CLUSTER:
        view = view.sort_values("Clusters_1", inplace=False)

    return view.loc[:, feature_target_columns + _STEP_EXTRA_COLUMNS[step]]
# Render an integer slider (min 1, max 7, increment 1) bound to display_df's `step` argument.
interact(display_df, step = (1, 7, 1))

interactive(children=(IntSlider(value=1, description='step', max=7, min=1), Output()), _dom_classes=('widget-i…

<function __main__.display_df(step=1)>

# Explanation of the steps taken by the algorithm
#### 1. Step 1 asks the question: what could be an initial prediction? We have covered this: it is the mean value of the target column.

#### 2. Step 2 has the initial prediction set to the mean value.

#### 3. Step 3 visits ***EACH and EVERY row*** respectively and considers the prediction of each row as the single parameter to the squared error loss function. 
#### It calculates the gradient of the loss with respect to the current prediction for the row.

#### 4. Step 4 creates a decision tree whereby the ***target values are the GRADIENTS calculated in step 3***. 
#### The rationale behind this is that predictions that would have a similar impact on the loss should be grouped together.

#### 5. Step 5 sorts the rows based on cluster/leaf ids (the cluster id is the id of the decision tree leaf node in which the rows end up being present ***CLUSTER ID = LEAF NODE ID*** in decision tree).

#### 6. Step 6 is performed ***for EACH and EVERY cluster***. 
#### It keeps only the rows for a specific cluster/leaf index and ***finds the ADDITIONAL MARGIN that should be added to the current row predictions*** in order to minimize the error. ***This DELTA/ADDITIONAL MARGIN is a single value shared by all the rows that end up in the particular cluster/leaf.***

#### 7. Step 7 updates the initial prediction with the one after the construction of the first decision tree.

### The process repeats for subsequent trees.

# Step 3 breakdown

In [7]:
# Minmize loss function by calculating the gradient with autograd
index = 0 
row_prediction = torch.tensor(df_result_X.iloc[index, :]["Linear_0"], requires_grad = True)
loss = torch.pow(torch.tensor(df_result_X.iloc[index, :]["Price"]) - row_prediction, 2)
grad = autograd.grad([loss], [row_prediction])
print(f"The parameter value (by using pytorch autograd) is : {grad[0]}") 

The parameter value (by using pytorch autograd) is : -3.58990478515625


# Step 6 breakdown 

In [8]:
# Rows whose Clusters_1 value is 0.
df_clusters_1_0 = df_result_X.loc[df_result_X["Clusters_1"] == 0 , :]

# Built once: neither tensor changes while the delta is being optimized.
cluster_prices = torch.tensor(df_clusters_1_0["Price"].to_numpy())
cluster_predictions = torch.tensor(df_clusters_1_0["Linear_0"].to_numpy())

# Minimize the loss function by using a pytorch optimizer.
# The learning rate and epoch count reuse the hyper-parameters that were passed
# to the custom booster, so this breakdown follows any change made there.
single_delta_value = torch.tensor(0.0, requires_grad = True)
learning_rate = MINIMIZER_LEARNING_RATE
optimizer = optim.Adam([single_delta_value], lr = learning_rate)
for _ in range(MINIMIZER_TRAINING_EPOCHS):
    # One shared delta is added to every current prediction in the cluster.
    loss = F.mse_loss(cluster_prices, cluster_predictions + single_delta_value)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f"The parameter value is : {single_delta_value}")

The parameter value is : 24.329490661621094


# Step 7 breakdown

In [9]:
# The predictions are updated based on the following formula : 
previous_prediction_column = "Linear_0"
leaf_prediction_column = "Leaf_Prediction_1"
df_result_X[previous_prediction_column] + GRADIENT_BOOST_LEARNING_RATE * df_result_X[leaf_prediction_column]

0      22.154355
1      22.154355
2      23.204271
3      23.204271
4      23.204271
         ...    
501    22.154355
502    22.154355
503    22.154355
504    22.154355
505    21.280440
Length: 506, dtype: float64