<a href="https://colab.research.google.com/github/hatttruong/machine-learning-from-scratch/blob/master/explore_gradient_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I implement a few functions to better understand gradient boosting, following the explained.ai article.

Ref: https://explained.ai/gradient-boosting/index.html

## Regression tree stump

According to Wikipedia:

A **decision stump** is a machine learning model consisting of a **one-level** decision tree. That is, it is a decision tree with one internal node (the root) which is immediately connected to the terminal nodes (its leaves). A decision stump makes a prediction based on the value of just a single input feature. Sometimes they are also called **1-rules**

In [54]:
import numpy as np
def regression_decision_stump(x, y, verbose=False):
    """Fit a one-split regression stump to a single feature.

    The samples are ordered by ``x`` and every boundary between two
    consecutive samples is tried as a split. A candidate is scored by the
    unweighted average of the population variances (``np.var``) of the
    targets on each side; the first candidate with the lowest score wins.

    Parameters
    ----------
    x : sequence of numbers
        Feature values, in any order.
    y : sequence of numbers
        Target values, aligned with ``x``.
    verbose : bool, default False
        When True, print every candidate split and each improvement.

    Returns
    -------
    tuple
        ``(split, mean_left, mean_right)`` where ``split`` is the midpoint
        between the two neighbouring ``x`` values at the chosen boundary and
        the means are the average targets on each side. All three are
        ``None`` when fewer than two samples are given.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same length.
    """
    x_arr = np.asarray(x)
    y_arr = np.asarray(y)
    if len(x_arr) != len(y_arr):
        raise ValueError('x and y must have the same length')

    # Sort both arrays by x and slice by POSITION from here on. Slicing a
    # sorted frame by its original index labels picks the wrong rows whenever
    # the input is not already ordered.
    order = np.argsort(x_arr, kind='stable')
    x_sorted = x_arr[order]
    y_sorted = y_arr[order]

    # finding the best split
    min_var = None
    split = None
    mean_left = None
    mean_right = None
    for i in range(len(x_sorted) - 1):
        left = y_sorted[:i + 1].tolist()
        right = y_sorted[i + 1:].tolist()
        var_left = np.var(left)
        var_right = np.var(right)
        var = (var_left + var_right) / 2
        if verbose:
            print(f'\niter: {i}')
            print(f'left: {left}')
            print(f'right: {right}')
            print(f'var_left={var_left}, var_right={var_right}, var={var}')
        # Strict comparison: on ties the earliest (left-most) split is kept.
        if min_var is None or min_var > var:
            split = x_sorted[i] + (x_sorted[i + 1] - x_sorted[i]) * 1.0 / 2
            mean_left = np.mean(left)
            mean_right = np.mean(right)
            min_var = var
            if verbose:
                print(f'split={split}, min_var={min_var}')

    return split, mean_left, mean_right

# Gradient boosting: Distance to target

In this example, the weak models are regression tree stumps trained on the residual vector.

In [58]:
import pandas as pd
from sklearn.metrics import mean_squared_error

In [2]:
# Toy dataset: apartment size in square feet and the monthly rent.
df = pd.DataFrame(dict(
    sqfeet=[750, 800, 850, 900, 950],
    rent=[1160, 1200, 1280, 1450, 2000],
))
df.head()

Unnamed: 0,sqfeet,rent
0,750,1160
1,800,1200
2,850,1280
3,900,1450
4,950,2000


In [3]:
# Same toy dataset, built from (sqfeet, rent) records instead of a column dict.
df = pd.DataFrame(
    data=[
        (750, 1160),
        (800, 1200),
        (850, 1280),
        (900, 1450),
        (950, 2000),
    ],
    columns=['sqfeet', 'rent'],
)
df.head()

Unnamed: 0,sqfeet,rent
0,750,1160
1,800,1200
2,850,1280
3,900,1450
4,950,2000


### F_0

In [55]:
N = len(df)
# F_0 is the initial model: the average rent, repeated for every row.
mean_rent = df['rent'].sum() * 1.0 / N
df['F_0'] = [mean_rent] * N
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1,weak_2,F_2
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5,-92.5,1180.0
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5,-92.5,1180.0
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5,61.666667,1334.166667
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5,61.666667,1334.166667
4,950,2000,1418.0,582.0,582.0,2000.0,0.0,61.666667,2061.666667


In [56]:
# Residual of the initial model: observed rent minus the F_0 prediction.
df['residual_0'] = df['rent'] - df['F_0']
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1,weak_2,F_2
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5,-92.5,1180.0
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5,-92.5,1180.0
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5,61.666667,1334.166667
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5,61.666667,1334.166667
4,950,2000,1418.0,582.0,582.0,2000.0,0.0,61.666667,2061.666667


### F_n

In [61]:
# Boosting setup: M stages, each weak model scaled by learning_rate.
verbose = True
M = 3
N = len(df)
learning_rate = 1.0

result = []
for stage in range(1, M + 1):
    if verbose:
        print(f'\nIter {stage}:')

    # weak_<stage>: a stump fitted to the residuals left by the previous stage
    prev_residual = df[f'residual_{stage - 1}'].to_list()
    split, left_mean, right_mean = regression_decision_stump(df['sqfeet'].to_list(), prev_residual)
    if verbose:
        print(split, left_mean, right_mean)
    df[f'weak_{stage}'] = np.where(df['sqfeet'] < split, left_mean, right_mean)

    # F_<stage>: previous composite model plus the scaled weak model
    df[f'F_{stage}'] = df[f'F_{stage - 1}'] + learning_rate * df[f'weak_{stage}']

    # residual_<stage>: what the composite model still gets wrong
    df[f'residual_{stage}'] = df['rent'] - df[f'F_{stage}']

    # Track the training error of the composite model after this stage.
    mse = mean_squared_error(df['rent'], df[f'F_{stage}'])
    print('mse=', mse)
    result.append((learning_rate, stage, mse))
df.head()


Iter 1:
925.0 -145.5 582.0
mse= 9895.0

Iter 2:
825.0 -92.5 61.666666666666664
mse= 4190.833333333328

Iter 3:
925.0 15.416666666666629 -61.666666666666515
mse= 3240.1388888888855


Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1,weak_2,F_2,residual_2,weak_3,F_3,residual_3
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5,-92.5,1180.0,-20.0,15.416667,1195.416667,-35.416667
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5,-92.5,1180.0,20.0,15.416667,1195.416667,4.583333
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5,61.666667,1334.166667,-54.166667,15.416667,1349.583333,-69.583333
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5,61.666667,1334.166667,115.833333,15.416667,1349.583333,100.416667
4,950,2000,1418.0,582.0,582.0,2000.0,0.0,61.666667,2061.666667,-61.666667,-61.666667,2000.0,0.0


In [62]:
# One row per (learning_rate, stage) pair with the MSE reached at that stage.
df_mse = pd.DataFrame(
    data=result,
    columns=['learning_rate', 'i', 'mse'],
)
df_mse.head()

Unnamed: 0,learning_rate,i,mse
0,1.0,1,9895.0
1,1.0,2,4190.833333
2,1.0,3,3240.138889


### F_1

In [37]:
# First weak model: a stump fitted to the residuals of F_0.
split, left_mean, right_mean = regression_decision_stump(
    df['sqfeet'].to_list(),
    df['residual_0'].to_list(),
)
print(split, left_mean, right_mean)
# Rows below the split get the left-side mean, the rest the right-side mean.
df['weak_1'] = np.where(df['sqfeet'] < split, left_mean, right_mean)
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1
0,750,1160,1418.0,-258.0,-145.5
1,800,1200,1418.0,-218.0,-145.5
2,850,1280,1418.0,-138.0,-145.5
3,900,1450,1418.0,32.0,-145.5
4,950,2000,1418.0,582.0,582.0


In [39]:
# F_1 is the initial model plus the first weak model (learning rate of 1).
df['F_1'] = df['F_0'] + df['weak_1']
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1
0,750,1160,1418.0,-258.0,-145.5,1272.5
1,800,1200,1418.0,-218.0,-145.5,1272.5
2,850,1280,1418.0,-138.0,-145.5,1272.5
3,900,1450,1418.0,32.0,-145.5,1272.5
4,950,2000,1418.0,582.0,582.0,2000.0


In [40]:
# Residual after the first stage: observed rent minus the F_1 prediction.
df['residual_1'] = df['rent'] - df['F_1']
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5
4,950,2000,1418.0,582.0,582.0,2000.0,0.0


### F_2

In [51]:
# Second weak model: a stump fitted to the residuals of F_1.
split, left_mean, right_mean = regression_decision_stump(
    df['sqfeet'].to_list(),
    df['residual_1'].to_list(),
    verbose=False,
)
print(split, left_mean, right_mean)
# Rows below the split get the left-side mean, the rest the right-side mean.
df['weak_2'] = np.where(df['sqfeet'] < split, left_mean, right_mean)
df.head()

split=775.0, min_var=4206.8359375
split=825.0, min_var=3559.027777777778
825.0 -92.5 61.666666666666664


Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1,weak_2
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5,-92.5
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5,-92.5
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5,61.666667
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5,61.666667
4,950,2000,1418.0,582.0,582.0,2000.0,0.0,61.666667


In [53]:
# F_2 is the previous composite model plus the second weak model.
df['F_2'] = df['F_1'] + df['weak_2']
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,weak_1,F_1,residual_1,weak_2,F_2
0,750,1160,1418.0,-258.0,-145.5,1272.5,-112.5,-92.5,1180.0
1,800,1200,1418.0,-218.0,-145.5,1272.5,-72.5,-92.5,1180.0
2,850,1280,1418.0,-138.0,-145.5,1272.5,7.5,61.666667,1334.166667
3,900,1450,1418.0,32.0,-145.5,1272.5,177.5,61.666667,1334.166667
4,950,2000,1418.0,582.0,582.0,2000.0,0.0,61.666667,2061.666667


In [15]:
# Scratch view of the (feature, residual) pairs a stump is fitted on, ordered by x.
x = df['sqfeet'].to_list()
y = df['residual_0'].to_list()
df_temp = pd.DataFrame({'x': x, 'y': y})
df_temp = df_temp.sort_values(by='x')
df_temp.head()

Unnamed: 0,x,y
0,750,-258.0
1,800,-218.0
2,850,-138.0
3,900,32.0
4,950,582.0


### Different learning rate

In [69]:
# Repeat the M-stage boosting run for several learning rates and record the MSE
# after every stage. Each run starts again from F_0 / residual_0, which this
# cell never overwrites.
verbose = False
M = 3
N = len(df)
learning_rates = [1.0, 0.9, 0.8, 0.7, 0.6]
result = []
for learning_rate in learning_rates:
    if verbose:
        print(f'\nlearning_rate {learning_rate}:')
    for stage in range(1, M + 1):
        if verbose:
            print(f'\nIter {stage}:')

        # weak_<stage>: a stump fitted to the residuals left by the previous stage
        prev_residual = df[f'residual_{stage - 1}'].to_list()
        split, left_mean, right_mean = regression_decision_stump(df['sqfeet'].to_list(), prev_residual)
        if verbose:
            print(split, left_mean, right_mean)
        df[f'weak_{stage}'] = np.where(df['sqfeet'] < split, left_mean, right_mean)

        # F_<stage>: previous composite model plus the scaled weak model
        df[f'F_{stage}'] = df[f'F_{stage - 1}'] + learning_rate * df[f'weak_{stage}']

        # residual_<stage>: what the composite model still gets wrong
        df[f'residual_{stage}'] = df['rent'] - df[f'F_{stage}']

        # Training error of the composite model after this stage.
        mse = mean_squared_error(df['rent'], df[f'F_{stage}'])
        if verbose:
            print(f'mse={mse}')
        result.append((learning_rate, stage, mse))
df_mse = pd.DataFrame(data=result, columns=['learning_rate', 'i', 'mse'])
df_mse.head(20)

Unnamed: 0,learning_rate,i,mse
0,1.0,1,9895.0
1,1.0,2,4190.833333
2,1.0,3,3240.138889
3,0.9,1,10741.81
4,0.9,2,2672.101788
5,0.9,3,1760.040274
6,0.8,1,13282.24
7,0.8,2,2063.1936
8,0.8,3,399.423886
9,0.7,1,17516.29


# Gradient boosting: Heading in the Right Direction

In this example, the weak models are regression tree stumps trained on the sign of the residual vector.

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

def sign(x):
    """Return 1 if ``x`` is positive, -1 if it is negative, and 0 otherwise."""
    if x > 0:
        return 1
    if x < 0:
        return -1
    return 0

In [2]:
# Start this section from a fresh copy of the toy rent dataset.
df = pd.DataFrame(dict(
    sqfeet=[750, 800, 850, 900, 950],
    rent=[1160, 1200, 1280, 1450, 2000],
))
df.head()

Unnamed: 0,sqfeet,rent
0,750,1160
1,800,1200
2,850,1280
3,900,1450
4,950,2000


In [8]:
N = len(df)
# F_0 is the initial model: the median rent, repeated for every row.
median_rent = np.median(df['rent'])
df['F_0'] = [median_rent] * N
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,sign_0
0,750,1160,1280.0,-120.0,-1
1,800,1200,1280.0,-80.0,-1
2,850,1280,1280.0,0.0,0
3,900,1450,1280.0,170.0,1
4,950,2000,1280.0,720.0,1


In [9]:
# Residual of the initial model, and the direction (-1, 0 or 1) of each residual.
df['residual_0'] = df['rent'] - df['F_0']
df['sign_0'] = df['residual_0'].apply(sign)
df.head()

Unnamed: 0,sqfeet,rent,F_0,residual_0,sign_0
0,750,1160,1280.0,-120.0,-1
1,800,1200,1280.0,-80.0,-1
2,850,1280,1280.0,0.0,0
3,900,1450,1280.0,170.0,1
4,950,2000,1280.0,720.0,1
