Gradient boosting is one of the most popular machine learning algorithms for tabular datasets. It is powerful enough to find any nonlinear relationship between your model target and features and has great usability that can deal with missing values, outliers, and high cardinality categorical values on your features without any special treatment. 

### Algorithm with an Example
Gradient Boosting is one of the variants of ensemble methods where you create multiple weak models and combine them to get better performance as a whole.

https://medium.com/data-science/all-you-need-to-know-about-gradient-boosting-algorithm-part-1-regression-2520a34a502

# Code

In [1]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

class CustomGradientBoostingRegressor:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=1):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []
        self.F0 = None  # initial prediction (mean of y)
        
    def fit(self, X, y):
        # Step 1: Initialize F0 as mean of y
        self.F0 = np.mean(y)
        Fm = np.full(y.shape, self.F0)  # initial predictions (vector)
        
        # Step 2: Boosting iterations
        for _ in range(self.n_estimators):
            # Compute residuals (actual - current prediction)
            residuals = y - Fm

            # Fit a regression tree to residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=0)
            tree.fit(X, residuals)
            self.trees.append(tree)
            
            # Predict the residuals (gamma) for all X
            gamma = tree.predict(X)
            
            # Update model prediction
            Fm += self.learning_rate * gamma

    def predict(self, X):
        # Start with the initial model (mean)
        Fm = np.full((X.shape[0],), self.F0)
        
        # Add contributions from all trees
        for tree in self.trees:
            Fm += self.learning_rate * tree.predict(X)
        
        return Fm


In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.datasets import make_regression

# Create a small regression dataset
x, y = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)

# Train your custom GBM
custom_gbm = CustomGradientBoostingRegressor(
    n_estimators=20, 
    learning_rate=0.1, 
    max_depth=1
)
custom_gbm.fit(x, y)
custom_gbm_rmse = mean_squared_error(y, custom_gbm.predict(x), squared=False)
print(f"Custom GBM RMSE: {custom_gbm_rmse:.10f}")

# Train sklearn GBM
sklearn_gbm = GradientBoostingRegressor(
    n_estimators=20, 
    learning_rate=0.1, 
    max_depth=1,
    random_state=0
)
sklearn_gbm.fit(x, y)
sklearn_gbm_rmse = mean_squared_error(y, sklearn_gbm.predict(x), squared=False)
print(f"Sklearn GBM RMSE: {sklearn_gbm_rmse:.10f}")


NameError: name 'estimators' is not defined

In [4]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# ==========================
# 1. Create dataset
# ==========================
x, y = make_regression(
    n_samples=200,
    n_features=1,
    noise=10,
    random_state=42
)

# ==========================
# 2. Define your custom GBM
# ==========================
class CustomGradientBoostingRegressor:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=1):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []
        self.F0 = None
        
    def fit(self, X, y):
        self.F0 = np.mean(y)
        Fm = np.full(y.shape, self.F0)
        
        for _ in range(self.n_estimators):
            residuals = y - Fm
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=0)
            tree.fit(X, residuals)
            self.trees.append(tree)
            gamma = tree.predict(X)
            Fm += self.learning_rate * gamma
            
    def predict(self, X):
        Fm = np.full((X.shape[0],), self.F0)
        for tree in self.trees:
            Fm += self.learning_rate * tree.predict(X)
        return Fm

# ==========================
# 3. Train and compare models
# ==========================
custom_gbm = CustomGradientBoostingRegressor(
    n_estimators=20,
    learning_rate=0.1,
    max_depth=1
)
custom_gbm.fit(x, y)
custom_gbm_rmse = mean_squared_error(y, custom_gbm.predict(x), squared=False)
print(f"Custom GBM RMSE: {custom_gbm_rmse:.10f}")

sklearn_gbm = GradientBoostingRegressor(
    n_estimators=20,
    learning_rate=0.1,
    max_depth=1,
    random_state=0
)
sklearn_gbm.fit(x, y)
sklearn_gbm_rmse = mean_squared_error(y, sklearn_gbm.predict(x), squared=False)
print(f"Sklearn GBM RMSE: {sklearn_gbm_rmse:.10f}")


TypeError: got an unexpected keyword argument 'squared'