In [11]:
import numpy as np
import pandas as pd                   
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

In [12]:
train = pd.read_csv('train.csv')
X = train.drop(columns=['datetime','casual','registered','count'])
y = train['count']

In [13]:
class GBMRegressor:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, random_state=12345):
        self.learning_rate = learning_rate
        self.n_estimators   = n_estimators
        self.max_depth      = max_depth
        self.random_state   = random_state
        self.trees          = []
        self.initial_pred   = None

    def fit(self, X, y):
        # стартуем с константы = среднее y
        self.initial_pred = float(np.mean(y))
        preds = np.full(shape=len(y), fill_value=self.initial_pred, dtype=float)
        self.trees = []

        for _ in range(self.n_estimators):
            # градиент = y - preds
            residuals = y - preds
            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         random_state=self.random_state)
            tree.fit(X, residuals)
            update = tree.predict(X)
            preds += self.learning_rate * update
            self.trees.append(tree)

    def predict(self, X):
        preds = np.full(shape=len(X), fill_value=self.initial_pred, dtype=float)
        for tree in self.trees:
            preds += self.learning_rate * tree.predict(X)
        return preds


In [14]:
def cross_val_eval(model_fn, X, y, metric, n_splits=5, random_state=12):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_tr, y_tr = X.iloc[train_idx].values, y.iloc[train_idx].values
        X_vl, y_vl = X.iloc[val_idx].values,   y.iloc[val_idx].values

        model = model_fn()             # fresh model
        model.fit(X_tr, y_tr)
        preds = model.predict(X_vl)
        scores.append(metric(y_vl, preds))

    return np.array(scores)

In [15]:
t0 = time.time()
mae_gbm = cross_val_eval(
    lambda: GBMRegressor(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=12345),
    X, y, mean_absolute_error
)
mse_gbm = cross_val_eval(
    lambda: GBMRegressor(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=12345),
    X, y, mean_squared_error
)
time_gbm = time.time() - t0

print("Custom GBM:")
print(f" MAE = {mae_gbm.mean():.2f} ± {mae_gbm.std():.2f}")
print(f" MSE = {mse_gbm.mean():.2f} ± {mse_gbm.std():.2f}")
print(f" Time = {time_gbm:.2f} s\n")

t1 = time.time()
mae_lib = cross_val_eval(
    lambda: GradientBoostingRegressor(learning_rate=0.1,
                                      n_estimators=100,
                                      max_depth=3,
                                      random_state=12345),
    X, y, mean_absolute_error
)
mse_lib = cross_val_eval(
    lambda: GradientBoostingRegressor(learning_rate=0.1,
                                      n_estimators=100,
                                      max_depth=3,
                                      random_state=12345),
    X, y, mean_squared_error
)
time_lib = time.time() - t1

print("Sklearn GBM:")
print(f" MAE = {mae_lib.mean():.2f} ± {mae_lib.std():.2f}")
print(f" MSE = {mse_lib.mean():.2f} ± {mse_lib.std():.2f}")
print(f" Time = {time_lib:.2f} s")

Custom GBM:
 MAE = 109.27 ± 2.10
 MSE = 22018.26 ± 843.71
 Time = 11.34 s

Sklearn GBM:
 MAE = 109.26 ± 2.08
 MSE = 22016.67 ± 841.07
 Time = 10.56 s
