In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier as SklearnGBC
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [2]:
class GradientBoostingClassifier(BaseEstimator):
    """Minimal multiclass gradient-boosting classifier built on regression trees.

    Each boosting round fits one ``DecisionTreeRegressor`` per class to the
    residuals ``one_hot(y) - softmax(F)`` (the negative gradient of the softmax
    cross-entropy with respect to the raw scores ``F``) and adds the tree's
    prediction, scaled by ``learning_rate``, to that class's raw score.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds. Every round adds one tree per class.
    learning_rate : float, default=0.1
        Factor applied to every tree's prediction before it is accumulated.
    max_depth : int, default=3
        ``max_depth`` passed to each ``DecisionTreeRegressor``.

    Attributes
    ----------
    trees : list of list of DecisionTreeRegressor
        ``trees[i][k]`` is the tree fitted in round ``i`` for class index ``k``.
        Empty until ``fit`` has been called.
    classes : ndarray or None
        Sorted unique labels seen by ``fit``; ``None`` until ``fit`` is called.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.classes = None

    def _to_one_hot(self, y):
        """Return a ``(len(y), n_classes)`` one-hot matrix for labels ``y``.

        Column positions come from ``np.searchsorted`` on ``self.classes``,
        which is sorted because ``fit`` builds it with ``np.unique``.
        """
        return np.eye(len(self.classes))[np.searchsorted(self.classes, y)]

    def fit(self, X, y):
        """Fit the boosted ensemble on ``X`` / ``y`` and return ``self``.

        ``X`` must expose ``.shape`` and be accepted by
        ``DecisionTreeRegressor.fit``; ``y`` holds one label per row of ``X``.
        Any trees left over from a previous ``fit`` call are discarded first.
        """
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Start from an empty ensemble: without this reset, calling fit() a
        # second time on the same instance would append a new ensemble to the
        # old one and predict_proba() would sum the contributions of both.
        self.trees = []

        y_one_hot = self._to_one_hot(y)

        # Raw (pre-softmax) scores, one column per class, starting at zero.
        F = np.zeros((X.shape[0], n_classes))

        for _ in range(self.n_estimators):
            # Residuals are computed once per round, before any class's score
            # is updated, so every tree in the round sees the same softmax.
            gradients = y_one_hot - self._softmax(F)

            trees_iter = []
            for k in range(n_classes):
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X, gradients[:, k])
                trees_iter.append(tree)

                F[:, k] += self.learning_rate * tree.predict(X)

            self.trees.append(trees_iter)

        return self

    def _softmax(self, X):
        """Row-wise softmax of a 2-D score array.

        The row maximum is subtracted before exponentiating to avoid overflow;
        this does not change the result.
        """
        exp = np.exp(X - np.max(X, axis=1, keepdims=True))
        return exp / np.sum(exp, axis=1, keepdims=True)

    def predict_proba(self, X):
        """Return class probabilities of shape ``(X.shape[0], n_classes)``.

        Replays the accumulation done in ``fit``: sums every stored tree's
        prediction (scaled by ``self.learning_rate``) into the per-class raw
        scores, then applies softmax. Columns follow the order of
        ``self.classes``.
        """
        F = np.zeros((X.shape[0], len(self.classes)))

        for trees_iter in self.trees:
            for k, tree in enumerate(trees_iter):
                F[:, k] += self.learning_rate * tree.predict(X)

        return self._softmax(F)

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest probability."""
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

In [3]:
# Load the Dry Bean dataset: "Class" is the target, every other column is a feature.
data = pd.read_csv("../Dry_Bean_Dataset.csv")
y = data["Class"]
X = data.drop(columns="Class")

In [4]:
le = LabelEncoder()
# Encode from the raw column instead of from `y`: this cell overwrites `y`, so
# reading `y` here would, on a re-run, re-encode the already-encoded integers
# and replace the original class names stored in `le.classes_`.
y = le.fit_transform(data["Class"])

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyperparameters shared by the custom and the scikit-learn model.
params = dict(n_estimators=50, learning_rate=0.01, max_depth=5)

In [7]:
# Train the custom implementation and record fit time and hold-out accuracy.
custom_gb = GradientBoostingClassifier(**params)
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the fit.
start_time = time.perf_counter()
custom_gb.fit(X_train, y_train)
custom_time = time.perf_counter() - start_time
custom_score = custom_gb.score(X_test, y_test)

In [8]:
# Train scikit-learn's implementation and record fit time and hold-out accuracy.
sklearn_gb = SklearnGBC(**params)
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the fit.
start_time = time.perf_counter()
sklearn_gb.fit(X_train, y_train)
sklearn_time = time.perf_counter() - start_time
sklearn_score = sklearn_gb.score(X_test, y_test)

In [9]:
# Report hold-out accuracy and training time for both models, one item per line.
print(
    f"CGB accuracy: {custom_score:.4f}",
    f"CGB training time: {custom_time:.2f} seconds",
    f"SGB accuracy: {sklearn_score:.4f}",
    f"SGB training time: {sklearn_time:.2f} seconds",
    sep="\n",
)

CGB accuracy: 0.9188
CGB training time: 27.55 seconds
SGB accuracy: 0.9188
SGB training time: 28.55 seconds


In [10]:
# One shuffled, seeded 5-fold splitter is passed to both calls so the two
# models are evaluated on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
custom_cv_scores = cross_val_score(estimator=custom_gb, X=X, y=y, cv=kf)
sklearn_cv_scores = cross_val_score(estimator=sklearn_gb, X=X, y=y, cv=kf)

In [11]:
# Report the mean cross-validated accuracy of each model.
print(
    f"Custom implementation CV scores: {custom_cv_scores.mean():.4f}",
    f"Scikit-learn CV scores: {sklearn_cv_scores.mean():.4f}",
    sep="\n",
)

Custom implementation CV scores: 0.9118
Scikit-learn CV scores: 0.9140
