In [53]:
# Read in file
# Run gboost.py inside the notebook namespace so that GBoost is defined here.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection, and compile() attaches the
# filename so tracebacks point at gboost.py rather than "<string>".
with open("gboost.py") as source_file:
    exec(compile(source_file.read(), "gboost.py", "exec"))

In [54]:
# Print the generated documentation (docstring, MRO, method signatures) for GBoost.
help(GBoost)

Help on class GBoost in module __main__:

class GBoost(sklearn.base.BaseEstimator)
 |  Basic implementation of Gradient Boosting that performs regression with 
 |  MSE or MAE loss functions, and classification with logistic loss function.
 |  
 |  :param n_estimators: `int`. Number of trees to train
 |  :param learning_rate: `float`. shrinkage parameter. Contribution of each tree to overall estimator
 |  :param tol: `float`. Tolerance level used in binary classification to convert probabilities to classes
 |  :param loss: `str`. Default='ls'. Can be one of:
 |      - 'ls': Regression with Mean Square Error
 |      - 'lad': Regression with Mean Absolute Error
 |      - 'logistic': Binary Classification with Logistic loss.
 |  
 |  Method resolution order:
 |      GBoost
 |      sklearn.base.BaseEstimator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, n_estimators=100, learning_rate=0.1, loss='ls', tol=0.5)
 |      Initialize self.  See help(type(self)) for accurate signature.

In [55]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, precision_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

# Number of boosting stages; passed as n_estimators to both the custom GBoost
# models and the scikit-learn reference models so the comparison is like-for-like.
M = 100
# Shrinkage applied to each tree's contribution; shared by all four models.
LEARNING_RATE = 0.1

In [56]:
# Regression dataset: feature matrix and target vector from scikit-learn's Boston loader.
# NOTE(review): load_boston was deprecated and later removed from scikit-learn;
# this cell needs the older release the notebook was recorded with — confirm the
# pinned version before re-running.
boston = load_boston()
X, y = boston.data, boston.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [57]:
# Custom implementation: GBoost regressor using the 'ls' (mean square error) loss option.
bst = GBoost(
    n_estimators=M,
    learning_rate=LEARNING_RATE,
    loss='ls',
)

# Kept as the cell's final expression so the notebook displays what fit() returns.
bst.fit(X_train, y_train)

GBoost(learning_rate=0.1, loss=<function mse_gradient at 0x7f29d22d2b70>,
       n_estimators=100, tol=0.5)

In [58]:
# SKLearn implementation: reference regressor configured with the shared settings.
# max_depth=1 restricts every tree to a single split — presumably to mirror the
# base learner used inside GBoost; confirm against gboost.py.
regressor_params = dict(
    n_estimators=M,
    learning_rate=LEARNING_RATE,
    max_depth=1,
    loss='ls',
)
sklearn_bst = GradientBoostingRegressor(**regressor_params)

# Kept as the cell's final expression so the fitted estimator's repr is displayed.
sklearn_bst.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=1,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [59]:
# Report held-out mean squared error for each regressor, custom model first.
for label, model in (('Custom MSE:', bst), ('SKLearn MSE:', sklearn_bst)):
    print(label, mean_squared_error(y_test, model.predict(X_test)))

Custom MSE: 14.52702166909985
SKLearn MSE: 14.52681979717445


In [60]:
# Classification dataset: synthetic two-class problem with a fixed seed.
# Note that X, y and the train/test splits from the regression section are
# overwritten from this point on.
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=3,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=2,
)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [61]:
# SKLearn implementation: reference classifier with the shared settings and
# single-split trees (max_depth=1); every other option is left at its default.
sklearn_clf = GradientBoostingClassifier(n_estimators=M, learning_rate=LEARNING_RATE, max_depth=1)

# Kept as the cell's final expression so the fitted estimator's repr is displayed.
sklearn_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [62]:
# Custom implementation: GBoost binary classifier using the 'logistic' loss option.
clf = GBoost(
    n_estimators=M,
    learning_rate=LEARNING_RATE,
    loss='logistic',
)

# Kept as the cell's final expression so the notebook displays what fit() returns.
clf.fit(X_train, y_train)

GBoost(learning_rate=0.1, loss=<function logistic_gradient at 0x7f29d2305268>,
       n_estimators=100, tol=0.5)

In [63]:
# Predict once per model and reuse the labels for both metrics; the original
# cell ran each model's prediction twice for no benefit.
custom_labels = clf.predict_class(X_test)
sklearn_labels = sklearn_clf.predict(X_test)

# NOTE(review): both ROC AUC values are computed from hard class labels rather
# than probability scores, so they describe a single operating point instead of
# the full ROC curve. Passing scores (e.g. sklearn_clf.predict_proba(X_test)[:, 1])
# would give the conventional AUC — confirm which GBoost method returns
# probabilities before switching, and change both models together so the
# comparison stays like-for-like.
print('Custom ROC AUC:', roc_auc_score(y_test, custom_labels))
print('SKLearn ROC AUC:', roc_auc_score(y_test, sklearn_labels))
print('\n')
print('Custom Precision:', precision_score(y_test, custom_labels))
print('SKLearn Precision:', precision_score(y_test, sklearn_labels))

Custom ROC AUC: 0.9322489754098362
SKLearn ROC AUC: 0.9597848360655737


Custom Precision: 0.944
SKLearn Precision: 0.9538461538461539
