Modify the Gradient Boosting scratch code from our lecture such that:
- Notice that we are still using max_depth = 1.  Try tweaking min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we only wrote scratch code for gradient boosting for regression; add some code so that it also works for binary classification.  Load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and check that it works.
- Put everything into a class.

In [6]:
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [126]:
class GradientBoosting:
    """Gradient boosting over regression trees.

    The ensemble starts from a constant mean predictor and adds ``S``
    regression trees, each fitted to the residual between the targets and
    the current prediction and scaled by the learning rate ``alpha``.

    ``method`` selects how raw scores are turned into predictions:

    * ``'regression'`` -- the raw score is the prediction.
    * ``'binary'`` / ``'multiclass'`` -- the raw scores (one column per
      class) go through a softmax; ``predict`` returns the arg-max class.

    For classification, ``fit`` accepts either a one-hot encoded 2-D target
    (``predict`` then returns column indices) or a 1-D label vector
    (``predict`` then returns the original labels).
    """

    _CLASSIFICATION_METHODS = ('binary', 'multiclass')
    _METHODS = ('regression',) + _CLASSIFICATION_METHODS

    def __init__ (self, method = 'regression', S=5, alpha = 0.1, max_depth = 1, min_samples_split = 2):
        """Store the hyper-parameters and create the (unfitted) estimators.

        Args:
            method: 'regression', 'binary' or 'multiclass'.
            S: number of boosted trees added after the initial mean model.
            alpha: learning rate applied to every tree's prediction.
            max_depth: maximum depth of each regression tree.
            min_samples_split: minimum samples needed to split a tree node.

        Raises:
            ValueError: if ``method`` is not one of the supported values.
        """
        self.S = S
        self.alpha = alpha
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.method = method
        # Fail fast instead of only discovering a typo at predict time.
        self._check_method()

        # Set by fit() when 1-D class labels are given; None otherwise.
        self.classes_ = None

        tree_params = {'max_depth' : self.max_depth, 'min_samples_split' : self.min_samples_split}
        self.models = [DecisionTreeRegressor(**tree_params) for _ in range(S)]
        # models[0] is the constant baseline; models[1:] are the boosted trees.
        self.models.insert(0, DummyRegressor(strategy= 'mean'))

    def _check_method(self):
        """Raise ValueError when ``self.method`` is not a supported mode."""
        if self.method not in self._METHODS:
            raise ValueError ('Please choose method between "binary", "multiclass" and "regression"! ')

    @staticmethod
    def _softmax(scores):
        """Row-wise softmax of a 2-D score array.

        The row maximum is subtracted first so ``np.exp`` cannot overflow;
        this does not change the mathematical result.
        """
        shifted = scores - np.max(scores, axis = 1, keepdims = True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis = 1, keepdims = True)

    def _link(self, raw):
        """Map raw ensemble scores to the prediction scale of ``method``."""
        if self.method in self._CLASSIFICATION_METHODS:
            return self._softmax(raw)
        return raw

    def residual(self, y, h):
        """Return ``y - h``, the target each new tree is fitted to."""
        return (y - h)

    def fit(self, X, y):
        """Fit the baseline model and the ``S`` boosted trees.

        Args:
            X: training features.
            y: regression targets, a one-hot encoded class matrix, or (for
                classification) a 1-D vector of class labels.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``method`` is not one of the supported values.
        """
        self._check_method()
        y = np.asarray(y)

        self.classes_ = None
        if self.method in self._CLASSIFICATION_METHODS and y.ndim == 1:
            # One-hot encode plain labels so the softmax has one column per class.
            self.classes_, codes = np.unique(y, return_inverse = True)
            y = np.eye(len(self.classes_))[codes]

        self.models[0].fit(X, y)
        h0 = self.models[0].predict(X)

        # Keep a running sum of the scaled tree outputs instead of asking every
        # earlier tree to predict again on each round.
        boosting = 0
        for tree in self.models[1:]:
            y_hat = self._link(h0 + boosting)
            tree.fit(X, self.residual(y, y_hat))
            boosting = boosting + self.alpha * tree.predict(X)
        return self

    def predict(self, X, models = None, with_argmax = True):
        """Predict with the ensemble (or with an explicit list of models).

        Args:
            X: features to predict for.
            models: optional list whose first element is the baseline model
                and whose remaining elements are boosted trees; defaults to
                ``self.models``.
            with_argmax: classification only. When True return the predicted
                class per row, otherwise return the softmax probabilities.

        Returns:
            Regression: the raw boosted prediction. Classification: class
            predictions or probabilities, depending on ``with_argmax``.

        Raises:
            ValueError: if ``method`` is not one of the supported values.
        """
        self._check_method()
        if models is None:
            models = self.models
        h0 = models[0].predict(X)
        boosting = sum(self.alpha * model.predict(X) for model in models[1:])
        y_hat = h0 + boosting

        if self.method in self._CLASSIFICATION_METHODS:
            y_hat = self._softmax(y_hat)
            if with_argmax:
                y_hat = np.argmax(y_hat, axis = 1)
                if self.classes_ is not None:
                    # fit() saw raw labels, so translate indices back to them.
                    y_hat = self.classes_[y_hat]
        return y_hat

In [127]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

# Fetch the Boston housing features (X) and targets (y) as plain arrays.
# NOTE(review): load_boston is not available in recent scikit-learn
# releases -- confirm the installed version before re-running this cell.
X, y = load_boston(return_X_y=True)

In [128]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Show the shape of each split, one per line.
print(
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
    sep="\n",
)
# print(y_train)

(354, 13)
(152, 13)
(354,)
(152,)


In [129]:
from sklearn.ensemble import GradientBoostingRegressor

# Number of boosting rounds, shared by both models so the comparison is fair.
n_estimators = 200

# Our scratch implementation, with deeper trees than the lecture's max_depth = 1.
model = GradientBoosting(method = 'regression', S = n_estimators, alpha= 0.1, max_depth= 3, min_samples_split= 3)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

print("MSE: ", mean_squared_error(y_test, y_hat))

#=====SKlearn========
# Compare to sklearn with the same learning rate, depth and number of trees.
# The loss is left at its default, squared error, which matches the residuals
# our implementation fits. The explicit 'ls' spelling was renamed to
# 'squared_error' in scikit-learn 1.0 and removed in 1.2, so omitting the
# argument keeps this cell working across versions.
sklearn_model = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=3,
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn MSE: ", mean_squared_error(y_test, yhat_sk))

MSE:  7.280103220453387
Sklearn MSE:  7.687009933213661


In [130]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Fetch the breast cancer features (X) and class labels (y) as plain arrays.
X, y = load_breast_cancer(return_X_y=True)
# print(y)

In [131]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# One-hot encode the training labels: entry (i, c) is 1.0 when y_train[i] == c,
# with one column per distinct label in y.
class_ids = np.arange(len(set(y)))
y_train_encoded = (y_train[:, None] == class_ids).astype(float)

# Our scratch implementation in binary-classification mode.
model = GradientBoosting(
    method='binary',
    S=200,
    alpha=0.1,
    max_depth=3,
    min_samples_split=3,
)
model.fit(X_train, y_train_encoded)
y_hat = model.predict(X_test)

print("Our accuracy: ", accuracy_score(y_test, y_hat))


#=====SKlearn========
# Reference run: sklearn's classifier with the same learning rate and depth,
# trained on the raw (not one-hot) labels.
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=0.1,
    max_depth=3,
)
sklearn_model.fit(X_train, y_train)

yhat_sk = sklearn_model.predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Our accuracy:  0.9649122807017544
Sklearn accuracy:  0.9649122807017544


In [132]:
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Fetch the digits features (X) and class labels (y) as plain arrays.
X, y = load_digits(return_X_y=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# One-hot encode the training labels: entry (i, c) is 1.0 when y_train[i] == c,
# with one column per distinct label in y.
class_ids = np.arange(len(set(y)))
y_train_encoded = (y_train[:, None] == class_ids).astype(float)

# Our scratch implementation in multiclass mode.
model = GradientBoosting(
    method='multiclass',
    S=200,
    alpha=0.1,
    max_depth=3,
    min_samples_split=3,
)
model.fit(X_train, y_train_encoded)
y_hat = model.predict(X_test)

# Report the test-set accuracy of our model.
print("Our accuracy: ", accuracy_score(y_test, y_hat))

#=====SKlearn========
# Reference run: sklearn's classifier with the same learning rate and depth,
# trained on the raw (not one-hot) labels.
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=0.1,
    max_depth=3,
)
sklearn_model.fit(X_train, y_train)

yhat_sk = sklearn_model.predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Our accuracy:  0.924074074074074
Sklearn accuracy:  0.9555555555555556
