In [55]:
import numpy as np
class DecisionNode():
    def __init__(self,feature_i = None, threshold = None, value = None,
                 true_branch = None, false_branch = None):
        self.feature_i = feature_i 
        self.threshold = threshold
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch

In [56]:
class XgboostTree():
    
    def __init__(self, min_samples_split =2, min_score  = 1e-7,
                max_depth = float("inf"), loss = None, lambda_ = 1, gamma_ = 1,eps = 0.1 ):
        self.root = None
        self.max_depth = max_depth
        self.lambda_ = lambda_
        self.gamma_ = gamma_
        self.eps = eps
        self.min_samples_split = min_samples_split
        self.min_score= min_score
        self.loss = loss
    
    def split_on_feature(self,X,y,g,h,feature_i,value):
        idx = X[:,feature_i]<=value
        
        Xl,Xr = X[idx],X[~idx]
        yl,yr = y[idx],y[~idx]
        gl,gr = g[idx],g[~idx]
        hl,hr = h[idx],h[~idx]
        return Xl,yl,gl,hl,Xr,yr,gr,hr
    
    
    def find_best_split(self,X,y,g,h):
        score = np.NINF
        split_col_value = None
        G = g.sum()
        H = h.sum()
        n_samples, n_features = np.shape(X)
        for feature_i in range(n_features):
            G_L = 0.; H_L = 0.
            for j in np.argsort(X[:,feature_i]):
                G_L += g[j]
                H_L += h[j]
                G_R = G - G_L
                H_R = H-H_L
                
                score_new = (G_L**2)/(H_L +self.lambda_) +(G_R**2)/(H_R +self.lambda_) -(G**2)/(H +self.lambda_) - self.gamma_
                
                if score_new > score:
                    score = score_new
                    split_col_value = {'score':score,'feature_i':feature_i,'split_value':X[j,feature_i] }
        return split_col_value
    
    def calc_leaf(self,g,h):
        w = -g.sum()/(h.sum()+self.lambda_)
        return w 
    
    def create_tree(self,X,y,g,h,current_depth = 0):
        
        n_samples, n_features = np.shape(X)
        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            
            if self.eps is None:
                split_col_value = self.find_best_split(X,y,g,h)
            else:
                split_col_value = self.approximate_split(X,y,g,h)
                
            if split_col_value is not None and split_col_value['score'] > self.min_score:
                Xl,yl,gl,hl,Xr,yr,gr,hr = self.split_on_feature(X,y,g,h,
                                                       split_col_value['feature_i'],
                                                       split_col_value['split_value'])
                true_branch = self.create_tree(Xl,yl,gl,hl,current_depth +1)
                false_branch = self.create_tree(Xr,yr,gr,hr, current_depth+1)  
                return DecisionNode(feature_i =split_col_value['feature_i'],threshold =split_col_value['split_value'],
                               true_branch = true_branch, false_branch = false_branch)  
        leaf_value = self.calc_leaf(g,h)
        return DecisionNode(value = leaf_value)
    
    
    def predict_value(self,x,tree = None):
        if tree is None:
            tree = self.root
            
        if tree.value is not None:
            return tree.value
        
        feature_values = x[tree.feature_i]
        branch = tree.false_branch
        if isinstance(feature_values, int) or isinstance(feature_values, float):
            if feature_values <= tree.threshold:
                branch = tree.true_branch
            elif feature_values == tree.threshold:
                branch = tree.true_branch
        
        return self.predict_value(x,branch)
    
    def predict(self,X):
        y_pred = [self.predict_value(sample) for sample in X]
        return np.array(y_pred)
    
    
    def calc_grad_hessain(self,y,pred_y):
        _,g,h = self.loss(y,pred_y)
        return g,h
    
    def fit(self,X,y,pred_y):
        g,h = self.calc_grad_hessain(y,pred_y)
        self.root = self.create_tree(X,y,g,h)
        return self
    
    
    def approximate_split(self,X,y,g,h):
        """
        Approximate algorithm using the weigted quantile sketch
        """
        score = np.NINF
        split_col_value = None
        G = g.sum()
        H = h.sum()
        n_samples, n_features = np.shape(X)
        for feature_i in range(n_features):
            x_h = np.vstack([X[:,feature_i],g,h]).T
            x_h = x_h[x_h[:,0].argsort()]
            x_h[:,2] = x_h[:,2]/x_h[:,2].sum()
            hh = x_h[:,2].cumsum()
            start = 0;j=1
            while True:
                if hh[j]-start >= self.eps:
                    start = hh[j-1]
                    G_L = x_h[:j,1].sum(); H_L =x_h[:j,2].sum()
                    G_R = G-G_L; H_R = H-H_L
                    score_new = (G_L**2)/(H_L +self.lambda_) +(G_R**2)/(H_R +self.lambda_) -(G**2)/(H +self.lambda_) - self.gamma_
                    if score_new > score:
                        score = score_new
                        split_col_value = {'score':score,'feature_i':feature_i,'split_value':x_h[j-1,0] }
                        
                    # G_L = x_h[:(j+1),1].sum(); H_L =x_h[:(j+1),2].sum() 
                    # G_R = G-G_L; H_R = H-H_L
                    # score_new = (G_L**2)/(H_L +self.lambda_) +(G_R**2)/(H_R +self.lambda_) -(G**2)/(H +self.lambda_)
                    # if score_new > score:
                    #     score = score_new
                    #     split_col_value = {'score':score,'feature_i':feature_i,'split_value':X[j,feature_i] }
                j +=1
                if j == n_samples:
                    break   
        return split_col_value

        

In [57]:
def square_loss(y,pred_y):
    loss = np.sum((y-pred_y)**2)
    g = 2*(pred_y - y)
    h = np.full(g.shape,2.)
    return loss,g,h

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
    
# first order gradient logLoss
def sigmoid_loss( labels,preds):
    preds = sigmoid(preds)
    loss =-(labels*np.log(preds) + (1-labels)*np.log(1-preds)).mean()
    g =preds - labels 
    h =preds * (1 - preds)
    return loss,g,h

In [58]:
class xgboost:
    """
    n_iters: the number of weak learners.
    lr: the learning rate of each learner.
    """
    def __init__(self,n_iters = 10,lr = 0.1,loss = None ):
        self.n_iters = 10
        self.loss = loss
        self.lr = lr
        self.models = []
    def fit(self,X,y,min_samples_split =2, min_score  = 1e-7,
                max_depth = float("inf"), lambda_ = 1, gamma_ = 1,eps = 0.1):
        y0 = np.full(y.shape,1.)
        y_pred = y0.flatten()
        for epoch in range(self.n_iters):
            boosting_tree = XgboostRegressionTree(loss =self.loss,eps = eps,min_samples_split =min_samples_split, min_score  = min_score,
                max_depth = max_depth, lambda_ = lambda_, gamma_ = gamma_)
            boosting_tree.fit(X,y,y_pred)
            y_pred += self.lr*boosting_tree.predict(X)
            self.models.append(boosting_tree)
    def predict(self,X):
        pred = np.zeros(X.shape[0])
        for model in self.models:
            pred += self.lr*model.predict(X)
        return np.full((X.shape[0], 1), 1.).flatten().astype('float64') + pred 

In [44]:
def xgboost(X,y,n_iters =5,loss =square_loss,eps = 0.1):
    models = []
    y0 = np.full(y.shape,1.)
    y_pred = y0.flatten()
    for epoch in range(n_iters):
        boosting_tree = XgboostTree(loss =loss,eps = eps)
        boosting_tree.fit(X,y,y_pred)
        y_pred += 0.1*boosting_tree.predict(X)
        models.append(boosting_tree)
    
    return(models)

In [45]:
def xgboost_predict(X,y,loss,eps = 0.1):
    pred = np.zeros(X.shape[0])
    for model in xgboost(X,y,loss=loss,eps = eps):
        pred += 0.1*model.predict(X)
    return np.full((X.shape[0], 1), 1.).flatten().astype('float64') + pred

In [59]:
np.random.seed(1)
K =20
p =2
N =300
mu = np.random.normal(0.,4.,(K,2))
component = np.random.randint(0,K,(N,))
assignment = np.random.randint(0,2,(K,))
X = mu[component,:] + np.random.normal(0.,1.,(N,p))
y = assignment[component]

In [62]:
xgb = xgboost(n_iters = 10,lr = 0.1,loss = sigmoid_loss)
xgb.fit(X,y,eps=0.1)
r =xgb.predict(X)
predicted_probas = sigmoid(r)
# manually set up the threshold for the binary classification
# the xgboost.predict returns raw scores for each observation
preds = np.where(predicted_probas > np.mean(predicted_probas), 1, 0)

In [61]:
(preds ==y).mean()

0.91

In [49]:
import numpy as np
import pandas as pd
from math import e

class Node:
    
    '''
    A node object that is recursivly called within itslef to construct a regression tree. Based on Tianqi Chen's XGBoost 
    the internal gain used to find the optimal split value uses both the gradient and hessian. Also a weighted quantlie sketch 
    and optimal leaf values all follow Chen's description in "XGBoost: A Scalable Tree Boosting System" the only thing not 
    implemented in this version is sparsity aware fitting or the ability to handle NA values with a default direction.
    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas datframe of the training data
    gradient: negative gradient of the loss function
    hessian: second order derivative of the loss function
    idxs: used to keep track of samples within the tree structure
    subsample_cols: is an implementation of layerwise column subsample randomizing the structure of the trees
    (complexity parameter)
    min_leaf: minimum number of samples for a node to be considered a node (complexity parameter)
    min_child_weight: sum of the heassian inside a node is a meaure of purity (complexity parameter)
    depth: limits the number of layers in the tree
    lambda: L2 regularization term on weights. Increasing this value will make model more conservative.
    gamma: This parameter also prevents over fitting and is present in the the calculation of the gain (structure score). 
    As this is subtracted from the gain it essentially sets a minimum gain amount to make a split in a node.
    eps: This parameter is used in the quantile weighted skecth or 'approx' tree method roughly translates to 
    (1 / sketch_eps) number of bins
    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosintg.
    '''

    def __init__(self, x, gradient, hessian, idxs, subsample_cols = 0.8 , min_leaf = 5, min_child_weight = 1 ,depth = 10, lambda_ = 1, gamma = 1, eps = 0.1):
      
        self.x, self.gradient, self.hessian = x, gradient, hessian
        self.idxs = idxs 
        self.depth = depth
        self.min_leaf = min_leaf
        self.lambda_ = lambda_
        self.gamma  = gamma
        self.min_child_weight = min_child_weight
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.column_subsample = np.random.permutation(self.col_count)[:round(self.subsample_cols*self.col_count)]
        
        self.val = self.compute_gamma(self.gradient[self.idxs], self.hessian[self.idxs])
          
        self.score = float('-inf')
        self.find_varsplit()
        
        
    def compute_gamma(self, gradient, hessian):
        '''
        Calculates the optimal leaf value equation (5) in "XGBoost: A Scalable Tree Boosting System"
        '''
        return(-np.sum(gradient)/(np.sum(hessian) + self.lambda_))
        
    def find_varsplit(self):
        '''
        Scans through every column and calcuates the best split point.
        The node is then split at this point and two new nodes are created.
        Depth is only parameter to change as we have added a new layer to tre structure.
        If no split is better than the score initalised at the begining then no splits further splits are made
        '''
        for c in self.column_subsample: self.find_greedy_split(c)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(x = self.x, gradient = self.gradient, hessian = self.hessian, idxs = self.idxs[lhs], min_leaf = self.min_leaf, depth = self.depth-1, lambda_ = self.lambda_ , gamma = self.gamma, min_child_weight = self.min_child_weight, eps = self.eps, subsample_cols = self.subsample_cols)
        self.rhs = Node(x = self.x, gradient = self.gradient, hessian = self.hessian, idxs = self.idxs[rhs], min_leaf = self.min_leaf, depth = self.depth-1, lambda_ = self.lambda_ , gamma = self.gamma, min_child_weight = self.min_child_weight, eps = self.eps, subsample_cols = self.subsample_cols)
        
    def find_greedy_split(self, var_idx):
        '''
         For a given feature greedily calculates the gain at each split.
         Globally updates the best score and split point if a better split point is found
        '''
        x = self.x[self.idxs, var_idx]
        
        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            
            lhs_indices = np.nonzero(x <= x[r])[0]
            rhs_indices = np.nonzero(x > x[r])[0]
            if(rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf 
               or self.hessian[lhs_indices].sum() < self.min_child_weight
               or self.hessian[rhs_indices].sum() < self.min_child_weight): continue

            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score: 
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]
                
    def weighted_qauntile_sketch(self, var_idx):
        '''
        XGBOOST Mini-Version
        Yiyang "Joe" Zeng
        Is an approximation to the eact greedy approach faster for bigger datasets wher it is not feasible
        to calculate the gain at every split point. Uses equation (8) and (9) from "XGBoost: A Scalable Tree Boosting System"
        '''
        x = self.x[self.idxs, var_idx]
        hessian_ = self.hessian[self.idxs]
        df = pd.DataFrame({'feature':x,'hess':hessian_})
        
        df.sort_values(by=['feature'], ascending = True, inplace = True)
        hess_sum = df['hess'].sum() 
        df['rank'] = df.apply(lambda x : (1/hess_sum)*sum(df[df['feature'] < x['feature']]['hess']), axis=1)
        
        for row in range(df.shape[0]-1):
            # look at the current rank and the next ran
            rk_sk_j, rk_sk_j_1 = df['rank'].iloc[row:row+2]
            diff = abs(rk_sk_j - rk_sk_j_1)
            if(diff >= self.eps):
                continue
                
            split_value = (df['rank'].iloc[row+1] + df['rank'].iloc[row])/2
            lhs = x <= split_value
            rhs = x > split_value
            
            lhs_indices = np.nonzero(x <= split_value)[0]
            rhs_indices = np.nonzero(x > split_value)[0]
            if(rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf 
               or self.hessian[lhs_indices].sum() < self.min_child_weight
               or self.hessian[rhs_indices].sum() < self.min_child_weight): continue
                
            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score: 
                self.var_idx = var_idx
                self.score = curr_score
                self.split = split_value
                
    def gain(self, lhs, rhs):
        '''
        Calculates the gain at a particular split point bases on equation (7) from
        "XGBoost: A Scalable Tree Boosting System"
        '''
        gradient = self.gradient[self.idxs]
        hessian  = self.hessian[self.idxs]
        
        lhs_gradient = gradient[lhs].sum()
        lhs_hessian  = hessian[lhs].sum()
        
        rhs_gradient = gradient[rhs].sum()
        rhs_hessian  = hessian[rhs].sum()
        
        gain = 0.5 *( (lhs_gradient**2/(lhs_hessian + self.lambda_)) + (rhs_gradient**2/(rhs_hessian + self.lambda_)) - ((lhs_gradient + rhs_gradient)**2/(lhs_hessian + rhs_hessian + self.lambda_))) - self.gamma
        return(gain)
                
    @property
    def split_col(self):
        '''
        splits a column 
        '''
        return self.x[self.idxs , self.var_idx]
                
    @property
    def is_leaf(self):
        '''
        checks if node is a leaf
        '''
        return self.score == float('-inf') or self.depth <= 0                 

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])
    
    def predict_row(self, xi):
        if self.is_leaf:
            return(self.val)

        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)

    
class XGBoostTree:
    '''
    Wrapper class that provides a scikit learn interface to the recursive regression tree above
    
    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas datframe of the training data
    gradient: negative gradient of the loss function
    hessian: second order derivative of the loss function
    idxs: used to keep track of samples within the tree structure
    subsample_cols: is an implementation of layerwise column subsample randomizing the structure of the trees
    (complexity parameter)
    min_leaf: minimum number of samples for a node to be considered a node (complexity parameter)
    min_child_weight: sum of the heassian inside a node is a meaure of purity (complexity parameter)
    depth: limits the number of layers in the tree
    lambda: L2 regularization term on weights. Increasing this value will make model more conservative.
    gamma: This parameter also prevents over fitting and is present in the the calculation of the gain (structure score). 
    As this is subtracted from the gain it essentially sets a minimum gain amount to make a split in a node.
    eps: This parameter is used in the quantile weighted skecth or 'approx' tree method roughly translates to 
    (1 / sketch_eps) number of bins
    
    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosintg.
    
    '''
    def fit(self, x, gradient, hessian, subsample_cols = 0.8 , min_leaf = 5, min_child_weight = 1 ,depth = 10, lambda_ = 1, gamma = 1, eps = 0.1):
        self.dtree = Node(x, gradient, hessian, np.array(np.arange(len(x))), subsample_cols, min_leaf, min_child_weight, depth, lambda_, gamma, eps)
        return self
    
    def predict(self, X):
        return self.dtree.predict(X)
   
   
class XGBoostClassifier:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System" for 
    Binary Classification.
    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas datframe of the training data
    gradient: negative gradient of the loss function
    hessian: second order derivative of the loss function
    idxs: used to keep track of samples within the tree structure
    subsample_cols: is an implementation of layerwise column subsample randomizing the structure of the trees
    (complexity parameter)
    min_leaf: minimum number of samples for a node to be considered a node (complexity parameter)
    min_child_weight: sum of the heassian inside a node is a meaure of purity (complexity parameter)
    depth: limits the number of layers in the tree
    lambda: L2 regularization term on weights. Increasing this value will make model more conservative.
    gamma: This parameter also prevents over fitting and is present in the the calculation of the gain (structure score). 
    As this is subtracted from the gain it essentially sets a minimum gain amount to make a split in a node.
    eps: This parameter is used in the quantile weighted skecth or 'approx' tree method roughly translates to 
    (1 / sketch_eps) number of bins
    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosintg.
    '''
    def __init__(self):
        self.estimators = []
    
    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))
    
    # first order gradient logLoss
    def grad(self, preds, labels):
        preds = self.sigmoid(preds)
        return(preds - labels)
    
    # second order gradient logLoss
    def hess(self, preds, labels):
        preds = self.sigmoid(preds)
        return(preds * (1 - preds))
    
    @staticmethod
    def log_odds(column):
        binary_yes = np.count_nonzero(column == 1)
        binary_no  = np.count_nonzero(column == 0)
        return(np.log(binary_yes/binary_no))
    
    
    def fit(self, X, y, subsample_cols = 0.8 , min_child_weight = 1, depth = 5, min_leaf = 5, learning_rate = 0.4, boosting_rounds = 5, lambda_ = 1.5, gamma = 1, eps = 0.1):
        self.X, self.y = X, y
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight 
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds 
        self.lambda_ = lambda_
        self.gamma  = gamma
    
        self.base_pred = np.full((X.shape[0], 1), 1).flatten().astype('float64')
    
        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth = self.depth, min_leaf = self.min_leaf, lambda_ = self.lambda_, gamma = self.gamma, eps = self.eps, min_child_weight = self.min_child_weight, subsample_cols = self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)
          
    def predict_proba(self, X):
        pred = np.zeros(X.shape[0])
        
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X) 
          
        return(self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred))
    
    def predict(self, X):
        pred = np.zeros(X.shape[0])
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X) 
        
        predicted_probas = self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred)
        preds = np.where(predicted_probas > np.mean(predicted_probas), 1, 0)
        return(preds)
       
class XGBoostRegressor:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System" for 
    regression.
    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas datframe of the training data
    gradient: negative gradient of the loss function
    hessian: second order derivative of the loss function
    idxs: used to keep track of samples within the tree structure
    subsample_cols: is an implementation of layerwise column subsample randomizing the structure of the trees
    (complexity parameter)
    min_leaf: minimum number of samples for a node to be considered a node (complexity parameter)
    min_child_weight: sum of the heassian inside a node is a meaure of purity (complexity parameter)
    depth: limits the number of layers in the tree
    lambda: L2 regularization term on weights. Increasing this value will make model more conservative.
    gamma: This parameter also prevents over fitting and is present in the the calculation of the gain (structure score). 
    As this is subtracted from the gain it essentially sets a minimum gain amount to make a split in a node.
    eps: This parameter is used in the quantile weighted skecth or 'approx' tree method roughly translates to 
    (1 / sketch_eps) number of bins
    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosintg.
    '''
    def __init__(self):
        self.estimators = []
    
    # first order gradient mean squared error
    @staticmethod
    def grad(preds, labels):
        return(2*(preds-labels))
    
    # second order gradient logLoss
    @staticmethod
    def hess(preds, labels):
        '''
        hessian of mean squared error is a constant value of two 
        returns an array of twos
        '''
        return(np.full((preds.shape[0], 1), 2).flatten().astype('float64'))
    
    
    def fit(self, X, y, subsample_cols = 0.8 , min_child_weight = 1, depth = 5, min_leaf = 5, learning_rate = 0.4, boosting_rounds = 5, lambda_ = 1.5, gamma = 1, eps = 0.1):
        self.X, self.y = X, y
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight 
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds 
        self.lambda_ = lambda_
        self.gamma  = gamma
    
        self.base_pred = np.full((X.shape[0], 1), np.mean(y)).flatten().astype('float64')
    
        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth = self.depth, min_leaf = self.min_leaf, lambda_ = self.lambda_, gamma = self.gamma, eps = self.eps, min_child_weight = self.min_child_weight, subsample_cols = self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)
          
    def predict(self, X):
        pred = np.zeros(X.shape[0])
        
        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X) 
          
        return np.full((X.shape[0], 1), np.mean(self.y)).flatten().astype('float64') + pred

In [50]:
a = XGBoostClassifier()

In [51]:
a.fit(X, y, subsample_cols = 1, min_child_weight = 1, depth = 5, min_leaf = 2, learning_rate = 0.1, boosting_rounds = 5, lambda_ = 1, gamma = 1, eps = 0.1)

In [39]:
(a.predict(X) ==y).mean()

0.8633333333333333

In [54]:
b = a.predict_proba(X).mean()
b

0.580765290274654