## Gradient Boosting

reference:https://towardsdatascience.com/all-you-need-to-know-about-gradient-boosting-algorithm-part-2-classification-d3ed8f56541e

In [16]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor 

In [9]:
def logisticloss(y,y_pred):
    l = np.log(1+np.exp(-2*y*y_pred))
    return l

def logisticloss_gradient(y,y_pred):
    ri = 2*y/(1 + np.exp(2*y*y_pred))

    return ri

In [26]:
class gradient_boosting(object):
    def __init__(self,M,base_model="CART",max_depth = 1,learning_rate = 1, method = "classification",loss = "logistic",tol = None):
        self.M = M
        self.base_model = base_model
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.method = method
        self.loss = loss
        self.tol = tol
        self.trees = []

    def fit(self,x,y):

        #initial 
        
        fm_0 = np.log(np.mean(y)/(1-np.mean(y)))
        self.F0 = np.full(len(y), fm_0)

        fm = self.F0.copy()

        for m in range(self.M):
            # compute residual
            p = np.exp(fm)/(1 + np.exp(fm))
            r_im = y - p

            if self.base_model == "CART":
                tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=0)
                tree.fit(x, r_im)
                nodes = tree.apply(x)

                for i in np.unique(nodes):
                    sub = i == nodes
                    gamma = (np.sum(r_im[sub]))/(np.sum(p[sub]*(1-p[sub])))

                    # update 
                    fm[sub] += self.learning_rate*gamma

                    tree.tree_.value[i, 0, 0] = gamma
            
                self.trees.append(tree)
            
    
    def predit(self,x):

        Fm = self.F0

        for m in range(self.M):
            Fm += self.learning_rate * self.trees[m].predict(x)

        prob = np.exp(Fm)/(1 + np.exp(Fm))
        
        return prob




In [27]:
df_heart = pd.read_csv('D:\\STATS4T06\\Datasets\\heartdf.csv')

trainx_heart = df_heart.iloc[:,1:14]

scaler = MinMaxScaler()
trainx_heart['age'] = scaler.fit_transform(trainx_heart[['age']])
trainx_heart['trestbps'] = scaler.fit_transform(trainx_heart[['trestbps']])
trainx_heart['chol'] = scaler.fit_transform(trainx_heart[['chol']])
trainx_heart['thalach'] = scaler.fit_transform(trainx_heart[['thalach']])
trainx_heart['oldpeak'] = scaler.fit_transform(trainx_heart[['oldpeak']])

trainy_heart = df_heart.iloc[:,14]


trainx_heart

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal
0,0.479167,0.292453,0.196347,0.740458,0.161290,1,0,0,1,0,2,2,3
1,0.500000,0.433962,0.175799,0.641221,0.500000,1,0,1,0,1,0,0,3
2,0.854167,0.481132,0.109589,0.412214,0.419355,1,0,0,1,1,0,0,3
3,0.666667,0.509434,0.175799,0.687023,0.000000,1,0,0,1,0,2,1,3
4,0.687500,0.415094,0.383562,0.267176,0.306452,0,0,1,1,0,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.625000,0.433962,0.216895,0.709924,0.000000,1,1,0,1,1,2,0,2
1021,0.645833,0.292453,0.301370,0.534351,0.451613,1,0,0,0,1,1,1,3
1022,0.375000,0.150943,0.340183,0.358779,0.161290,1,0,0,0,1,1,1,2
1023,0.437500,0.150943,0.292237,0.671756,0.000000,0,0,0,0,0,2,0,2


In [38]:
gb = gradient_boosting(M=10,max_depth = 3)
gb.fit(trainx_heart.iloc[:,0:4],trainy_heart)

heart_log_loss = logisticloss(trainy_heart,gb.predit(trainx_heart.iloc[:,0:4]))

heart_log_loss


0       0.693147
1       0.693147
2       0.693147
3       0.693147
4       0.693147
          ...   
1020    0.485399
1021    0.693147
1022    0.693147
1023    0.167605
1024    0.693147
Name: target, Length: 1025, dtype: float64