## CART regression tree

In [None]:
class Node:
    def __init__(self, score = None):
        self.score = score
        self.left = None
        self.right = None
        self.feature = None
        self.split = None

## Gradient Boosting

reference:https://towardsdatascience.com/all-you-need-to-know-about-gradient-boosting-algorithm-part-2-classification-d3ed8f56541e

In [58]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score

In [9]:
def logisticloss(y,y_pred):
    l = np.log(1+np.exp(-2*y*y_pred))
    return l

def logisticloss_gradient(y,y_pred):
    ri = 2*y/(1 + np.exp(2*y*y_pred))

    return ri

In [94]:
class gradient_boosting(object):
    def __init__(self,M,base_model="CART",max_depth = 1,learning_rate = 1, method = "binary classification",loss = "logistic",tol = None):
        self.M = M
        self.base_model = base_model
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.method = method
        self.loss = loss
        self.tol = tol
        self.trees = []
        self.num_class = None
        self.col_names = None

    def fit(self,x,y):

        one_hot = None

        # binary classification#############################################################################################

        # initial prediction
        if self.method == "binary classification":

            self.num_class = 2

            F0 = np.log(np.mean(y)/(1-np.mean(y)))
            self.F0 = np.array([F0]*len(y))

            # mth tree's predict value
            fm = self.F0.copy()
        

        # multiclass classification#########################################################################################

        

        if self.method == "multiclass classification":
            # count number of classes 
            num_classes = y.nunique()
            self.num_class = num_classes

            # one-hot encoding
            one_hot = pd.get_dummies(y)

            # get column names
            self.col_names = one_hot.columns.tolist()
            
            # initial prediction
            F0 = []

            for i in range(num_classes):
                coli = one_hot.iloc[:,i]
                num_class_i = coli.value_counts()[1]
                F0.append(np.array([num_class_i/len(y)]*len(y)))
            
            
            self.F0 = F0

            fm = self.F0.copy()

        # binary classification######################################################################################################

        if self.method == "binary classification":

            for m in range(self.M):
                
                
                # compute residual
                p = np.exp(fm)/(1 + np.exp(fm))
                r_im = y - p

                if self.base_model == "CART":
                    #fit a regression tree using residual
                    tree = DecisionTreeRegressor(max_depth=self.max_depth)
                    tree.fit(x, r_im)
                    nodes = tree.apply(x)

                    for i in np.unique(nodes):

                        # find subset of data points belong to node i
                        sub = i == nodes

                        #compute the leaf value for node i
                        gamma = (np.sum(r_im[sub]))/(np.sum(p[sub]*(1-p[sub])))

                        # update predict value for all data points belong to node i
                        fm[sub] += self.learning_rate*gamma

                        # update leaf value
                        tree.tree_.value[i, 0, 0] = gamma
                
                    self.trees.append(tree)
            
        # multiclass classification############################################################################################################

        if self.method == "multiclass classification":

            for m in range(self.M):
                K = y.nunique()

                if self.base_model == "CART":
                    # compute p
                    p = []
                    ktrees = []

                    # compute denominator

                    den = np.array([0]*len(y)).astype(np.float64)
                    for l in range(K):
                        den += np.exp(fm[l])


                    for k in range(K):

                        # kth tree
                        p_km = np.exp(fm[k]) / den
                        p.append(p_km)

                        # compute residual for all y
                        r_km = one_hot[k] - p_km

                    

                        # fit kth CART tree
                        tree = DecisionTreeRegressor(max_depth=self.max_depth)
                        tree.fit(x, r_km)
                        nodes = tree.apply(x)

                        for i in np.unique(nodes):

                            # find subset of data points belong to node i
                            sub = i == nodes

                            #compute the leaf value for node i
                            gamma = ((K-1)/K) * (np.sum(np.abs(r_km[sub]))/(np.sum(np.abs(r_km[sub])*(1-np.abs(r_km[sub])))))

                            # update predict value for all data points belong to node i
                            fm[k][sub] += self.learning_rate*gamma

                            # update leaf value
                            tree.tree_.value[i, 0, 0] = gamma
                
                        ktrees.append(tree)

                    self.trees.append(ktrees)






            
    
    
    def predit(self,x):
        if self.method == "binary classification":
            Fm = self.F0

            for m in range(self.M):
                Fm += self.learning_rate * self.trees[m].predict(x)

            prob = np.exp(Fm)/(1 + np.exp(Fm))
            
            return prob


        if self.method == "multiclass classification":
            Fm = self.F0

            # create probability dataframe
            prob = pd.DataFrame(columns=self.col_names)

            for m in range(self.M):
                for k in range(self.num_class):

                    Fm[k] += self.learning_rate * self.trees[m][k].predict(x)


            den = np.array([0]*len(x)).astype(np.float64)
            for l in range(self.num_class):
                den += np.exp(Fm[l])

            for k in range(self.num_class):

                # kth tree
                p_km = np.exp(Fm[k]) / den
                prob.iloc[:, k] = p_km


            
            return prob




In [66]:
# Create a sample dataframe
data = {
        'Color': [1, 1, 0, 2, 1, 2]}
df = pd.DataFrame(data)

# Perform one-hot encoding on the 'Color' column
one_hot_encoded = pd.get_dummies(df['Color'])

# Add the one-hot encoded columns to the original dataframe
df = pd.concat([df, one_hot_encoded], axis=1)

# Print the result
print(df)

   Color  0  1  2
0      1  0  1  0
1      1  0  1  0
2      0  1  0  0
3      2  0  0  1
4      1  0  1  0
5      2  0  0  1


In [83]:
np.abs(np.array([1,-1,-2,0]))

array([1, 1, 2, 0])

## Heart data

In [42]:
df_heart = pd.read_csv('D:\\STATS4T06\\Datasets\\heartdf.csv')

trainx_heart = df_heart.iloc[:,1:14]

scaler = MinMaxScaler()
trainx_heart['age'] = scaler.fit_transform(trainx_heart[['age']])
trainx_heart['trestbps'] = scaler.fit_transform(trainx_heart[['trestbps']])
trainx_heart['chol'] = scaler.fit_transform(trainx_heart[['chol']])
trainx_heart['thalach'] = scaler.fit_transform(trainx_heart[['thalach']])
trainx_heart['oldpeak'] = scaler.fit_transform(trainx_heart[['oldpeak']])

trainy_heart = df_heart.iloc[:,14]


trainx_heart

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal
0,0.479167,0.292453,0.196347,0.740458,0.161290,1,0,0,1,0,2,2,3
1,0.500000,0.433962,0.175799,0.641221,0.500000,1,0,1,0,1,0,0,3
2,0.854167,0.481132,0.109589,0.412214,0.419355,1,0,0,1,1,0,0,3
3,0.666667,0.509434,0.175799,0.687023,0.000000,1,0,0,1,0,2,1,3
4,0.687500,0.415094,0.383562,0.267176,0.306452,0,0,1,1,0,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.625000,0.433962,0.216895,0.709924,0.000000,1,1,0,1,1,2,0,2
1021,0.645833,0.292453,0.301370,0.534351,0.451613,1,0,0,0,1,1,1,3
1022,0.375000,0.150943,0.340183,0.358779,0.161290,1,0,0,0,1,1,1,2
1023,0.437500,0.150943,0.292237,0.671756,0.000000,0,0,0,0,0,2,0,2


In [95]:
gb = gradient_boosting(M=10,max_depth = 3,method="multiclass classification")
gb.fit(trainx_heart.iloc[:,0:4],trainy_heart)

# heart_log_loss = logisticloss(trainy_heart,gb.predit(trainx_heart.iloc[:,0:4]))

# heart_log_loss


In [99]:
np.unique(gb.predit(trainx_heart.iloc[:,0:4]).iloc[:,0])

#np.where(gb.predit(trainx_heart.iloc[:,0:4]) > 0.5, 1, 0)

  prob.iloc[:, k] = p_km


array([0.49341501, 0.49341501, 0.49341501, 0.49341501, 0.49341501,
       0.49341501, 0.49341501])

In [63]:
accuracy_score(trainy_heart, np.where(gb.predit(trainx_heart.iloc[:,0:4]) > 0.5, 1, 0))
adjusted_rand_score(trainy_heart, np.where(gb.predit(trainx_heart.iloc[:,0:4]) > 0.5, 1, 0))

0.5921264209375903

## Wine data