In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [2]:
class GdBoostClassifier():
    """Binary gradient-boosting classifier built from depth-3 scikit-learn trees.

    The first stage is a ``DecisionTreeClassifier``; its predicted probability
    of the second class (the larger label after sorting) is the initial score.
    Every further stage is a ``DecisionTreeRegressor`` fitted to the current
    residuals, and its prediction is added to the score scaled by
    ``learning_rate``.  A final score of 0.5 or more is classified as the
    second class.

    Training and test data are read from delimited text files whose names and
    layout are requested interactively with ``input()``.

    Attributes set by ``fit``:
        X, y   -- training predictors and response
        ytype  -- the two class labels in sorted order
        mod    -- list of fitted trees (classifier first, then regressors)
    """

    def __init__(self, B=None, learning_rate=None):
        """Store the boosting hyper-parameters.

        Parameters
        ----------
        B : int, optional
            Total number of trees in the ensemble (the initial classifier plus
            ``B - 1`` regression trees).  Prompted for when omitted.
        learning_rate : float, optional
            Shrinkage applied to every regression tree.  Prompted for when
            omitted.

        Raises
        ------
        ValueError
            If ``B`` is smaller than 1 (the ensemble always contains the
            initial classifier) or a value cannot be converted to a number.
        """
        if B is None:
            B = input('Enter the number of Boosting [ex) 101] : ')
        self.B = int(B)
        if self.B < 1:
            raise ValueError(f'B must be at least 1, got {self.B}')

        if learning_rate is None:
            learning_rate = input('Enter the Learning rate [ex) 0.1] : ')
        self.learning_rate = float(learning_rate)

    def fit(self):
        """Prompt for the training file and fit the boosted ensemble.

        Raises
        ------
        ValueError
            If the response column does not contain exactly two classes.
        """
        self.__input_train()
        X = self.X
        ytype, newy = self._recode_labels(self.y)
        self.ytype = ytype

        # Stage 0: a classification tree provides the initial probability
        # of the class coded as 1.
        tree = DecisionTreeClassifier(max_depth=3)
        tree.fit(X, newy)
        resid = newy - tree.predict_proba(X)[:, 1]

        # Stages 1..B-1: each regression tree is fitted to what is left over
        # after the shrunken contributions of the previous stages.
        mod = [tree]
        for _ in range(self.B - 1):
            stage = DecisionTreeRegressor(max_depth=3)
            stage.fit(X, resid)
            mod.append(stage)
            resid = resid - self.learning_rate * stage.predict(X)

        self.mod = mod

    def _recode_labels(self, y):
        """Return ``(ytype, newy)`` for a binary response.

        ``ytype`` holds the two distinct labels in sorted order and ``newy`` is
        a float array that is 1.0 where ``y`` equals ``ytype[1]`` and 0.0
        elsewhere.  The comparison is positional, so the index of ``y`` does
        not need to be 0..n-1.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct values.
        """
        y = pd.Series(y)
        ytype = np.sort(y.unique())
        if len(ytype) != 2:
            raise ValueError(
                'the response must have exactly 2 classes, '
                f'found {len(ytype)}'
            )
        newy = (y == ytype[1]).to_numpy().astype(float)
        return ytype, newy

    def predict(self):
        """Prompt for the test file and classify its rows.

        Returns
        -------
        pandas.DataFrame
            Columns ``'Actual Class'`` and ``'Predicted Class'``, indexed like
            the response column of the test file.
        """
        mod = self.mod
        ytype = self.ytype
        y_tst, X_tst = self.__input_test()

        # Score = initial probability + shrunken regression-tree corrections.
        y_prob = mod[0].predict_proba(X_tst)[:, 1]
        for stage in mod[1:]:
            y_prob = y_prob + self.learning_rate * stage.predict(X_tst)

        y_pred = np.where(y_prob >= 0.5, ytype[1], ytype[0])
        return pd.DataFrame({'Actual Class': y_tst, 'Predicted Class': y_pred})

    def confusion_matrix_accuracy(self):
        """Run ``predict`` and summarise it.

        Returns
        -------
        (pandas.DataFrame, float)
            The confusion matrix (rows = actual class, columns = predicted
            class, both ordered like ``ytype``) and the overall accuracy
            rounded to three decimals.
        """
        ytype = self.ytype
        result = self.predict()
        actual = result['Actual Class']
        predicted = result['Predicted Class']

        counts = [
            [int(((actual == a) & (predicted == p)).sum()) for p in ytype]
            for a in ytype
        ]
        C2 = pd.DataFrame(counts, index=ytype, columns=ytype)

        cells = C2.to_numpy()
        acc_tst = round(np.trace(cells) / cells.sum(), 3)
        return C2, acc_tst

    def print_results(self):
        """Prompt for an output file, then write and print the evaluation report.

        The report contains the confusion matrix and the overall accuracy.
        It is built in memory, so no temporary file is created.
        """
        out_name = input("Enter the output file name to export [(ex) result.txt] : ")
        C2, acc_tst = self.confusion_matrix_accuracy()

        # to_csv() without a path returns the text; splitlines() drops the
        # platform-specific line endings so each row gets exactly one '\n'.
        csv_lines = C2.to_csv(sep='\t').splitlines()
        row_labels = {0: 'Actual ', 1: 'Class '}

        lines = [
            'Confusion Matrix (Gradient Boosting) ',
            '---------------------------------',
            '\t\tPredicted Class ',
            '\t' + csv_lines[0],
        ]
        for k, row in enumerate(csv_lines[1:]):
            lines.append(row_labels.get(k, '') + '\t' + row)
        lines += [
            '',
            'Model Summary (Gradient Boosting) ',
            '---------------------------------',
            f'Overall accuracy = {acc_tst}',
        ]
        report = '\n'.join(lines) + '\n'

        with open(out_name, 'w') as f:
            f.write(report)

        print(report, end='')

    def _read_table(self, path):
        """Load ``path`` using the separator and header choice stored on self."""
        if self.header == "y":
            return pd.read_csv(path, sep=self.separator_fm)
        return pd.read_csv(path, sep=self.separator_fm, header=None)

    def _split_xy(self, data):
        """Split ``data`` into ``(y, X)`` according to ``header`` / ``res_pos``.

        With a header, ``res_pos`` is a column name; without one it is a
        1-based column position.

        Raises
        ------
        ValueError
            If a positional ``res_pos`` is outside ``1..number of columns``.
            Without this check, position 0 would select the last column as the
            response while leaving it among the predictors.
        """
        res_pos = self.res_pos
        if self.header == "y":
            return data[res_pos], data.drop(res_pos, axis=1)

        n_cols = data.shape[1]
        if not 1 <= res_pos <= n_cols:
            raise ValueError(
                f'response column position must be between 1 and {n_cols}, '
                f'got {res_pos}'
            )
        X_index = [i for i in range(n_cols) if i != res_pos - 1]
        return data.iloc[:, res_pos - 1], data.iloc[:, X_index]

    def __input_train(self):
        """Prompt for the training file and its layout; set ``self.X`` / ``self.y``.

        Also stores the file name, separator, header flag and response column
        so that the test file can be read with the same layout.
        """
        # input data file name
        self.train_data = input("Enter the name of training data file [(ex) pid.dat] : ")
        self.coding_fm = int(input("Select the data Seperator(1 = ' ' or 2 = ','): "))
        # 1 selects a blank; any other number falls back to a comma
        self.separator_fm = " " if self.coding_fm == 1 else ","

        # input header & assign response variables
        self.header = input("Does the data have column header? (y/n) : ")
        self.trdata = self._read_table(self.train_data)
        trdata = self.trdata
        if self.header == "y":
            self.res_pos = input(
                f'Enter the column name of response variable among {list(trdata.columns)} : ')
        else:
            self.res_pos = int(input(
                f'Enter the column position of the response variable : \n [from 1 to {trdata.shape[1]}] : '))

        self.y, self.X = self._split_xy(trdata)

    def __input_test(self):
        """Prompt for the test file and return ``(y_tst, X_tst)``.

        The file is read with the separator, header flag and response column
        that were chosen for the training data.
        """
        test_data = input("Enter the name of test data file [(ex) pidtest.dat] : ")
        tstdata = self._read_table(test_data)
        y_tst, X_tst = self._split_xy(tstdata)
        return y_tst, X_tst

In [3]:
# Create the booster; the constructor prompts for the number of boosting
# rounds and the learning rate.
gdboost = GdBoostClassifier()

Enter the number of Boosting [ex) 101] :  101
Enter the Learning rate [ex) 0.1] :  0.1


In [4]:
# Prompts for the training file, its separator, whether it has a header and
# which column holds the response, then fits the boosted trees.
gdboost.fit()

Enter the name of training data file [(ex) pid.dat] :  pid.dat
Select the data Seperator(1 = ' ' or 2 = ','):  2
Does the data have column header? (y/n) :  n
Enter the column position of the response variable : 
 [from 1 to 8] :  8


In [5]:
# Prompts for the output file name and the test data file, then writes the
# confusion matrix and overall accuracy to that file and prints them.
gdboost.print_results()

Enter the output file name to export [(ex) result.txt] :  result.txt
Enter the name of test data file [(ex) pidtest.dat] :  pidtest.dat


Confusion Matrix (Gradient Boosting) 

---------------------------------

		Predicted Class 



		1	2



Actual 	1	40	47



Class 	2	15	124





Model Summary (Gradient Boosting) 

---------------------------------

Overall accuracy = 0.726
