In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  

In [2]:
class EnsembleClassifier:
    """Interactive ensemble of Linear Discriminant Analysis classifiers.

    Two ensemble strategies are supported, selected by ``method``:

    * ``'1'`` - bagging: every member is fitted on a bootstrap sample using
      all features.
    * ``'2'`` - random feature ensemble: every member is fitted on a bootstrap
      sample restricted to a random half of the features.

    Training/test files, separators and the response column are requested
    through ``input()`` prompts. Predictions are made by majority vote, and a
    permutation-based variable importance is computed on the out-of-bag rows
    of each bootstrap sample.

    Parameters
    ----------
    B : int or str, optional
        Number of bootstrap replicates. Prompted for when omitted.
    method : str, optional
        ``'1'`` (bagging) or ``'2'`` (random feature ensemble). Prompted for
        when omitted.
    seed : int, optional
        Seed for the random generator used for resampling, feature selection
        and permutation; ``None`` gives a non-reproducible run.

    Raises
    ------
    ValueError
        If ``B`` is not a positive integer or ``method`` is not '1' or '2'.
    """

    def __init__(self, B=None, method=None, seed=None):
        if B is None:
            B = input('Enter the number of Bootstrap[ex) 101] : ')
        self.B = int(B)
        if self.B < 1:
            raise ValueError(f'number of bootstrap replicates must be >= 1, got {self.B}')

        if method is None:
            method = input('Enter the name of method you want to use [1 : Bagging, 2: Random feature ensemble]')
        self.method = str(method).strip()
        if self.method not in ('1', '2'):
            # Previously any value other than '1' silently selected method 2.
            raise ValueError(f"method must be '1' or '2', got {method!r}")

        # Private generator so a seeded run is reproducible without touching
        # numpy's global random state.
        self.rng = np.random.default_rng(seed)

    # ------------------------------------------------------------------ API

    def fit(self):
        """Prompt for the training data and fit the selected ensemble.

        Sets ``X``, ``y``, ``ytype`` (sorted distinct class labels), ``model``
        and ``importance`` (plus ``subset_index`` for method '2').
        """
        self.__input_train()
        X = self.X
        y = self.y
        self.ytype = np.sort(y.unique())
        if self.method == '1':
            self.__fit_bagging(X, y)
        else:
            self.__fit_random_feature(X, y)

    def predict(self):
        """Prompt for the test data and classify it by majority vote.

        Returns
        -------
        pandas.DataFrame
            Columns 'Actual Class' and 'Predicted Class', one row per test row.
        """
        y_tst, X_tst = self.__input_test()

        if self.method == '1':
            y_pred = self.__predict_bagging(X_tst)
        else:
            y_pred = self.__predict_random_feature(X_tst)

        return pd.DataFrame({'Actual Class': y_tst, 'Predicted Class': y_pred})

    def confusion_matrix_accuracy(self):
        """Run ``predict()`` and summarise it.

        Returns
        -------
        (pandas.DataFrame, float)
            The confusion matrix (rows = actual, columns = predicted, both
            ordered like ``ytype``) and the overall accuracy rounded to three
            decimals. Accuracy is NaN when no test row has a known label.
        """
        ytype = self.ytype
        result = self.predict()
        actual = result['Actual Class'].to_numpy()
        predicted = result['Predicted Class'].to_numpy()

        k = len(ytype)
        counts = np.zeros((k, k), dtype=int)
        for i, a in enumerate(ytype):
            for j, p in enumerate(ytype):
                counts[i, j] = np.count_nonzero((actual == a) & (predicted == p))

        C2 = pd.DataFrame(counts, index=ytype, columns=ytype)
        total = counts.sum()
        acc_tst = round(np.trace(counts) / total, 3) if total else float('nan')

        return C2, acc_tst

    def print_results(self):
        """Prompt for an output file, write the report to it and echo it.

        The report holds the variable importance, the confusion matrix and
        the overall test accuracy. Calling this triggers ``predict()`` and
        therefore also prompts for the test file.
        """
        out_name = input("Enter the output file name to export [(ex) result.txt] : ")
        C2, acc_tst = self.confusion_matrix_accuracy()

        # Tab-separated confusion matrix built in memory; no temporary file.
        table = ['\tPredicted Class ',
                 '\t' + '\t'.join(str(c) for c in C2.columns)]
        for label, row in zip(C2.index, C2.to_numpy()):
            table.append(f'{label}\t' + '\t'.join(str(v) for v in row))
        # The first two data rows carry the "Actual" / "Class" side caption.
        side_caption = {2: 'Actual \t', 3: 'Class \t'}
        table = [side_caption.get(i, '\t') + line for i, line in enumerate(table)]

        title = 'LDA - bagging' if self.method == '1' else 'LDA - random feature'

        report = ['Variable Importance ']
        report += [f'\tx{i+1} : {np.round(self.importance[i],3)} '
                   for i in range(len(self.X.columns))]
        report += ['', '']
        report += [f'Confusion Matrix ({title}) ',
                   '---------------------------------']
        report += table
        report += ['',
                   f'Model Summary ({title}) ',
                   '---------------------------------',
                   f'Overall accuracy = {acc_tst}']
        text = '\n'.join(report)

        with open(out_name, 'w') as f:
            f.write(text)
        print(text)

    # ------------------------------------------------------------ data input

    def _split_response(self, data):
        """Split ``data`` into ``(X, y)`` using ``self.header``/``self.res_pos``.

        With a header, ``res_pos`` is the response column name; without one it
        is the 1-based response column position.

        Raises
        ------
        IndexError
            If a positional ``res_pos`` is outside ``1..n_columns``. Position 0
            used to wrap around to the last column and leave the response in X.
        """
        if self.header == "y":
            return data.drop(self.res_pos, axis=1), data[self.res_pos]

        col = self.res_pos - 1
        if not 0 <= col < data.shape[1]:
            raise IndexError(
                f'response column position {self.res_pos} is outside 1..{data.shape[1]}')
        return data.drop(data.columns[col], axis=1), data.iloc[:, col]

    def __input_train(self):
        """Prompt for the training file and its layout; store ``X`` and ``y``."""
        train_data = input("Enter the name of training data file [(ex) veh.dat] : ")
        coding_fm = int(input("Select the data Seperator(1 = ' ' or 2 = ','): "))
        separator_fm = " " if coding_fm == 1 else ","
        header = input("Does the data have column header? (y/n) : ").strip().lower()

        self.train_data = train_data
        self.coding_fm = coding_fm
        self.separator_fm = separator_fm
        self.header = header

        if header == "y":
            trdata = pd.read_csv(train_data, sep=separator_fm)
            res_pos = input(f'Enter the column name of response variable among {list(trdata.columns)} : ')
        else:
            trdata = pd.read_csv(train_data, sep=separator_fm, header=None)
            res_pos = int(input(f'Enter the column position of the response variable : \n [from 1 to {trdata.shape[1]}] : '))

        self.trdata = trdata
        self.res_pos = res_pos
        self.X, self.y = self._split_response(trdata)

    def __input_test(self):
        """Prompt for the test file and read it with the training layout.

        Returns
        -------
        (pandas.Series, pandas.DataFrame)
            ``(y_tst, X_tst)`` - note the response comes first.
        """
        test_data = input("Enter the name of test data file [(ex) vehtest.dat] : ")

        if self.header == "y":
            tstdata = pd.read_csv(test_data, sep=self.separator_fm)
        else:
            tstdata = pd.read_csv(test_data, sep=self.separator_fm, header=None)

        X_tst, y_tst = self._split_response(tstdata)
        return y_tst, X_tst

    # ------------------------------------------------------------- resampling

    def __sample_with_replacement(self, X, y):
        """Draw one bootstrap sample and its out-of-bag complement.

        Rows are selected by position, so any column labels and any response
        position are supported and the column dtypes are preserved.

        Returns
        -------
        (samp_X, samp_y, oob_X, oob_y)
            All four are re-indexed from 0.
        """
        n = len(X)
        bagidx = self.rng.choice(n, n, replace=True)

        samp_X = X.iloc[bagidx].reset_index(drop=True)
        samp_y = y.iloc[bagidx].reset_index(drop=True)

        # Out-of-bag rows are those never drawn.
        out_of_bag = np.ones(n, dtype=bool)
        out_of_bag[bagidx] = False
        oob_X = X.iloc[out_of_bag].reset_index(drop=True)
        oob_y = y.iloc[out_of_bag].reset_index(drop=True)
        return samp_X, samp_y, oob_X, oob_y

    def _permutation_diffs(self, mod, oob_X, oob_y, feature_idx):
        """Increase in out-of-bag error when each used feature is permuted.

        ``mod`` must have been fitted on the columns of ``feature_idx``
        (positions into ``oob_X``), in that order. Returns one value per entry
        of ``feature_idx``; all NaN when there are no out-of-bag rows.
        """
        feature_idx = list(feature_idx)
        if len(oob_y) == 0:
            return np.full(len(feature_idx), np.nan)

        truth = oob_y.to_numpy()
        base = oob_X.iloc[:, feature_idx]
        oob_error = np.mean(mod.predict(base) != truth)

        diffs = np.empty(len(feature_idx))
        for j in range(len(feature_idx)):
            shuffled = base.copy()
            shuffled.iloc[:, j] = self.rng.permutation(base.iloc[:, j].to_numpy())
            diffs[j] = np.mean(mod.predict(shuffled) != truth) - oob_error
        return diffs

    @staticmethod
    def _importance_from_diffs(diff):
        """Column-wise mean / sample standard deviation, ignoring NaN entries.

        ``diff`` has one row per replicate and one column per feature. Columns
        with fewer than two observed values yield NaN.
        """
        diff = np.asarray(diff, dtype=float)
        observed = ~np.isnan(diff)
        n_obs = observed.sum(axis=0)
        with np.errstate(divide='ignore', invalid='ignore'):
            mean = np.where(observed, diff, 0.0).sum(axis=0) / n_obs
            sq_dev = np.where(observed, (diff - mean) ** 2, 0.0).sum(axis=0)
            sd = np.sqrt(sq_dev / (n_obs - 1))
            importance = mean / sd
        importance[n_obs < 2] = np.nan
        return importance

    def _majority_vote(self, member_predictions, n_rows):
        """Return, per row, the label of ``ytype`` predicted most often.

        Ties go to the label that comes first in ``ytype``.
        """
        ytype = np.asarray(self.ytype)
        votes = np.zeros((n_rows, len(ytype)), dtype=int)
        for res in member_predictions:
            res = np.asarray(res)
            for k, label in enumerate(ytype):
                votes[:, k] += (res == label)
        return ytype[votes.argmax(axis=1)]

    # ---------------------------------------------------------------- bagging

    def __fit_bagging(self, X, y):
        """Fit ``B`` LDA models on bootstrap samples using every feature."""
        n_features = X.shape[1]
        model = []
        diff = np.full((self.B, n_features), np.nan)
        for b in range(self.B):
            samp_X, samp_y, oob_X, oob_y = self.__sample_with_replacement(X, y)
            mod = LinearDiscriminantAnalysis().fit(samp_X, samp_y)
            model.append(mod)
            diff[b] = self._permutation_diffs(mod, oob_X, oob_y, range(n_features))

        self.model = model
        self.importance = self._importance_from_diffs(diff)

    def __predict_bagging(self, X_tst):
        """Majority vote of all bagged models on ``X_tst``."""
        member_predictions = (mod.predict(X_tst) for mod in self.model)
        return self._majority_vote(member_predictions, len(X_tst))

    # --------------------------------------------------------- random feature

    def __fit_random_feature(self, X, y):
        """Fit ``B`` LDA models, each on a bootstrap sample and half the features.

        The chosen column positions are kept in ``subset_index`` so prediction
        can feed each model the same columns.
        """
        n_features = X.shape[1]
        m = max(1, n_features // 2)  # never fit on zero features
        model = []
        subset_index = []
        diff = np.full((self.B, n_features), np.nan)
        for b in range(self.B):
            samp_X, samp_y, oob_X, oob_y = self.__sample_with_replacement(X, y)
            ss_idx = self.rng.choice(n_features, m, replace=False)
            subset_index.append(ss_idx)
            mod = LinearDiscriminantAnalysis().fit(samp_X.iloc[:, ss_idx], samp_y)
            model.append(mod)
            # Only the features this member used get an importance observation.
            diff[b, ss_idx] = self._permutation_diffs(mod, oob_X, oob_y, ss_idx)

        self.model = model
        self.subset_index = subset_index
        self.importance = self._importance_from_diffs(diff)

    def __predict_random_feature(self, X_tst):
        """Majority vote of all models, each seeing its own feature subset."""
        member_predictions = (
            mod.predict(X_tst.iloc[:, list(cols)])
            for mod, cols in zip(self.model, self.subset_index)
        )
        return self._majority_vote(member_predictions, len(X_tst))

In [3]:
# Create the first ensemble. The constructor prompts for the number of
# bootstrap replicates and the method; '1' (bagging) is entered below.
bagging = EnsembleClassifier()

Enter the number of Bootstrap[ex) 101] :  101
Enter the name of method you want to use [1 : Bagging, 2: Random feature ensemble] 1


In [4]:
# Prompts for the training file, its separator, whether it has a header and
# the response column, then fits the ensemble.
bagging.fit()

Enter the name of training data file [(ex) veh.dat] :  veh.dat
Select the data Seperator(1 = ' ' or 2 = ','):  2
Does the data have column header? (y/n) :  n
Enter the column position of the response variable : 
 [from 1 to 19] :  19


In [5]:
# Prompts for the output file and the test file, then writes and echoes the
# variable importance, confusion matrix and overall accuracy.
bagging.print_results()

Enter the output file name to export [(ex) result.txt] :  result.txt
Enter the name of test data file [(ex) vehtest.dat] :  vehtest.dat


Variable Importance 

	x1 : 2.968 

	x2 : 5.361 

	x3 : 4.227 

	x4 : 8.946 

	x5 : 6.13 

	x6 : 1.237 

	x7 : 5.265 

	x8 : 4.298 

	x9 : 1.695 

	x10 : 5.639 

	x11 : 1.758 

	x12 : 4.494 

	x13 : 1.667 

	x14 : 2.003 

	x15 : 0.206 

	x16 : 1.272 

	x17 : 5.109 

	x18 : 4.514 





Confusion Matrix (LDA - bagging) 

---------------------------------

		Predicted Class 



		1	2	3	4



Actual 	1	49	29	4	4



Class 	2	24	45	9	7



	3	0	0	85	1



	4	0	0	2	77





Model Summary (LDA - bagging) 

---------------------------------

Overall accuracy = 0.762


In [6]:
# Create a second ensemble; '2' (random feature ensemble) is entered at the
# method prompt below.
rfe = EnsembleClassifier()

Enter the number of Bootstrap[ex) 101] :  101
Enter the name of method you want to use [1 : Bagging, 2: Random feature ensemble] 2


In [7]:
# Same training prompts as for the bagging run; fits the random-feature ensemble.
rfe.fit()

Enter the name of training data file [(ex) veh.dat] :  veh.dat
Select the data Seperator(1 = ' ' or 2 = ','):  2
Does the data have column header? (y/n) :  n
Enter the column position of the response variable : 
 [from 1 to 19] :  19


In [8]:
# Prompts for the output file and the test file, then writes and echoes the
# report for the random-feature ensemble.
rfe.print_results()

Enter the output file name to export [(ex) result.txt] :  result.txt
Enter the name of test data file [(ex) vehtest.dat] :  vehtest.dat


Variable Importance 

	x1 : 2.544 

	x2 : 2.024 

	x3 : 3.315 

	x4 : 1.078 

	x5 : 1.181 

	x6 : 1.115 

	x7 : 2.432 

	x8 : 4.81 

	x9 : 2.085 

	x10 : 3.116 

	x11 : 2.254 

	x12 : 2.387 

	x13 : 1.321 

	x14 : 2.104 

	x15 : 0.48 

	x16 : 0.572 

	x17 : 1.759 

	x18 : 1.611 





Confusion Matrix (LDA - random feature) 

---------------------------------

		Predicted Class 



		1	2	3	4



Actual 	1	42	33	5	6



Class 	2	23	42	13	7



	3	0	0	83	3



	4	0	0	3	76





Model Summary (LDA - random feature) 

---------------------------------

Overall accuracy = 0.723
