In [1]:
import pandas as pd
import numpy as np
import math
from numpy.linalg import inv, det
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import scipy.stats as stats
import itertools

In [2]:
# Read the training and test sets; neither file has a header row, so pandas
# assigns integer column labels.
tr_data, test_data = (
    pd.read_csv(data_path, sep=",", header=None)
    for data_path in ('data/veh.dat', 'data/vehtest.dat')
)

In [3]:
class BaggingEnsembler():
    """Bagged linear discriminant analysis (LDA) classifier.

    Both data frames are expected to hold the predictors in every column but
    the last, and the class label in the last column.  Class labels must be
    the integers ``1..n_class``, because per-class statistics are looked up
    with ``range(1, n_class + 1)``.
    """

    def __init__(self, tr_data, test_data, seed=2000):
        """Store the data sets and seed the bootstrap random number generator.

        Parameters
        ----------
        tr_data : pandas.DataFrame
            Training observations (label in the last column).
        test_data : pandas.DataFrame
            Test observations (label in the last column).
        seed : int or None
            Seed of the generator that draws the bootstrap samples.  Two
            instances built with the same seed draw the same samples.
        """
        self.train_data = tr_data
        self.test_data = test_data
        self.response_column = len(self.train_data.columns) - 1
        self.n_class = len(self.train_data.iloc[:, self.response_column].value_counts())
        self.seed = seed
        # One generator per instance: successive bootstrap() calls advance it,
        # so the resamples differ from each other but the whole sequence is
        # reproducible from `seed`.
        self._rng = np.random.RandomState(seed)

    def bootstrap(self):
        """Return a resample (with replacement) of the training data.

        The sample has as many rows as the training set and is sorted by the
        original index, so repeated rows sit next to each other.
        """
        num_obs = len(self.train_data)
        bs_train = self.train_data.sample(n=num_obs, replace=True, random_state=self._rng)
        return bs_train.sort_index()

    @staticmethod
    def generate_components(input):
        """Split a data set into predictors/labels and compute class statistics.

        Parameters
        ----------
        input : pandas.DataFrame
            Predictors in all columns but the last, labels ``1..n_class`` in
            the last column.

        Returns
        -------
        tuple
            ``(dat_pred, dat_label, mean_vec, cov_vec, freq_vec, n_class)``:
            predictor frame, one-column label frame named ``'class'``, list of
            per-class mean Series, list of per-class covariance frames, array
            of per-class row counts, and the number of distinct labels.
        """
        dat_label = pd.DataFrame(input.iloc[:, -1]).copy()
        dat_label.columns = ['class']
        dat_pred = pd.DataFrame(input.iloc[:, :-1]).copy()
        n_class = dat_label['class'].nunique()
        categorized = [dat_pred[dat_label['class'] == label] for label in range(1, n_class + 1)]
        freq_vec = np.array([len(group) for group in categorized])
        mean_vec = [group.mean() for group in categorized]
        cov_vec = [group.cov() for group in categorized]
        return dat_pred, dat_label, mean_vec, cov_vec, freq_vec, n_class

    @staticmethod
    def run_classification(input_pred, mean, cov, freq, n_class):
        """Classify rows with LDA using a pooled covariance matrix.

        Parameters
        ----------
        input_pred : pandas.DataFrame
            Predictor rows to classify.
        mean, cov : list
            Per-class mean Series and covariance frames.
        freq : numpy.ndarray
            Per-class training counts; also used to derive the priors.
        n_class : int
            Number of classes.

        Returns
        -------
        tuple
            ``(pred_res, res_)``: a one-column frame ``'pred_label'`` holding
            the 1-based label with the largest score, and the frame of scores
            ``prior * exp(-0.5 * d' S^-1 d)`` (one column per class, default
            integer index).
        """
        n_total = sum(freq)
        prior_vec = freq / n_total
        cov_lda = sum([(freq[i] - 1) * cov[i] / (n_total - n_class) for i in range(n_class)])
        # The pooled covariance is shared by every row and class: invert once.
        precision = inv(cov_lda)
        class_scores = []
        for i in range(n_class):
            # DataFrame - Series aligns on column labels before going to numpy.
            dev = np.asarray(input_pred - mean[i], dtype=float)
            # Row-wise quadratic form d' * precision * d.
            quad = np.einsum('ij,jk,ik->i', dev, precision, dev)
            class_scores.append(np.exp(-0.5 * quad) * prior_vec[i])
        res_ = pd.DataFrame(np.column_stack(class_scores))
        # Score columns are 0-based positions; class labels are 1-based.
        pred_res = (res_.idxmax(axis=1) + 1).to_frame('pred_label')
        return pred_res, res_

    def bagging(self, num=51):
        """Predict test labels by majority vote over ``num`` bootstrap LDA fits.

        Parameters
        ----------
        num : int
            Number of bootstrap models; must be at least 1.

        Returns
        -------
        pandas.Series
            For every test row, the label predicted by most models.

        Raises
        ------
        ValueError
            If ``num`` is smaller than 1.
        """
        if num < 1:
            raise ValueError("num must be at least 1")
        # The test predictors do not change between resamples.
        test_pred = self.test_data.iloc[:, :-1]
        res_list = []
        for _ in range(num):
            new_train = self.bootstrap()
            _, _, mean_vec, cov_vec, freq_vec, n_class = self.generate_components(new_train)
            predicted_label_test, _ = self.run_classification(test_pred, mean_vec, cov_vec, freq_vec, n_class)
            res_list.append(predicted_label_test)
        bag_result = pd.concat(res_list, axis=1)
        count_res = bag_result.apply(lambda x: x.value_counts().idxmax(), axis=1)
        return count_res

In [4]:
def accuracy(model_res, label):
    """Return the fraction of entries in `model_res` that equal `label`.

    The two inputs are compared element-wise with ``==``; the number of
    matches is divided by ``len(label)``.
    """
    is_match = model_res == label
    n_match = sum(is_match)
    return n_match / len(label)

def get_confmat(pred_res, true, n_class, method='4'):
    """Build a confusion matrix with predicted classes as rows.

    Parameters
    ----------
    pred_res : pandas.Series
        Predicted labels.
    true : pandas.Series
        Actual labels, aligned with `pred_res`.
    n_class : int
        Number of classes (ignored when ``method == '6'``).
    method : str
        Selects the label set that is counted:
        ``'4'`` -> ``0..n_class-1``; ``'6'`` -> ``"No"``/``"Yes"``;
        anything else -> ``1..n_class``.

    Returns
    -------
    pandas.DataFrame
        Counts with rows named 'Predicted Class' and columns named
        'Actual Class'.  Both axes are labelled ``1..k`` by position,
        whichever label set was counted.
    """
    if method == '6':
        class_label = ["No", "Yes"]
    elif method == '4':
        class_label = list(range(0, n_class))
    else:
        class_label = list(range(1, n_class + 1))

    counts = [
        [sum(pred_res[true == actual] == predicted) for actual in class_label]
        for predicted in class_label
    ]
    conf_mat = pd.DataFrame(counts)
    # Shift the positional 0-based axis labels to 1-based for however many
    # classes there are (a fixed four-entry mapping would leave a fifth class
    # labelled 4, colliding with the renamed fourth one).
    one_based = {pos: pos + 1 for pos in range(len(class_label))}
    conf_mat = conf_mat.rename(columns=one_based, index=one_based)
    conf_mat.index.name = 'Predicted Class'
    conf_mat.columns.name = 'Actual Class'
    return conf_mat

In [7]:
BE = BaggingEnsembler(tr_data, test_data)
# no bagging result
tr_pred, tr_label, mean_vec, cov_vec, freq_vec, n_class = BE.generate_components(tr_data)
test_pred, test_label, _, _, _, _ = BE.generate_components(test_data)
predicted_label_test, _ = BE.run_classification(test_pred, mean_vec, cov_vec, freq_vec, n_class)
# bagging result
pred_res = BE.bagging(num=51)

# method='7' falls through to get_confmat's 1..n_class label branch.
no_bag_mat = get_confmat(predicted_label_test['pred_label'], test_label['class'], BE.n_class, method='7')
bag_mat = get_confmat(pred_res, test_label['class'], BE.n_class, method='7')


def _write_section(outf, text, predicted, actual, conf_mat, n_preview=5):
    """Write one model's report section to the open text file `outf`.

    `text` supplies the section-specific heading/rule strings, `predicted`
    and `actual` are label Series of equal length, and `conf_mat` is the
    confusion matrix to print.  At most `n_preview` leading rows are listed.
    """
    outf.write(text['heading'])
    outf.write(text['table_header'])
    outf.write(text['table_rule'])
    # Positional access, capped so a short test set does not raise.
    for row in range(min(n_preview, len(actual))):
        outf.write("{}, {}, {}\n".format(row, actual.iloc[row], predicted.iloc[row]))
    outf.write("(continue)\n\n")
    outf.write(text['confmat_title'])
    outf.write(text['confmat_rule'])
    outf.write(str(conf_mat))
    outf.write("\n")
    outf.write("\n")
    outf.write(text['summary_title'])
    outf.write(text['summary_rule'])
    outf.write("Overall accuracy: {:.3f}\n".format(accuracy(predicted, actual)))


o_file = 'ace.txt'
with open(o_file, 'w') as outf:
    _write_section(
        outf,
        {
            'heading': "(1) LDA - no bagging\n",
            'table_header': "ID, Actual class, LDA-nobagging pred\n",
            'table_rule': "====================================\n",
            'confmat_title': "Confusion Matrix (LDA - no bagging)\n",
            'confmat_rule': "===================================\n",
            'summary_title': "Model Summary (LDA - no bagging)\n",
            'summary_rule': "================================\n",
        },
        predicted_label_test['pred_label'],
        test_label['class'],
        no_bag_mat,
    )
    _write_section(
        outf,
        {
            'heading': "\n(2) LDA - bagging\n",
            'table_header': "ID, Actual class, LDA-bagging pred\n",
            'table_rule': "==================================\n",
            'confmat_title': "Confusion Matrix (LDA - bagging)\n",
            'confmat_rule': "===================================\n",
            'summary_title': "Model Summary (LDA - bagging)\n",
            'summary_rule': "================================\n",
        },
        pred_res,
        test_label['class'],
        bag_mat,
    )