In [136]:
class TreeNode(object):
    """A single node of a decision tree, holding the samples that reached it.

    Parameters
    ----------
    X : array-like
        Feature rows that reached this node (stored as given).
    y : array-like
        Class labels of those rows (stored as given).
    childs : list or None
        Child nodes. ``None`` (the default) yields a fresh empty list for this
        node, so nodes never share one list object.
    info : object
        Arbitrary payload supplied by the caller; stored untouched.
    isLeaf : bool
        Whether the node is a leaf; stored as ``self.isLeaf``.
    split_feature, split_value :
        Description of the split applied at this node; stored untouched.

    Attributes
    ----------
    classes : numpy.ndarray
        Sorted distinct labels found in ``y``.
    class_ :
        The most frequent label in ``y`` (the first one in sorted order on a
        tie), or ``None`` when ``y`` is empty.
    """

    def __init__(self, X, y, childs=None, info=None, isLeaf=True, split_feature=None, split_value=None):
        self.X = X
        self.y = y
        self.split_feature = split_feature
        self.split_value = split_value
        # A literal [] default would be created once and shared by every node.
        self.childs = [] if childs is None else childs
        self.info = info
        self.isLeaf = isLeaf
        # return_counts gives the per-class frequencies in one pass and also
        # works when y is a plain Python list.
        self.classes, counts = np.unique(y, return_counts=True)
        self.class_ = self.classes[np.argmax(counts)] if counts.size else None

In [137]:
import numpy as np
class DecisionTree(object):
    """Binary decision-tree classifier grown greedily by impurity reduction.

    Parameters
    ----------
    eta, n_iter, random_state :
        Stored on the instance but not read by any method of this class; kept
        so that existing constructor calls continue to work.
    impurity_mode : str
        ``'gini'`` or ``'entropy'``; any other value selects the
        classification error.
    max_depth : int
        A node is only split while its depth is below this value.
    min_gain : float
        A node is only split when the best information gain exceeds this value.
    """

    def __init__(self, eta=0.01, n_iter=50, random_state=1, impurity_mode='gini', max_depth=5, min_gain=0.0):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state
        self.max_depth = max_depth
        self.min_gain = min_gain
        self.impurity_mode = impurity_mode

    @staticmethod
    def _class_probabilities(t):
        """Return the relative frequency of every distinct label in ``t``."""
        labels = np.asarray(t)
        if labels.size == 0:
            return np.empty(0, dtype=float)
        _, counts = np.unique(labels, return_counts=True)
        return counts / labels.size

    @staticmethod
    def _split_mask(X, feature, value):
        """Return a boolean mask selecting the rows that go to the first child.

        Numeric values (Python or numpy scalars, booleans excluded) act as a
        ``>=`` threshold; every other value is matched with ``==``.
        """
        column = X[:, feature]
        is_numeric = (isinstance(value, (int, float, np.integer, np.floating))
                      and not isinstance(value, bool))
        mask = (column >= value) if is_numeric else (column == value)
        # Some numpy versions answer an incomparable ``==`` with a scalar;
        # broadcasting guarantees one flag per row in every case.
        return np.broadcast_to(np.asarray(mask, dtype=bool), column.shape)

    def impurity(self, y):
        """Return the impurity of label set ``y`` using ``self.impurity_mode``."""
        if self.impurity_mode == 'gini':
            return self.gini(y)
        if self.impurity_mode == 'entropy':
            return self.entropy(y)
        return self.classification_error(y)

    def gini(self, t):
        """Gini impurity ``1 - sum(p_i ** 2)`` of labels ``t`` (0.0 if empty)."""
        probs = self._class_probabilities(t)
        if probs.size == 0:
            return 0.0
        return 1.0 - np.sum(probs ** 2)

    def entropy(self, t):
        """Shannon entropy in bits of labels ``t`` (0.0 if empty)."""
        probs = self._class_probabilities(t)
        if probs.size == 0:
            return 0.0
        # Every probability is > 0 because it comes from an observed label.
        # Subtracting from 0.0 avoids returning -0.0 for a pure set.
        return 0.0 - np.sum(probs * np.log2(probs))

    def classification_error(self, t):
        """Misclassification rate ``1 - max(p_i)`` of labels ``t`` (0.0 if empty)."""
        probs = self._class_probabilities(t)
        if probs.size == 0:
            return 0.0
        return 1.0 - np.max(probs)

    def split(self, X, y, feature, value):  # binary split
        """Partition the samples in two on ``feature`` compared with ``value``.

        Returns
        -------
        (childnode_X, childnode_y) : tuple of two 2-element lists
            Index 0 holds the rows (resp. labels) satisfying the test
            (``>= value`` for numeric values, ``== value`` otherwise) and
            index 1 holds the remaining ones, each as a plain Python list.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        mask = self._split_mask(X, feature, value)
        childnode_X = [list(X[mask]), list(X[~mask])]
        childnode_y = [list(y[mask]), list(y[~mask])]
        return childnode_X, childnode_y

    def build_decision_tree(self, X, y, depth=0):
        """Recursively grow the tree and return its root ``TreeNode``.

        Every (feature, observed value) pair is tried as a binary split and
        the one with the highest information gain is kept. The node is split
        when that gain exceeds ``self.min_gain`` and ``depth`` is below
        ``self.max_depth``; otherwise a leaf is returned.

        The node's ``info`` is the tuple
        ``(depth, impurity, (n_first_child, n_second_child), child_best)``
        where ``child_best`` is ``(childnode_X, childnode_y)`` for the best
        split, or ``None`` (with sizes ``(0, 0)``) when no split has a
        positive gain.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("cannot build a decision tree from an empty data set")

        impurity_current = self.impurity(y)
        gain_best, feature_best, value_best, mask_best = 0.0, None, None, None

        for feature in range(X.shape[1]):
            for value in np.unique(X[:, feature]):
                mask = self._split_mask(X, feature, value)
                n_first = int(np.count_nonzero(mask))
                # A split that sends every row to the same side cannot help.
                if n_first == 0 or n_first == n_samples:
                    continue
                p = n_first / n_samples
                info_gain = (impurity_current
                             - p * self.impurity(y[mask])
                             - (1 - p) * self.impurity(y[~mask]))
                if info_gain > gain_best:
                    gain_best, feature_best, value_best, mask_best = info_gain, feature, value, mask

        if mask_best is None:
            child_best, child_sizes = None, (0, 0)
        else:
            # Materialise the children of the BEST split, not the last one tried.
            child_best = ([X[mask_best], X[~mask_best]], [y[mask_best], y[~mask_best]])
            child_sizes = (len(child_best[1][0]), len(child_best[1][1]))

        info = (depth, impurity_current, child_sizes, child_best)

        if child_best is not None and gain_best > self.min_gain and depth < self.max_depth:
            childs = [
                self.build_decision_tree(child_best[0][i], child_best[1][i], depth + 1)
                for i in range(2)
            ]
            return TreeNode(X=X, y=y, childs=childs, info=info, isLeaf=False, split_feature=feature_best, split_value=value_best)
        return TreeNode(X=X, y=y, info=info, isLeaf=True)

In [138]:
import numpy as np
import pandas as pd  
# Directory holding the car data set CSV files. Edit this single constant to
# run the notebook on another machine.
DATA_DIR = "/home/jiarui/Documents/3314/COMP3314_a1/dataset_files"


def _read_csv_skip_bad(filename):
    """Read ``DATA_DIR/filename`` with a header row, dropping malformed lines.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``. The new keyword is tried first, and the old
    one is used only when the installed pandas rejects it.
    """
    path = DATA_DIR + "/" + filename
    try:
        # 'warn' matches the old error_bad_lines=False default: skip and warn.
        return pd.read_csv(path, header=0, on_bad_lines="warn")
    except TypeError:
        return pd.read_csv(path, header=0, error_bad_lines=False)


df_trainX = _read_csv_skip_bad("car_X_train.csv")
df_trainy = _read_csv_skip_bad("car_y_train.csv")
df_testX = _read_csv_skip_bad("car_X_test.csv")
df_testy = _read_csv_skip_bad("car_y_test.csv")

# Features keep every column; labels are taken from the first column only.
X_train = df_trainX.iloc[:].values
y_train = df_trainy.iloc[:, 0].values
X_test = df_testX.iloc[:].values
y_test = df_testy.iloc[:, 0].values

markers = ('o', 'x', 's', '^', 'v')
colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
labels = np.unique(y_train)

In [139]:
# Create a tree with the default hyper-parameters and, as a quick sanity
# check, display the Gini impurity of the full training label set.
dt = DecisionTree()
dt.gini(y_train)

0.45210959162772174

In [141]:
# Grow the decision tree on the training data and keep a handle on its root node.
root=dt.build_decision_tree(X_train, y_train)