In [21]:
import numpy as np
import math

$$ \text{Gini: } Gini(E) = 1 - \sum_{j=1}^{c} p_j^2 $$
$$ \text{Entropy: } H(E) = -\sum_{j=1}^{c} p_j \log_2 p_j $$

In [23]:
def calculate_gini(y):
    """Return the Gini impurity of the labels in *y*.

    Computes ``1 - sum(p_j ** 2)`` where ``p_j`` is the fraction of
    entries in ``y`` that carry the j-th distinct label.

    Args:
        y: Array-like of class labels (list, tuple or NumPy array). All
            entries are treated as one flat collection of labels.

    Returns:
        The impurity as a Python float: 0.0 for an empty or single-class
        input, approaching 1.0 as labels spread over more classes.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means nothing to be impure about; also avoids 0/0.
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / labels.size
    return float(1.0 - np.sum(proportions * proportions))

In [22]:
def calculate_entropy(y):
    """Return the base-2 Shannon entropy of the labels in *y*.

    Computes ``-sum(p_j * log2(p_j))`` over the distinct labels in ``y``,
    where ``p_j`` is the number of entries equal to the j-th label divided
    by ``len(y)``.

    Args:
        y: Array-like of class labels. Plain Python sequences are accepted
            as well as NumPy arrays.

    Returns:
        The entropy in bits as a Python float; 0.0 for empty input.
    """
    labels = np.asarray(y)
    n_samples = len(labels)
    if n_samples == 0:
        return 0.0
    # return_counts gives every label's frequency in a single pass.
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / n_samples
    # "+ 0.0" normalises a possible -0.0 for single-class input.
    return float(-np.sum(proportions * np.log2(proportions)) + 0.0)

In [17]:
class DecisionNode:
    """A single node of a decision tree.

    The node stores a split rule (``feature_i`` and ``threshold``), a
    ``value`` used when the node is a leaf, and references to its two
    subtrees. Every field defaults to ``None``.
    """

    def __init__(self, feature_i=None, threshold=None, value=None, left_branch=None, right_branch=None):
        # Split rule: index of the tested feature and the threshold for it.
        self.feature_i, self.threshold = feature_i, threshold
        # Value carried by the node when it is a leaf of the tree.
        self.value = value
        # Child subtrees: left first, then right.
        self.left_branch, self.right_branch = left_branch, right_branch

In [24]:
class DecisionTree:
    def __init__(self, min_samples_split=2, min_impurity=1e-7, max_depth=None, loss=None):
        self.root = None
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth if max_depth else float('inf')
        self._impurity_calculation = None
        self._leaf_value_calculation = None
        self.one_dim = None
        self.loss = loss
    
    def fit(self, X, y, loss):
        self.one_dim = len(np.shape(y)) == 1
        self.root = self._build_tree(X, y)
        self.loss=None
        
    def _build_tree(self, X, y, current_depth=0):
        largest_impurity = 0
        best_criteria = None
        best_sets = None
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)
        Xy = np.concatenate((X, y), axis=1)
        
        n_samples, n_features = np.shape(X)
        
        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            for feature_i in range(n_features):
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)
                
                for threshold in unique_values:
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
                    if len(Xy1) > 0 and len(Xy2) > 0:
                        y1 = Xy1[:, n_features:]
                        y2 = Xy2[:, n_features:]
                        
                        impurity = self._impurity_calculation(y, y1, y2)
                        
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {"feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "leftX": Xy1[:, :n_features],   # X of left subtree
                                "lefty": Xy1[:, n_features:],   # y of left subtree
                                "rightX": Xy2[:, :n_features],  # X of right subtree
                                "righty": Xy2[:, n_features:]   # y of right subtree
                                }