In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# Load the dataset: every column except the last is a feature, and the
# last column is the class label.
# NOTE(review): assumes diabetes.csv has a header row and only numeric
# columns -- confirm against the actual file.
df = pd.read_csv("diabetes.csv", delimiter=",")
data = df.to_numpy()
X = data[:, :-1]
y = data[:, -1]

In [None]:
class Node:
    """A single node of a decision tree.

    Args:
        ids: indices of the training samples that reach this node.
        children: list of child nodes; a fresh empty list is created
            when omitted so that nodes never share one list object.
        entropy: entropy of the labels at this node.
        depth: distance from the root node.
    """

    def __init__(self, ids=None, children=None, entropy=0, depth=0):
        self.ids = ids           # index of data in this node
        self.entropy = entropy   # entropy, will fill later
        self.depth = depth       # distance to root node
        self.split_attribute = None  # which attribute is chosen, if non-leaf
        # A `[]` default would be evaluated once and shared by every node
        # built without an explicit list, so use a None sentinel instead.
        self.children = [] if children is None else children
        self.order = None       # how split_attribute values map to children
        self.label = None       # label of node if it is a leaf

    def set_properties(self, split_attribute, order):
        """Record the attribute this node splits on and its child ordering."""
        self.split_attribute = split_attribute
        self.order = order

    def set_label(self, label):
        """Set the label assigned to this node when it is a leaf."""
        self.label = label

In [None]:
class Tree:
    """Binary decision tree classifier for numeric features.

    Each internal node splits its samples on one attribute with a
    threshold (``x <= threshold`` goes to the first child, ``x > threshold``
    to the second). The attribute/threshold pair is chosen to maximise
    information gain, i.e. to minimise the weighted entropy of the children.

    Args:
        max_depth: nodes at this depth are never split further.
        min_gain: minimum information gain (in nats) required to split.
    """

    def __init__(self, max_depth=10, min_gain=1e-4):
        self.root = None
        self.max_depth = max_depth
        self.Ntrain = 0
        self.min_gain = min_gain

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), numeric.
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if X is empty or not 2-D, or if X and y disagree
                on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.Ntrain = X.shape[0]
        self.X = X
        self.attributes = list(range(X.shape[1]))
        self.y = y
        self.labels = np.unique(y)

        ids = np.arange(self.Ntrain)
        self.root = Node(ids=ids, entropy=self._entropy(ids), depth=0)
        queue = [self.root]
        while queue:
            node = queue.pop()
            # The gain of any split is bounded by the node's entropy, so a
            # node whose entropy is already below min_gain cannot be split.
            if node.depth < self.max_depth and node.entropy >= self.min_gain:
                node.children = self._split(node)
                if not node.children:  # no useful split found: leaf node
                    self._set_label(node)
                queue += node.children
            else:
                self._set_label(node)

    def predict(self, X):
        """Return the predicted label for every row of X.

        Args:
            X: 2-D array-like with the same feature columns used in fit.

        Raises:
            RuntimeError: if called before fit.
        """
        if self.root is None:
            raise RuntimeError("fit must be called before predict")
        predictions = []
        for row in np.asarray(X):
            node = self.root
            while node.children:
                # node.order holds the split threshold stored by _split.
                if row[node.split_attribute] <= node.order:
                    node = node.children[0]
                else:
                    node = node.children[1]
            predictions.append(node.label)
        return np.array(predictions)

    def _entropy(self, ids):
        """Return the entropy (natural log) of the labels of samples ids."""
        if len(ids) == 0:
            return 0.0
        _, cnt = np.unique(self.y[ids], return_counts=True)
        ratio = cnt / cnt.sum()
        # max() normalises the -0.0 produced for a pure node to 0.0.
        return max(0.0, float(-np.sum(ratio * np.log(ratio))))

    def _set_label(self, node):
        """Label a leaf node with the most frequent class among its samples."""
        uni, cnt = np.unique(self.y[node.ids], return_counts=True)
        # argmax returns the first maximum, so ties go to the smallest label.
        node.set_label(uni[np.argmax(cnt)])

    def _best_threshold(self, values, ids):
        """Find the threshold on values that minimises weighted child entropy.

        Args:
            values: attribute value of each sample in ids (same order).
            ids: indices into the training labels of those samples.

        Returns:
            (weighted_entropy, threshold, [left_ids, right_ids]), or
            (None, None, []) when all values are equal and no split exists.
        """
        values = np.asarray(values)
        ids = np.asarray(ids)
        best_hxs, best_threshold, best_splits = None, None, []
        # The largest value is skipped: it would leave the right side empty.
        for threshold in np.unique(values)[:-1]:
            mask = values <= threshold
            left, right = ids[mask], ids[~mask]
            hxs = (left.size * self._entropy(left)
                   + right.size * self._entropy(right)) / ids.size
            if best_hxs is None or hxs < best_hxs:
                best_hxs, best_threshold, best_splits = hxs, threshold, [left, right]
        return best_hxs, best_threshold, best_splits

    def _HxS(self, attribute, ids=None):
        """Return the lowest weighted child entropy over threshold splits.

        Args:
            attribute: attribute values of the samples being split.
            ids: training indices of those samples; defaults to
                0..len(attribute)-1.

        Returns:
            The minimum weighted entropy, or the unsplit entropy of ids
            when every value is identical.
        """
        attribute = np.asarray(attribute)
        ids = np.arange(len(attribute)) if ids is None else np.asarray(ids)
        hxs, _, _ = self._best_threshold(attribute, ids)
        return self._entropy(ids) if hxs is None else hxs

    def _split(self, node):
        """Split node on the attribute/threshold with the highest gain.

        Stores the chosen attribute and threshold on the node via
        set_properties and returns the two child nodes, or an empty list
        when no split reaches min_gain.
        """
        ids = np.asarray(node.ids)
        best_gain = 0
        best_splits = []
        best_attribute = None
        order = None
        node_X = self.X[ids]
        for att in self.attributes:
            hxs, threshold, splits = self._best_threshold(node_X[:, att], ids)
            if hxs is None:
                continue  # attribute is constant within this node
            gain = node.entropy - hxs
            if gain < self.min_gain:
                continue  # skip attributes with too small a gain
            if gain > best_gain:
                best_gain = gain
                best_splits = splits
                best_attribute = att
                order = threshold
        node.set_properties(best_attribute, order)
        child_nodes = [
            Node(ids=split, entropy=self._entropy(split), depth=node.depth + 1)
            for split in best_splits
        ]
        return child_nodes