In [159]:
import numpy as np



In [178]:
def gini(X, class_values):
    """Score a candidate split by summing Gini impurity terms over its groups.

    For every distinct class ``c`` and every non-empty group, the fraction
    ``p`` of rows in that group labelled ``c`` contributes ``p * (1 - p)``.
    The per-group terms are added without weighting by group size.

    Parameters
    ----------
    X : sequence of groups; each group is a sequence of rows whose last
        element is the class label.
    class_values : iterable of class labels. It may contain repeats (for
        example one entry per training row); each distinct label is
        counted exactly once.

    Returns
    -------
    float
        0.0 when every non-empty group is pure; larger values mean more
        mixing.
    """
    # Deduplicate while keeping first-seen order, so repeated labels neither
    # inflate the score nor weight classes by how often they were listed.
    # Equality (not hashing) is used, matching the list.count() call below.
    distinct_classes = []
    for label in class_values:
        if label not in distinct_classes:
            distinct_classes.append(label)

    # Pull the label column out of each non-empty group once, instead of
    # rebuilding it for every class. Empty groups contribute nothing.
    labels_per_group = [[row[-1] for row in group]
                        for group in X if len(group) > 0]

    score = 0.0
    for label in distinct_classes:
        for labels in labels_per_group:
            # float() keeps this a true division on Python 2 as well.
            portion = labels.count(label) / float(len(labels))
            score += portion * (1 - portion)
    return score


#split a given data set at index, "< threshold" on left
def split(X, index, threshold):
    """Partition the rows of ``X`` on one feature.

    Rows whose value at position ``index`` is strictly less than
    ``threshold`` go to the first group; all other rows go to the second.
    Row order within each group follows the order in ``X``.

    Returns
    -------
    list
        ``[left_rows, right_rows]``; either list may be empty.
    """
    below, at_or_above = [], []
    for sample in X:
        target = below if sample[index] < threshold else at_or_above
        target.append(sample)
    return [below, at_or_above]


# Find the best split of the dataset and return it as a node dict ('index', 'threshold', 'gini', 'groups')
def get_split_dataset(X):
    """Find the split of ``X`` with the lowest Gini score.

    The last element of every row is treated as the class label and all
    earlier elements as features. Every value that occurs in a feature
    column is tried as a threshold for that column; the first candidate
    with the strictly lowest score wins.

    Parameters
    ----------
    X : non-empty sequence of rows (lists, tuples or 1-D numpy arrays all
        work, since only ``len`` and indexing are used).

    Returns
    -------
    dict
        ``{'index', 'threshold', 'gini', 'groups'}`` where ``groups`` is the
        ``[left, right]`` pair returned by ``split``. If the rows have no
        feature columns, no candidate is evaluated: 'index', 'threshold'
        and 'groups' are None and 'gini' is infinity.
    """
    best_index, best_threshold, best_group = None, None, None
    # Start from +inf so that any evaluated candidate is accepted, instead
    # of relying on a fixed sentinel that a large score could exceed.
    best_gini = float('inf')

    # Distinct labels in first-seen order; passing one entry per row would
    # make the score scale with the number of rows.
    class_values = []
    for row in X:
        if row[-1] not in class_values:
            class_values.append(row[-1])

    # len() works for plain sequences as well as numpy rows.
    feature_count = len(X[0]) - 1
    for index in range(feature_count):
        for row in X:
            candidate = split(X, index, row[index])
            score = gini(candidate, class_values)
            if score < best_gini:
                best_index = index
                best_threshold = row[index]
                best_gini = score
                best_group = candidate

    return {'index': best_index, 'threshold': best_threshold,
            'gini': best_gini, 'groups': best_group}

def final_output(X):
    """Return the most frequent class label among the rows of ``X``.

    The label is the last element of each row. Raises ``ValueError`` (from
    ``max``) when ``X`` is empty. Ties are resolved by the iteration order
    of the set of labels.
    """
    labels = [sample[-1] for sample in X]
    candidates = set(labels)
    return max(candidates, key=lambda label: labels.count(label))


# Split a node in place. Each node is a dict with 'index', 'threshold', 'gini',
# plus 'left' and 'right', each of which is either a child node (dict) or a class label.
def split_node(node, max_depth, depth):
    """Grow the subtree below ``node`` in place.

    ``node`` must carry a 'groups' entry holding the ``[left, right]`` row
    groups of its split. That entry is removed and replaced by 'left' and
    'right' entries, each either a class label (leaf) or a nested node
    dict produced by ``get_split_dataset`` and expanded recursively.

    Parameters
    ----------
    node : dict as returned by ``get_split_dataset``.
    max_depth : depth at which both children are forced to be leaves.
    depth : depth of ``node`` itself.

    Returns
    -------
    None; ``node`` is modified in place.
    """
    left, right = node['groups']
    # The raw row groups are no longer needed once the children are set.
    node.pop('groups')

    # One side is empty: the split separated nothing, so both children
    # become the majority label of the side that holds the rows.
    if not left:
        leaf = final_output(right)
        node['right'] = leaf
        node['left'] = leaf
        return
    if not right:
        leaf = final_output(left)
        node['left'] = leaf
        node['right'] = leaf
        return

    at_depth_limit = depth >= max_depth
    for side, rows in (('left', left), ('right', right)):
        if at_depth_limit:
            # Depth limit reached: turn this side into a leaf.
            node[side] = final_output(rows)
        else:
            # Turn this side into a real node, then keep growing beneath it.
            child = get_split_dataset(rows)
            node[side] = child
            split_node(child, max_depth, depth + 1)


# Build a tree
def tree(dataset, max_depth):
    """Build a decision tree from ``dataset`` and return its root node.

    The root is the best split of the whole dataset; its children are then
    grown by ``split_node``, with the root counted as depth 1.
    """
    root = get_split_dataset(dataset)
    split_node(root, max_depth, depth=1)
    return root
    
# Prediction: tree_root is a node, data is the row we want to classify
def predict(tree_root, data):
    """Walk the tree from ``tree_root`` and return the label for ``data``.

    At each node the value ``data[node['index']]`` is compared with the
    node's threshold: strictly smaller goes left, otherwise right. A child
    that is a dict is another node to descend into; anything else is the
    predicted label and is returned as is.
    """
    node = tree_root
    while True:
        goes_left = data[node['index']] < node['threshold']
        child = node['left'] if goes_left else node['right']
        if not isinstance(child, dict):
            return child
        node = child

Now we try a toy example. 

In [179]:
# Toy dataset: each row is two feature values followed by the class label.
dataset = np.array([
    [2.771244718, 1.784783929, 0],
    [1.728571309, 1.169761413, 0],
    [3.678319846, 2.81281357, 0],
    [3.961043357, 2.61995032, 0],
    [2.999208922, 2.209014212, 0],
    [7.497545867, 3.162953546, 1],
    [9.00220326, 3.339047188, 1],
    [7.444542326, 0.476683375, 1],
    [10.12493903, 3.234550982, 1],
    [6.642287351, 3.319983761, 1],
])

# Build a tree with max_depth 2 and display the resulting nested dict.
root = tree(dataset, max_depth=2)
root

{'gini': 0.0,
 'index': 0,
 'left': {'gini': 0.0,
  'index': 0,
  'left': 0.0,
  'right': 0.0,
  'threshold': 2.7712447180000002},
 'right': {'gini': 0.0,
  'index': 0,
  'left': 1.0,
  'right': 1.0,
  'threshold': 7.4975458670000004},
 'threshold': 6.6422873510000002}

In [181]:
# Classify one sample with the tree built above. predict only reads the
# feature positions stored in the tree's nodes, so the trailing label is
# there for comparison with the printed result.
data = [7.497545867,3.162953546,1]
# Parenthesised call form: prints the same on Python 2 and Python 3.
print(predict(root, data))


1.0


True