In [1]:
from sklearn.datasets import make_classification

# Synthetic 5-class dataset with 10 features (7 informative, 2 redundant).
# NOTE(review): no random_state is passed, so the generated data is presumably
# different on every kernel run -- confirm whether a fixed seed is wanted.
x, y = make_classification(n_classes=5, n_informative=7, n_features=10, n_redundant=2)

In [3]:
import numpy as np
from IPython.core.debugger import Tracer
from collections import namedtuple

SplitResult = namedtuple('SplitResult', 'left right')
Dataset = namedtuple('Dataset', 'x y')

def split_dataset(x, y, index, value):
    """Partition the rows of (x, y) by comparing one feature column to a threshold.

    Parameters
    ----------
    x
        2-D array of feature rows
    y
        labels, indexed in step with the rows of `x`
    index
        column of `x` that is compared against `value`
    value
        threshold

    Returns
    -------
    split : SplitResult
        `left` is a Dataset holding the rows where x[:, index] < value,
        `right` is a Dataset holding the rows where x[:, index] >= value"""

    column = x[:, index]

    # Row positions on each side of the threshold.
    below_rows = np.argwhere(column < value).ravel()
    other_rows = np.argwhere(column >= value).ravel()

    lower_part = Dataset(x = x[below_rows], y = y[below_rows])
    upper_part = Dataset(x = x[other_rows], y = y[other_rows])
    return SplitResult(left = lower_part, right = upper_part)

# Sanity check: the two sides of a split together contain every row of x.
assert len(split_dataset(x, y, 1, 1).left.x) + len(split_dataset(x, y, 1, 1).right.x) == len(x)

In [4]:
from collections import Counter

# TODO create metric strategy-class to implement various plug-in metrics via fixed interface
def gini_impurity(split, classes):
    """Calculates the summed Gini impurity of the groups of a split.

    For every class in `classes` and every non-empty group in `split`, the term
    ``p * (1 - p)`` is added, where ``p`` is the fraction of the group's labels
    equal to that class. Empty groups contribute nothing.

    NOTE(review): the per-group terms are summed without weighting by group
    size; the usual CART split criterion weights each group by its share of
    the samples -- confirm the unweighted sum is intended.

    Parameters
    ----------
    split
        iterable of groups exposing a `y` attribute with the labels; intended
        to be passed as a result of the `split_dataset` function
    classes
        iterable of the class values to account for

    Returns
    -------
    gini
        0 when every non-empty group is pure (or when there is nothing to sum)

    See Also
    --------
    split_dataset(x, y, index, value)
    https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity"""

    # Tally the labels once per group instead of once per (class, group) pair.
    # Materialising the groups also lets `split` be a one-shot iterator.
    tallies = []
    for group in split:
        group_size = len(group.y)
        if group_size:
            tallies.append((group_size, Counter(group.y)))

    gini = 0

    # Accumulate class by class, then group by group, so the order of the
    # floating point additions is fixed.
    for class_val in classes:
        for group_size, counts in tallies:
            proportion = counts[class_val] / group_size
            gini += proportion * (1.0 - proportion)

    return gini
            
# Two evenly mixed groups: each contributes 0.25 per class, 1.0 in total.
assert gini_impurity([Dataset(x = [1, 1], y = [0, 1]), Dataset(x = [1, 1], y = [0, 1])], [0, 1]) == 1.0
# Two pure groups (one per class): no impurity at all.
assert gini_impurity([Dataset(x =[1, 1], y = [1, 1]), Dataset(x = [1, 1], y = [0, 0])], [0, 1]) == 0.0
# Both groups pure in the same class; the value is displayed as the cell output.
gini_impurity([Dataset(x =[1, 1], y = [1, 1]), Dataset(x = [1, 1], y = [1, 1])], [0, 1])

0.0

In [5]:
from collections import namedtuple

# TODO refactor this to a class
BestSplit = namedtuple('BestSplit', 'data split_index split_value gini')

def unison_shuffled_copies(a, b):
    """Reorder `a` and `b` with one shared random permutation.

    Both inputs must have the same length (checked with an assertion). Element
    i of the first result and element i of the second result come from the
    same original position. The permutation is drawn from NumPy's global
    random state.

    Returns
    -------
    (a_shuffled, b_shuffled)
    """
    assert len(a) == len(b)
    order = np.random.permutation(len(a))
    a_shuffled = a[order]
    b_shuffled = b[order]
    return a_shuffled, b_shuffled

# TODO create Splitter class to implement various plug-in split strategies via fixed interface
def best_split(x, y, min_samples_split = 2):
    """Finds best split of a dataset based on gini index.

    Every value of every feature column is tried as a threshold. A candidate
    is accepted only if both sides of the split hold strictly more than
    `min_samples_split` rows; among accepted candidates the one with the
    lowest `gini_impurity` wins (a later candidate replaces an earlier one on
    ties). Rows and features are visited in random order, so tie-breaking
    depends on NumPy's global random state.

    Parameters
    ----------
    x
        2-D array of feature rows
    y
        labels, indexed in step with the rows of `x`
    min_samples_split : int
        both sides of an accepted split must contain more rows than this

    Returns
    -------
    result : BestSplit
        `data` is the winning SplitResult (built from a shuffled copy of the
        input), `split_index` / `split_value` are the feature column and
        threshold, `gini` is the impurity of the split.
        When no candidate is accepted, a "no-split" is returned instead:
        `data.left` holds all of (x, y), `data.right` holds empty arrays,
        `split_index` and `split_value` are None and `gini` is 2."""

    classes = np.unique(y)

    # gini = 2 acts as a "nothing found yet" sentinel for the comparison below.
    result = BestSplit(data = None, split_index = None, split_value = None, gini = 2)
    x_shuffled, y_shuffled = unison_shuffled_copies(x, y)

    feature_count = x_shuffled.shape[1]
    for feature_index in np.random.choice(feature_count, feature_count, replace=False):
        for row in x_shuffled:
            split = split_dataset(x_shuffled, y_shuffled, feature_index, row[feature_index])

            # Reject undersized partitions first: the size check is cheap,
            # the impurity computation is not.
            if len(split.left.x) <= min_samples_split or len(split.right.x) <= min_samples_split:
                continue

            gini = gini_impurity(split, classes)

            if gini <= result.gini:
                result = BestSplit(data = split,
                                   split_index = feature_index,
                                   split_value = row[feature_index],
                                   gini = gini)

    if result.data is None:
        # No admissible threshold: report everything on the left, nothing on the right.
        result = BestSplit(data = SplitResult(left = Dataset(x = x, y = y), right = Dataset(x = np.array([]), y = np.array([]))), split_index = None, split_value = None, gini = 2)

    return result

# Time a single best-split search over the whole dataset.
%time best_split(x,y, min_samples_split=5)

CPU times: user 172 ms, sys: 4 ms, total: 176 ms
Wall time: 173 ms


BestSplit(data=SplitResult(left=Dataset(x=array([[-0.87605503, -1.98779465,  1.25783197,  2.15380162, -1.06581946,
        -0.48838044,  1.12979039, -5.47313796,  2.19445741,  0.11751629],
       [ 0.01735175, -2.32959031,  2.20521961,  5.99430543,  0.29296566,
         1.15088118, -0.40860719, -2.10744055,  3.42933707, -1.0317391 ],
       [-1.83836834,  1.61357976,  0.18414449,  1.05350347, -0.11351791,
         3.71444323,  0.03241307,  3.05667651,  1.10219899,  1.67035608],
       [ 0.81671198, -1.4160242 , -0.35869577,  2.5560828 ,  0.21236339,
        -0.84269325, -2.16490598,  0.33262261,  0.16012379, -1.22059318],
       [-1.35649278, -0.96302216, -1.78807828,  1.27889702, -0.34237053,
        -1.66242489, -2.23841713, -1.77591925,  1.17449006, -1.22041871],
       [-0.64880527,  1.02869576,  1.44953118,  2.48584172, -0.04374698,
         1.3507413 , -1.33501284,  1.36163883,  1.71427057,  1.44756917],
       [ 1.74840666, -0.49133734,  0.67031032,  2.32661874,  0.8533285 ,
   

In [28]:
class Node(object):
    """Classification tree node.

    Attributes
    ----------
    parent : Node
        this node's parent

    is_leaf : bool
        indicates that this is a terminal node; assigning a true value
        computes `leaf_value` from the data stored in `split`

    leaf_value : int
        resulting class value for a terminal node, None for other nodes

    depth : int
        depth level of this node

    left : Node
        left child

    right : Node
        right child

    split : BestSplit
        split stored at this node; its `data` attribute holds the left/right
        partitions (could be deleted in the fitting process)

    min_samples_split : int
        passed on to `best_split` when children of this node are created
    """

    def __init__(self, parent, is_leaf, depth, split = None, left = None, right = None, min_samples_split = 2):
        # Defined up front so the attribute also exists on non-terminal nodes.
        self.leaf_value = None
        self.split = split
        self.parent = parent
        self.left = left
        self.right = right
        self.depth = depth
        self.min_samples_split = min_samples_split
        # Assigned last: the property setter reads self.split when the value is true.
        self.is_leaf = is_leaf

    def create_child(self, is_leaf, left):
        """Create a child of this node

        Parameters
        ----------
        is_leaf : bool
            the child should be a leaf; its `leaf_value` is then the majority
            class of the chosen partition and no further split is searched
        left : bool
            left or right child, i.e. which partition of this node's split
            the child is built from

        Returns
        -------
        child : Node
            when `is_leaf` is false the child may still come back as a leaf
            if `best_split` finds no split for its partition
        """
        partition = self.split.data.left if left else self.split.data.right

        child = Node(parent = self,
                     is_leaf = False,
                     depth = self.depth + 1,
                     min_samples_split = self.min_samples_split)

        if is_leaf:
            # The child has no split of its own, so the setter (which reads
            # child.split) is bypassed and the label is taken straight from
            # the partition. `__is_leaf` is name-mangled like in the property.
            child.leaf_value = self._majority_class(partition.y)
            child.__is_leaf = True
            return child

        child.split = best_split(partition.x,
                                 partition.y,
                                 min_samples_split = self.min_samples_split)

        # if the best split is a no-split then we have a terminal node
        if (child.split.data.left.x.size == 0) or (child.split.data.right.x.size == 0):
            child.is_leaf = True

        return child

    @property
    def is_leaf(self):
        return self.__is_leaf

    @is_leaf.setter
    def is_leaf(self, value):
        self.__is_leaf = value

        if value:
            self._finalize_leaf()

    @staticmethod
    def _majority_class(labels):
        """Return the most frequent label; labels are cast to int and the lowest label wins ties."""
        return np.bincount(np.asarray(labels).astype(int)).argmax()

    def _finalize_leaf(self):
        """Make this node terminal and calculate leaf_value from both partitions of its split."""
        labels = np.concatenate([self.split.data.left.y, self.split.data.right.y], axis = 0)
        self.leaf_value = self._majority_class(labels)

class ClassificationTree(object):
    """Classification tree model. This implementation is not intended to be
    the fastest or the most feature-rich. It is created mainly for educational
    purposes, so the creator tried to achieve code readability and simplicity.

    Attributes
    ----------
    max_depth : int
        maximum tree depth constraint

    min_samples_split : int
        a node is only split further when both partitions of its split hold
        at least this many samples; also passed on to `best_split`

    root_node : Node
        root of the fitted tree, None until `fit` has been called
    """

    def __init__(self, max_depth, min_samples_split):
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.root_node = None

    def fit(self, x, y):
        """Fits a classification tree.

        The tree is grown iteratively with an explicit stack. A node becomes
        a leaf when it reaches `max_depth`, when one of its partitions holds
        fewer than `min_samples_split` samples, or when no split was found
        for it (see Node.create_child).

        Parameters
        ----------
        x : numpy matrix
            feature vectors
        y : numpy array
            class values (only integer values are supported for now)"""
        self.root_node = Node(split = best_split(x, y, min_samples_split=self.min_samples_split),
                              parent=None,
                              is_leaf=False,
                              depth=0,
                              min_samples_split=self.min_samples_split)

        node_stack = [self.root_node]

        while node_stack:
            node = node_stack.pop()

            # if node became a leaf as a result of a no-split (see Node.create_child)
            if node.is_leaf:
                continue

            # stop if we have reached maximum depth
            if node.depth >= self.max_depth:
                node.is_leaf = True
                continue

            # Check both partitions before creating any child, so that a node
            # which ends up as a leaf never gets a half-built subtree. This
            # also covers a root for which no split was found (empty right side).
            partitions = node.split.data
            if (len(partitions.left.x) < self.min_samples_split
                    or len(partitions.right.x) < self.min_samples_split):
                node.is_leaf = True
                continue

            node.left = node.create_child(is_leaf=False, left=True)
            node_stack.append(node.left)

            node.right = node.create_child(is_leaf=False, left=False)
            node_stack.append(node.right)

            # TODO make BestSplit a class so we can delete redundant split data
            # del node.split

    def predict_vector(self, x):
        """Make class prediction for a single row of data

        Walks from the root to a leaf, going left when the split feature is
        below the split value and right otherwise.

        Parameters
        ----------
        x : numpy array

        Returns
        -------
        the `leaf_value` of the leaf that was reached

        Raises
        ------
        Exception
            if the tree has not been fitted
        """
        if not self.root_node:
            raise Exception('You should call fit(x, y) first')

        node = self.root_node

        while not node.is_leaf:
            if x[node.split.split_index] < node.split.split_value:
                node = node.left
            else:
                node = node.right

        return node.leaf_value

    def predict(self, x):
        """Make class predictions

        Parameters
        ----------
        x : numpy matrix

        Returns
        -------
        list with one prediction per row of `x`"""
        return [self.predict_vector(row) for row in x]

    def print_tree(self):
        """Print fitted tree

        One line per node, prefixed with one '-' per depth level. Below a
        split line the right child (feature >= value) is printed before the
        left child (feature < value), because the right child is popped from
        the stack first.

        Raises
        ------
        Exception
            if the tree has not been fitted
        """
        if not self.root_node:
            raise Exception('You should call fit(x, y) first')

        node_stack = [self.root_node]

        while node_stack:
            node = node_stack.pop()

            if node.is_leaf:
                print("%s terminal - class %d" % ('-' * node.depth, node.leaf_value))
            else:
                print("%s feature[%d] > %f" % ('-' * node.depth, node.split.split_index, node.split.split_value))
                node_stack.append(node.left)
                node_stack.append(node.right)
                
            
# Fit the custom tree on the whole synthetic dataset and print its structure.
tree = ClassificationTree(max_depth=20, min_samples_split=5)
tree.fit(x, y)
tree.print_tree()

 feature[4] > 2.270284
- terminal - class 1
- feature[6] > -2.029364
-- feature[1] > -2.356281
--- feature[9] > -1.645933
---- feature[9] > -1.038460
----- feature[4] > -2.092146
------ feature[7] > -5.031622
------- feature[7] > -3.517247
-------- feature[9] > 1.447569
--------- feature[9] > 2.469019
---------- terminal - class 0
---------- terminal - class 0
--------- feature[6] > 1.095621
---------- terminal - class 1
---------- feature[9] > 0.025360
----------- terminal - class 0
----------- feature[9] > -0.591778
------------ terminal - class 2
------------ terminal - class 4
-------- terminal - class 1
------- terminal - class 0
------ terminal - class 3
----- terminal - class 3
---- terminal - class 2
--- terminal - class 3
-- terminal - class 2


In [25]:
from sklearn.metrics import accuracy_score, classification_report
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on old installations.
# NOTE(review): train_test_split is not used in this cell.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

tree = ClassificationTree(max_depth=50, min_samples_split=2)
tree.fit(x, y)
# Scored on the same data the tree was fitted on, i.e. this is training accuracy.
tree_predictions = tree.predict(x)
print(accuracy_score(y, tree_predictions))
print(classification_report(y, tree_predictions))

0.8
             precision    recall  f1-score   support

          0       0.80      0.76      0.78        21
          1       0.74      0.91      0.82        22
          2       0.87      0.68      0.76        19
          3       0.82      0.90      0.86        20
          4       0.81      0.72      0.76        18

avg / total       0.81      0.80      0.80       100



In [26]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: scikit-learn's decision tree, fitted and scored on the same data
# (training accuracy, no held-out set).
clf = DecisionTreeClassifier(max_depth=50, min_samples_split=2)
clf.fit(x, y)
print(accuracy_score(y, clf.predict(x)))
print(classification_report(y, clf.predict(x)))

1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       1.00      1.00      1.00        22
          2       1.00      1.00      1.00        19
          3       1.00      1.00      1.00        20
          4       1.00      1.00      1.00        18

avg / total       1.00      1.00      1.00       100

