In [3]:
import numpy as np

## Create some data

In [11]:
# Column names for each row; the last column is the response, the rest are features.
header = ['color', 'diameter', 'label']

# Toy training set: one row per example, laid out as [color, diameter, label].
data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

In [21]:
# Gather the first-column value of every row, then reduce to the distinct values.
labels = [row[0] for row in data]
unique_labels = set(labels)
unique_labels

{'Green', 'Red', 'Yellow'}

## CART model

Impurity: the chance of being incorrect if we randomly assign a label to an example drawn from the same set.

* What questions to ask?
* When to ask question?

To generate a list of questions, we'll iterate over every value for every feature in the data.

In [33]:
def is_numeric(value):
    """Return True when `value` is an int or a float, False otherwise."""
    return isinstance(value, (int, float))

In [38]:
class Question:
    """A test on one column of an example, used to split a dataset in two.

    `column` is the index of the feature being tested and `value` is the
    value that feature is compared against.
    """

    def __init__(self, column, value):
        # column: index into a row (e.g. 0 for color)
        # value: the value to compare against (e.g. 'Green')
        self.column, self.value = column, value

    def match(self, example):
        """Return whether `example` satisfies this question.

        Numeric feature values are tested with `>=`; everything else is
        tested for equality.
        """
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        """Render the question as text using the column name from `header`."""
        operator = '>=' if is_numeric(self.value) else '=='
        pieces = ['Is ', str(header[self.column]), str(operator), str(self.value), '?']
        return ''.join(pieces)

In [41]:
def partition(rows, question):
    """Split `rows` into two lists according to `question`.

    Rows for which `question.match(row)` is truthy go into the first list;
    all other rows go into the second. Row order is preserved in both.

    Returns a `(true_rows, false_rows)` tuple.
    """
    matched, unmatched = [], []
    for example in rows:
        # Pick the destination bucket, then append once.
        bucket = matched if question.match(example) else unmatched
        bucket.append(example)
    return matched, unmatched

The best question is the one that reduces the uncertainty the most. Gini impurity quantifies how much uncertainty there is at a node. Information gain quantifies how much a question reduces that uncertainty.
* Gini impurity is a measure of how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset
* Information gain is used to decide which feature to split on at each step in building the tree. Simplicity is best, so we want to keep our tree small. To do so, at each step we should choose the split that results in the purest daughter nodes.

To compute Gini impurity for a set of items with $J$ classes, suppose $i \in \{1, 2, \ldots, J\}$, and let $p_i$ be the fraction of items labeled with class $i$ in the set:  
[Gini impurity](https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity) = $\sum_{i=1}^{J} P(\text{item has label } i) \cdot P(\text{mistake in categorizing an item with label } i) = \sum_{i=1}^{J} p_i (1 - p_i)$
$$I_G = 1 - \sum_{i=1}^J p_i^2$$

In [42]:
def class_counts(rows):
    """Tally how many rows carry each label.

    The label of a row is its last element. Returns a dict mapping each
    label to the number of rows that have it.
    """
    tally = {}
    for example in rows:
        key = example[-1]  # label lives in the last column
        tally[key] = tally.get(key, 0) + 1
    return tally

In [43]:
def gini(rows):
    """Return the Gini impurity of `rows`.

    Starts from 1 and subtracts the squared fraction of rows belonging to
    each label, as counted by `class_counts`.
    """
    total = float(len(rows))
    impurity = 1
    for count in class_counts(rows).values():
        fraction = count / total
        impurity -= fraction ** 2
    return impurity

In [None]:
def info_gain(left, right, current_uncertainity):
    """Return the information gain of splitting a node into `left` and `right`.

    This is the parent's uncertainty (`current_uncertainity`) minus the Gini
    impurity of each child, with each child weighted by its share of the rows.
    """
    n_left = len(left)
    weight_left = float(n_left) / (n_left + len(right))
    weight_right = 1 - weight_left
    gain = current_uncertainity - weight_left * gini(left)
    gain -= weight_right * gini(right)
    return gain

In [4]:
def find_best_split(rows):
    """Find the question that yields the highest information gain on `rows`.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a `Question`. Candidate splits
    that leave one side empty are skipped because they do not divide the data.

    Returns a `(best_gain, best_question)` tuple. When `rows` is empty or no
    question produces a positive gain, the result is `(0, None)`.
    """
    best_gain = 0
    best_question = None
    if not rows:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # last column is the label

    for column in range(n_features):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # chosen question is deterministic when several candidates tie.
        for value in dict.fromkeys(row[column] for row in rows):
            question = Question(column, value)
            true_rows, false_rows = partition(rows, question)
            if not true_rows or not false_rows:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            if gain > best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [3]:
def build_tree(rows):
    """Recursively build a decision tree for `rows`.

    Asks `find_best_split` for the best question. If it reports zero
    information gain, the rows are wrapped in a `Leaf`. Otherwise the rows
    are partitioned on that question and a subtree is built for each side.

    Returns a `Leaf` or a `Decision_Node`.
    NOTE(review): `Leaf` and `Decision_Node` are not defined in the visible
    part of this file — confirm they exist before running.
    """
    info, question = find_best_split(rows)
    # Base case: no question improves purity, so stop splitting here.
    if info == 0:
        return Leaf(rows)
    true_rows, false_rows = partition(rows, question)
    # Recurse on each side of the split.
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)

In [44]:
def print_tree(node, spacing=''):
    """Placeholder for printing a tree; not implemented yet.

    The body is empty, so calling this does nothing and returns None.
    `node` and `spacing` are currently unused — presumably the tree node to
    print and an indentation prefix; TODO confirm once implemented.
    """
    pass