In [35]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import numpy as np

In [36]:
# Load the IRIS dataset
iris = load_iris()
# Features (copied so later cells cannot mutate the cached sklearn array)
X = iris.data.copy()
# Classes as a column vector so they can be stacked next to the features;
# -1 lets NumPy infer the row count instead of hard-coding 150
y = iris.target.reshape(-1, 1)
# Fixed seed: without it the split (and every tree built from it) changes
# on each run, so results cannot be reproduced after a kernel restart
SEED = 42
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)
# Horizontally stacking the train_dataset together: each row is the
# feature values followed by the class label in the last column
train_dataset = np.hstack((X_train, y_train))

In [44]:
# Quick sanity check of the shapes and a sample row of each array
print(X.shape)
print(X[0])
print(y.shape)
# Number of distinct classes; ravel() flattens the label column without
# hard-coding the number of samples
print(len(set(y.ravel())))
print(train_dataset.shape)
print(train_dataset[0])

(150, 4)
[ 5.1  3.5  1.4  0.2]
(150, 1)
3
(100, 5)
[ 6.2  2.2  4.5  1.5  1. ]


In [11]:
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : iterable of sequences of rows
        The partitions produced by a split. The class label of each row is
        read from its last element (``row[-1]``).
    classes : iterable
        The class values to score against.

    Returns
    -------
    float
        Sum over the non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
        the group's share of all rows; ``0.0`` means every non-empty group
        is pure.
    """
    # count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        # Extract the labels once per group rather than once per class
        labels = [row[-1] for row in group]
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [12]:
def split_data(index, value, dataset):
    """Partition ``dataset`` on one column.

    Rows whose ``row[index]`` is strictly less than ``value`` go to the
    first list; all other rows go to the second. Row order is preserved
    within each list.

    Returns a ``(left, right)`` tuple of lists.
    """
    below, at_or_above = [], []
    for record in dataset:
        target = below if record[index] < value else at_or_above
        target.append(record)
    return below, at_or_above

In [28]:
# Select the best split point for a dataset
def get_split(dataset, verbose=True):
    """Exhaustively search for the split with the lowest Gini index.

    Every value found in every feature column (all columns except the last,
    which holds the class label) is tried as a threshold.

    Parameters
    ----------
    dataset : sequence of rows
        Must be non-empty; ``dataset[0]`` is used to find the column count.
    verbose : bool, optional
        When True (the default, matching the previous unconditional
        behaviour) the class values and each column index are printed as a
        progress trace. Pass False to search silently.

    Returns
    -------
    dict
        ``{'index': column, 'value': threshold, 'groups': (left, right)}``
        for the best split found.
    """
    class_values = list(set(row[-1] for row in dataset))
    if verbose:
        print("class_values:{}".format(class_values))
    # 999 is a sentinel larger than any possible Gini index (which is <= 1)
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        if verbose:
            print("\tindex:{}".format(index))
        # A repeated threshold yields the same groups and the same Gini
        # score, and only a strictly lower score replaces the current best,
        # so skipping repeats cannot change the result.
        tried = set()
        for row in dataset:
            value = row[index]
            if value in tried:
                continue
            tried.add(value)
            groups = split_data(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

In [45]:
def to_leaf(group):
    """Return the most frequent class label among the rows of ``group``.

    The label of each row is its last element. ``group`` must be
    non-empty, otherwise ``max`` raises ``ValueError``.
    """
    labels = []
    for record in group:
        labels.append(record[-1])
    distinct = set(labels)
    # Pick the distinct label that occurs most often in the group
    return max(distinct, key=lambda label: labels.count(label))

In [55]:
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    """Recursively grow the tree below ``node``, modifying it in place.

    Parameters
    ----------
    node : dict
        A node as returned by ``get_split``; its ``'groups'`` entry is
        consumed and replaced by ``'left'`` and ``'right'`` entries, each
        either a class label (leaf) or a nested node dict.
    max_depth : int
        Depth at which both children are forced to be leaves.
    min_size : int
        A child group with at most this many rows becomes a leaf.
    depth : int
        Depth of ``node`` (the root is started at 1 by ``build_tree``).

    Returns
    -------
    None
    """
    left, right = node['groups']
    del node['groups']
    # check for a no split: one side is empty, so the node cannot separate
    # anything. Both children become the same leaf; returning without
    # assigning them would leave a node with no 'left'/'right' keys.
    if not left or not right:
        node['left'] = node['right'] = to_leaf(list(left) + list(right))
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_leaf(left), to_leaf(right)
        return
    # process left child, then right child (the left subtree is fully
    # built before the right one is started)
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_leaf(rows)
        else:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_size, depth+1)

In [56]:
def build_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node.

    The root split is chosen with ``get_split`` and then expanded in place
    by ``split``, starting at depth 1, using the given ``max_depth`` and
    ``min_size`` limits.
    """
    tree_root = get_split(train)
    split(tree_root, max_depth, min_size, depth=1)
    return tree_root

In [57]:
# Fit a shallow tree on the training rows: at most 2 levels, and any group
# of 1 row or fewer becomes a leaf
tree = build_tree(train_dataset, max_depth=2, min_size=1)

class_values:[0.0, 1.0, 2.0]
	index:0
	index:1
	index:2
	index:3
class_values:[0.0]
	index:0
	index:1
	index:2
	index:3
0.0
0.0
class_values:[1.0, 2.0]
	index:0
	index:1
	index:2
	index:3
1.0
2.0
