In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
iris = load_iris()

# Column names: cut the trailing 5 characters (the unit suffix) off every
# feature name and replace spaces with underscores.
feature_cols = [name[:-5].replace(" ", "_") for name in iris["feature_names"]]

# Put the class labels next to the measurements as one extra column.
df = pd.DataFrame(data=np.column_stack([iris["data"], iris["target"]]),
                  columns=feature_cols + ["type"])

# Stacking stored the labels in the same array as the float measurements;
# convert the label column to 32-bit integers.
df["type"] = np.int32(df["type"])

del iris

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Number of rows (samples) in the data frame.
df.shape[0]

150

### Splitting Data

In [4]:
# Probability with which each sample is assigned to the training set.
train_size = 0.7

In [5]:
# Seeded generator: without a fixed seed every Restart & Run All produces a
# different train/test split (and therefore a different tree and plots).
rng = np.random.default_rng(42)

# Boolean mask over the rows of `df`: True -> training sample, False -> test
# sample. Every row is drawn independently, so the realised split is only
# approximately train_size : (1 - train_size).
train_idx = rng.choice([True, False], len(df), p=[train_size, 1-train_size])

In [6]:
# Feature matrix (one column per measurement) and label vector as NumPy arrays.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_names].values
Y = df["type"].values

# Rows flagged in `train_idx` form the training set; all other rows the test set.
test_idx = ~train_idx
x_train, y_train = X[train_idx], Y[train_idx]
x_test, y_test = X[test_idx], Y[test_idx]

for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(103, 4)
(103,)
(47, 4)
(47,)


## Utility Functions

In [7]:
def entropy(x):
    """Return the Shannon entropy, in bits, of the values in ``x``.

    Parameters
    ----------
    x : iterable of hashable values
        The observations (e.g. class labels). Each distinct value is treated
        as one outcome whose probability is its relative frequency in ``x``.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the distinct values of ``x``.
    """
    # Occurrence count of every distinct value.
    counts = np.array([count for count in Counter(x).values()])
    # Relative frequencies.
    probs = counts / counts.sum()
    return -(probs * np.log2(probs)).sum()

In [8]:
def info_gain(x, ys):
    """Return the information gain obtained by partitioning ``x``.

    Parameters
    ----------
    x : numpy.ndarray
        Values (e.g. class labels) of the node being split.
    ys : iterable of boolean masks
        One boolean mask over ``x`` per child; ``x[mask]`` are the values that
        fall into that child.

    Returns
    -------
    float
        ``entropy(x)`` minus the size-weighted sum of the children's
        entropies. ``0.0`` when ``x`` is empty (nothing can be gained).
    """
    total = len(x)
    if total == 0:
        # Avoid dividing by zero: an empty node has no entropy to reduce.
        return 0.0
    # Each child's entropy is weighted by the fraction of samples it receives.
    weighted_children = sum(
        np.count_nonzero(mask) * entropy(x[mask]) / total for mask in ys
    )
    return entropy(x) - weighted_children

In [9]:
class Tree:
    """Binary decision tree node that splits on quantile-based thresholds.

    Every node picks the (feature, threshold) pair with the highest
    information gain among a small set of candidate thresholds and sends the
    samples with ``feature < threshold`` to ``children[0]`` and all others to
    ``children[1]``.

    Parameters
    ----------
    bins : int
        Number of quantile positions (minimum and maximum included) evaluated
        per feature; the values found at all positions but the first are the
        candidate thresholds.
    depth : int
        Number of levels of nodes below and including this one. A node with
        ``depth <= 1`` learns its own split but creates no children.

    Attributes
    ----------
    head : int or None
        Column index of the feature used by this node's split; ``None`` before
        ``fit`` or when no threshold separates the samples into two non-empty
        groups.
    arg_value : float or None
        Threshold of the split; ``None`` under the same conditions as ``head``.
    children : list of Tree
        ``[left, right]`` sub-trees, or an empty list.
    """

    def __init__(self, bins = 5, depth=3):
        self.head = None
        self.arg_value = None
        self.children = []
        self.bins = bins
        self.depth = depth

    def fit(self, X, Y):
        """Learn this node's split from ``X``/``Y`` and fit the sub-trees.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix.
        Y : array-like, shape (n_samples,)
            Class labels.

        Returns
        -------
        None
            The result is stored in ``head``, ``arg_value`` and ``children``.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        # Forget any earlier fit so that a node can be re-fitted safely.
        self.head = None
        self.arg_value = None
        self.children = []

        n = len(Y)-1
        if n < 0:
            # No samples reached this node: there is nothing to learn.
            return

        # Positions (into the sorted feature column) of evenly spaced
        # quantiles, from the minimum (0) to the maximum (n).
        positions = np.ceil(n*np.linspace(0, 1, self.bins)).astype(np.int64)

        gain = -1

        # Iterating over all features
        for x in range(X.shape[1]):
            column = X[:, x]

            # Candidate thresholds: the quantile values except the minimum
            # (nothing is smaller than the minimum). np.unique drops repeated
            # values and keeps the ascending order.
            thresholds = np.unique(np.sort(column)[positions][1:])

            # Calculating information gain for each candidate split.
            for threshold in thresholds:
                # Score exactly the partition that is applied to the children.
                left = column < threshold
                n_left = np.count_nonzero(left)
                if n_left == 0 or n_left == len(Y):
                    # One side would be empty: not a split.
                    continue

                p_gain = info_gain(Y, [left, ~left])

                if gain < p_gain:
                    gain = p_gain
                    self.head = x
                    self.arg_value = threshold

        # Stop when no usable split exists or the depth budget is exhausted.
        if self.head is None or self.depth <= 1:
            return

        # Children inherit this node's `bins` setting.
        self.children = [Tree(bins=self.bins, depth=self.depth-1),
                         Tree(bins=self.bins, depth=self.depth-1)]
        idx = X[:, self.head] < self.arg_value
        self.children[0].fit(X[idx], Y[idx])
        self.children[1].fit(X[~idx], Y[~idx])

## Model Construction

In [10]:
# Five quantile positions per feature; depth is spelled out at the class default of 3.
obj = Tree(bins=5, depth=3)

In [11]:
# Grow the tree on the training split only.
obj.fit(X=x_train, Y=y_train)

## Model Visualization

I've written code for a binary decision tree. Each left child operates on values smaller than the argument value shown in the title, and each right child operates on values greater than or equal to the argument value shown in the title. The hierarchy is as follows:

$$\text{Parent}$$
$$\big/ \ \ \setminus $$
$$\text{Child 1}\ \ \ \ \ \ \ \ \text{Child 2}$$
$$\big/ \ \ \ \ \ \ \ \ \setminus \ \ \ \ \ \  \big/ \ \ \ \ \ \ \ \ \setminus$$
$$\text{GC 11}\ \ \ \text{GC 12}\ \ \ \text{GC 21}\ \ \ \text{GC 22}$$



Here the decision boundary of each child is represented by a different colour.

In [12]:
# Column labels of `df`, in order; used below to turn a feature index into its name.
cols = df.columns

In [13]:
# Per-feature [minimum, maximum] over the training data; used as the extent of
# the boundary lines drawn in the plots.
lows, highs = x_train.min(axis=0), x_train.max(axis=0)
range_dict = {name: [low, high] for name, low, high in zip(cols, lows, highs)}
range_dict

{'sepal_length': [4.3, 7.9],
 'sepal_width': [2.3, 4.4],
 'petal_length': [1.0, 6.7],
 'petal_width': [0.1, 2.5]}

In [14]:
def decison_tree_seq(arg_idx, arg_val, col, ls, fig=None, ax=None):
    """
    Sequentially plot the decision boundaries of a decision tree.

    On the first call (no ``fig``/``ax`` given) a 2x3 grid of scatter plots is
    created, one panel per pair of features. Every call then draws the
    boundary of one tree node into each panel that shows the node's feature.

    Solid circle points represent training data.
    Transparent + points represent test data.
    The colour of the figure title is the same as the colour of the decision
    boundary introduced by the node.
    The figure title contains the name of the attribute and the threshold at
    which the data is divided into two bins.

    Relies on the notebook-level names ``df``, ``train_idx``, ``cols`` and
    ``range_dict``.

    Parameters
    ----------
    arg_idx : int or None
        Index into ``cols`` of the feature the node splits on.
    arg_val : float or None
        Threshold of the split.
    col : str
        Colour of the boundary lines and of the figure title.
    ls : str
        Matplotlib line style of the boundary lines.
    fig, ax : optional
        Figure and axes grid returned by an earlier call; when either is
        ``None`` a new figure is created.

    Returns
    -------
    (fig, ax)
        The figure and its 2x3 array of axes, to be passed to the next call.
    """
    # Panel layout: comb[i][j] is the (x-axis feature, y-axis feature) of ax[i, j].
    comb = [[("sepal_length", "sepal_width"), ("sepal_length", "petal_length"), ("sepal_length", "petal_width")],
            [("sepal_width", "petal_length"), ("sepal_width", "petal_width"), ("petal_length", "petal_width")]]

    if fig is None or ax is None:
        fig, ax = plt.subplots(2, 3, figsize=(20, 8))

        train = df[train_idx]
        test = df[~train_idx]

        # Shared colour scale, so a class gets the same colour in the train
        # and test scatter even if one split lacks a class.
        vmin, vmax = df["type"].min(), df["type"].max()

        for i in range(2):
            for j in range(3):
                x_name, y_name = comb[i][j]
                ax[i,j].scatter(train[x_name], train[y_name], c=train["type"], vmin=vmin, vmax=vmax)
                ax[i,j].scatter(test[x_name], test[y_name], c=test["type"], marker="+", alpha=0.5,
                                vmin=vmin, vmax=vmax)
                ax[i,j].set_xlabel(" ".join(word.capitalize() for word in x_name.split("_")))
                ax[i,j].set_ylabel(" ".join(word.capitalize() for word in y_name.split("_")))

    if arg_idx is None or arg_val is None:
        # The node has no split, so there is no boundary to draw.
        plt.close(fig)
        return fig, ax

    arg_name = cols[arg_idx]
    fig.suptitle((arg_name, arg_val), color=col)

    for i in range(2):
        for j in range(3):
            x_name, y_name = comb[i][j]
            if arg_name == x_name:
                # Split feature on the x-axis: vertical line spanning the y range.
                ax[i,j].plot([arg_val, arg_val], range_dict[y_name], color=col, linestyle=ls)
            elif arg_name == y_name:
                # Split feature on the y-axis: horizontal line spanning the x range.
                ax[i,j].plot(range_dict[x_name], [arg_val, arg_val], color=col, linestyle=ls)

    # Close this specific figure (not whichever one happens to be current) so
    # the notebook does not render it a second time; callers display the
    # returned `fig` explicitly.
    plt.close(fig)
    return fig, ax

In [None]:
# Parent

# First call: no fig/ax passed, so the scatter grid is created here.
root = obj
fig, ax = decison_tree_seq(root.head, root.arg_value, "black", "-")
fig

In [None]:
# Child 1

# Left child of the root, drawn onto the existing figure.
node = obj.children[0]
fig, ax = decison_tree_seq(node.head, node.arg_value, "red", "--", fig, ax)
fig

In [None]:
# Child 2

# Right child of the root, drawn onto the existing figure.
node = obj.children[1]
fig, ax = decison_tree_seq(node.head, node.arg_value, "blue", "--", fig, ax)
fig

In [None]:
# GrandChild 11

# Left child of the root's left child.
node = obj.children[0].children[0]
fig, ax = decison_tree_seq(node.head, node.arg_value, "green", "-.", fig, ax)
fig

In [None]:
# GrandChild 12

# Right child of the root's left child.
node = obj.children[0].children[1]
fig, ax = decison_tree_seq(node.head, node.arg_value, "magenta", "-.", fig, ax)
fig

In [None]:
# GrandChild 21

# Left child of the root's right child.
node = obj.children[1].children[0]
fig, ax = decison_tree_seq(node.head, node.arg_value, "brown", "-.", fig, ax)
fig

In [None]:
# GrandChild 22

# Right child of the root's right child.
node = obj.children[1].children[1]
fig, ax = decison_tree_seq(node.head, node.arg_value, "cyan", "-.", fig, ax)
fig