<a href="https://colab.research.google.com/github/tsakailab/prml/blob/master/ipynb/rnd2d_ex1_dtree_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# IPython magic (not plain Python): show matplotlib figures inline in the notebook output.
%matplotlib inline

Classification by decision tree
===================

Define functions
----------------

In [0]:
#@title Define functions to visualize classification results
import numpy as np
from matplotlib import pyplot as plt
# Visualization of the decision boundary and regions
def plot2d_classification(decision_function, X_train, y_train, X_test=None, y_test=None, w=None, cmap=plt.cm.bwr, xlim=None, ylim=None, levels=None, colors='k', linestyles=None):
    """Plot the output of a 2-D classifier together with the data points.

    A new figure is created. `decision_function` is evaluated on a regular
    grid (step 0.02) covering `xlim` x `ylim` and drawn either as a faint
    colour map or, when `levels` is given, as contour lines.

    Parameters
    ----------
    decision_function : callable
        Called with an array of shape (n_points, 2); must return one value
        per point (the result is reshaped to the grid shape).
    X_train, y_train : arrays
        Training points (two columns are used) and their labels. Points with
        `y_train > 0` are drawn as red squares, the others as blue circles.
    X_test, y_test : arrays, optional
        Test points and labels. Drawn (coloured by `y_test` through `cmap`)
        only when both are given; a legend is added in that case.
    w : sequence of three numbers, optional
        Coefficients of the line `w[0] + w[1]*x1 + w[2]*x2 = 0`, drawn in
        black when given.
    cmap : matplotlib colormap
        Colormap for the colour map and the test points.
    xlim, ylim : pair of floats, optional
        Grid extent. Defaults to the range of the training data padded by 0.5.
    levels, colors, linestyles :
        Forwarded to `Axes.contour` when `levels` is not None.
    """
    plt.figure()
    ax = plt.axes()

    if xlim is None:
        xlim = [X_train[:, 0].min() - .5, X_train[:, 0].max() + .5]
    if ylim is None:
        ylim = [X_train[:, 1].min() - .5, X_train[:, 1].max() + .5]

    xx, yy = np.meshgrid(np.arange(xlim[0], xlim[1], 0.02), np.arange(ylim[0], ylim[1], 0.02))

    # Evaluate the decision function at every point of the mesh
    # [x_min, x_max] x [y_min, y_max] and put the values back on the grid.
    Z = decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    if levels is not None:
        ax.contour(xx, yy, Z, levels=levels, colors=colors, linestyles=linestyles, alpha=1)
    else:
        ax.pcolor(xx, yy, Z, cmap=cmap, alpha=0.1, edgecolors=None)

    # Plot the linear decision boundary w[0] + w[1]*x1 + w[2]*x2 = 0
    if w is not None:
        if w[2] != 0:
            x1 = np.linspace(xx.min(), xx.max(), 1000)
            x2 = -(w[0] + w[1] * x1) / w[2]
            # Keep only the part of the line that lies inside the plotted area.
            cnd = np.logical_and(x2 < yy.max(), x2 > yy.min())
            plt.plot(x1[cnd], x2[cnd], 'k-')
        elif w[1] != 0:
            # x2 does not appear in the equation: the boundary is the vertical line x1 = -w[0]/w[1].
            plt.axvline(x=-w[0] / w[1], color='k')

    # Plot also the training points. The colours are fixed, so no cmap is
    # passed (matplotlib ignores cmap, with a warning, when c is a plain colour).
    ax.scatter(X_train[y_train>0, 0], X_train[y_train>0, 1], c='r', marker='s', edgecolors='k', label='Training data', alpha=1)
    ax.scatter(X_train[y_train<=0, 0], X_train[y_train<=0, 1], c='b', marker='o', edgecolors='k', label='Training data', alpha=1)
    # and testing points if given
    if X_test is not None and y_test is not None:
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, edgecolors='k', label='Test data', marker='x', alpha=0.3)
        legend = plt.legend(loc="upper right", fontsize=16, frameon=True)
        # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
        # 3.9 removed the old name; support both.
        handles = getattr(legend, 'legend_handles', None)
        if handles is None:
            handles = legend.legendHandles
        # Draw the two training-data legend markers in black.
        for handle in handles[:2]:
            handle.set_color('k')

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    plt.axis('tight')
    plt.xlabel('x1', fontsize=16)
    plt.ylabel('x2', fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.gca().set_aspect('equal')
    plt.tight_layout()


def histogram_predict(decision_function, X_train, y_train, X_test=None, y_test=None, bins=None, normed=False):
    """Plot histograms of the decision-function values for each class.

    A new figure is created. The values `decision_function(X_train)` are
    split by label (`y_train > 0` in red, `y_train <= 0` in blue) and drawn
    as filled step histograms. When both `X_test` and `y_test` are given,
    the same is drawn for the test set with a lighter alpha.

    Parameters
    ----------
    decision_function : callable
        Called with `X_train` (and `X_test`); must return one value per row.
    X_train, y_train : arrays
        Training points and labels.
    X_test, y_test : arrays, optional
        Test points and labels.
    bins : optional
        Forwarded to `plt.hist`. Defaults to `len(y_train) // 4`, but at
        least 1 so that very small data sets do not request zero bins.
    normed : bool
        If true, the histograms are normalized (passed to `plt.hist` as
        `density`); otherwise raw counts are shown.
    """
    from matplotlib.ticker import FormatStrFormatter

    if bins is None:
        bins = max(1, len(y_train) // 4)
    plt.figure()
    plt.axes()
    pred = decision_function(X_train)
    plt.hist([pred[y_train>0], pred[y_train<=0]], bins=bins, histtype='stepfilled', density=normed, alpha=0.5, color=['r', 'b'], label=['$y=+1$', '$y=-1$'])
    if X_test is not None and y_test is not None:
        pred = decision_function(X_test)
        plt.hist([pred[y_test>0], pred[y_test<=0]], bins=bins, histtype='stepfilled', density=normed, alpha=0.3, color=['r', 'b'], label=['$y_{test}=+1$', '$y_{test}=-1$'])
    plt.xlabel("$g(x)$", fontsize=16)
    plt.ylabel("Density" if normed else "Frequency", fontsize=16)
    plt.legend(loc="upper right", fontsize=16, frameon=True)
    plt.axis('tight')
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    if not normed:
        # Counts are integers: show the tick labels without decimals.
        # (Skipped for densities, which would all be rounded to whole numbers.)
        plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%1.0f'))
    plt.tight_layout()


def show_dtree_structure(model):
    """Print a text outline of the tree stored in `model.tree_`.

    One line is printed per node, indented with one tab per depth level.
    Leaf nodes are reported as such; test nodes report the feature
    (`x1` for feature 0, `x2` for feature 1, `X[:, k]` otherwise), the
    threshold and the ids of the two child nodes.

    `model` must expose `tree_` with `node_count`, `children_left`,
    `children_right`, `feature` and `threshold` (as a fitted scikit-learn
    decision tree does).
    """
    tree = model.tree_
    n_nodes = tree.node_count
    left = tree.children_left
    right = tree.children_right
    split_feature = tree.feature
    split_value = tree.threshold

    depth = np.zeros(shape=n_nodes, dtype=np.int64)
    leaf = np.zeros(shape=n_nodes, dtype=bool)

    # Depth-first walk from the root (node 0, depth 0); each pending entry
    # is (node id, depth of that node).
    pending = [(0, 0)]
    while pending:
        node, level = pending.pop()
        depth[node] = level

        # A node whose two child ids are equal has no split: it is a leaf.
        if left[node] == right[node]:
            leaf[node] = True
            continue

        pending.append((left[node], level + 1))
        pending.append((right[node], level + 1))

    print("The binary tree structure has %s nodes and has the following tree structure:" % n_nodes)
    for node in range(n_nodes):
        indent = depth[node] * "\t"
        if leaf[node]:
            print("%snode=%s leaf node." % (indent, node))
        elif split_feature[node] == 0:
            print("%snode=%s test node: go to node %s if x1 <= %.2f else to node %s." % (indent, node, left[node], split_value[node], right[node] ))
        elif split_feature[node] == 1:
            print("%snode=%s test node: go to node %s if x2 <= %.2f else to node %s." % (indent, node, left[node], split_value[node], right[node] ))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to node %s." % (indent, node, left[node], split_feature[node], split_value[node], right[node] ))

Make training data
------------------

In [0]:
# Example 1: four hand-picked points, the corners of the unit square
X = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])
# Labels follow a logical OR of the two coordinates:
# only the origin is negative (-1), the other three corners are positive (+1).
y = np.where(X.any(axis=1), 1, -1)

In [0]:
# Example 2: two Gaussian clouds, npos positive and nneg negative points
npos, nneg = 30, 30
np.random.seed(432)  # fixed seed so the same points are drawn on every run
# Positive class: standard normal shifted to (3, 3); negative class: standard normal at the origin.
X = np.vstack((np.random.randn(npos, 2) + [3, 3], np.random.randn(nneg, 2)))
# Labels in the same order as the rows of X: npos times +1 followed by nneg times -1
y = np.repeat([1, -1], [npos, nneg])

In [0]:
# Example 3: the "two moons" toy data set generated by scikit-learn
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.3, random_state=0)
# Relabel class 0 as -1 so the labels match the -1/+1 convention of the other examples
y = np.where(y == 0, -1, y)

In [0]:
# Example 4: the "circles" toy data set generated by scikit-learn
from sklearn.datasets import make_circles
X, y = make_circles(n_samples=150, noise=0.1, random_state=0, factor=0.3)
# Relabel class 0 as -1 so the labels match the -1/+1 convention of the other examples
y = np.where(y == 0, -1, y)

Plot the training points

In [0]:
# Plot the training points: red squares for y > 0, blue circles for y <= 0
plt.figure()
ax = plt.axes()
# The colours are fixed, so no cmap is passed (matplotlib ignores cmap,
# with a warning, when c is a plain colour).
ax.scatter(X[y>0, 0], X[y>0, 1], c='r', marker='s', edgecolors='k', label='Training data', alpha=1)
ax.scatter(X[y<=0, 0], X[y<=0, 1], c='b', marker='o', edgecolors='k', label='Training data', alpha=1)
plt.xlabel('x1', fontsize=16)
plt.ylabel('x2', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.gca().set_aspect('equal')
# Leave a margin of 0.5 around the data on every side
ax.set_xlim(X[:,0].min()-0.5, X[:,0].max()+0.5)
ax.set_ylim(X[:,1].min()-0.5, X[:,1].max()+0.5)
plt.tight_layout()

Run the training
----------------

In [0]:
# Decision tree
# Fit a scikit-learn decision tree classifier to the current training data (X, y).
# NOTE(review): export_graphviz is imported but not used in the cells visible here.
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# max_depth limits how deep the tree may grow; min_samples_leaf=1 lets a leaf hold a single sample.
model = DecisionTreeClassifier(max_depth=15, min_samples_leaf=1)
model.fit(X,y)

In [0]:
# Visualize the fitted tree: map of the predicted probability over the plane,
# then a histogram of the same probability on the training points.
def positive_class_probability(points):
    """Return column 1 of `model.predict_proba(points)`.

    That column belongs to the second entry of `model.classes_`
    (presumably +1 with the -1/+1 labels used in this notebook).
    """
    return model.predict_proba(points)[:, 1]

plot2d_classification(positive_class_probability, X, y)
plt.savefig('dtree.png', transparent=True, dpi=300)
histogram_predict(positive_class_probability, X, y)
plt.savefig('hist_dtree.png', transparent=True)

In [0]:
# Print the node-by-node structure (split feature, threshold, children) of the fitted tree.
show_dtree_structure(model)