# <center>IEE 520: Fall 2019</center>

# <center> Decision Tree (10/24/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

**NOTE: TO SUPPORT INTERACTIVE PLOTS IN JUPYTER LAB, RUN**

conda install -c conda-forge nodejs

jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
# For compatibility with Python 2
from __future__ import print_function

# To load datasets
from sklearn import datasets

# To import the models (Decision Tree Classifier and Regressor)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# To display a tree
from sklearn.tree import plot_tree

# To measure accuracy
from sklearn import metrics

from sklearn.model_selection import cross_validate, KFold

# To support plots
from ipywidgets import interact
import ipywidgets as widgets
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import numpy as np
import pandas as pd

import math

# To display all the plots inline
%matplotlib inline

Custom function to plot trees, taken from scikit-learn/sklearn/tree/export.py

In [None]:
# Enlarge the default figure size (width, height in inches) for every plot below
plt.rcParams.update({"figure.figsize": (30, 10)})

In [None]:
# To import the scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer


class DummyScaler:
    """Pass-through scaler: same ``fit``/``transform`` calls, no rescaling.

    It can be used wherever a scaler object is expected but the data
    should be left exactly as it is.
    """

    def fit(self, data):
        """Accept *data* without learning anything from it.

        Returns the scaler itself, so calls can be chained as in
        ``DummyScaler().fit(X).transform(X)``.
        """
        return self

    def transform(self, data):
        """Return *data* unchanged (the same object, not a copy)."""
        return data

    def fit_transform(self, data):
        """Fit on *data* and return it unchanged, in a single call."""
        return self.fit(data).transform(data)

def create_scaler_dummy():
    """Build a new pass-through scaler (a fresh ``DummyScaler`` per call)."""
    scaler = DummyScaler()
    return scaler
    
def create_scaler_standard():
    """Build a new ``StandardScaler`` with its default settings."""
    scaler = StandardScaler()
    return scaler

def create_scaler_minmax():
    """Build a new ``MinMaxScaler`` with its default settings."""
    scaler = MinMaxScaler()
    return scaler

def create_scaler_binarizer():
    """Build a new ``Binarizer`` with its default settings."""
    return Binarizer()


# Backward-compatible alias: the factory was originally published under this
# misspelled name, so it is kept for any code that already calls it.
crete_scaler_binarizer = create_scaler_binarizer

## <center>Toy dataset</center>

### <center>Prepare the dataset</center>

The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper "The use of multiple measurements in taxonomic problems" as an example of linear discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

In [None]:
# Load the feature matrix X and the class labels y as plain arrays.
# `return_X_y` is passed by keyword because recent scikit-learn releases
# no longer accept it positionally.
X, y = datasets.load_iris(return_X_y=True)

Let's trim the data to keep just the first 2 variables (length and width of the sepals).
Also, let's remove repeated instances (just to make the visualization more tractable).

In [None]:
# Keep only the first two feature columns and drop duplicate rows in one step;
# `indxs` records, for every unique row, the position of its first occurrence
X_trimmed, indxs = np.unique(X[:, :2], return_index=True, axis=0)
# Pick the labels found at those same positions
y_trimmed = y[indxs]

In [None]:
def plot_iris(X, y):
    """Scatter-plot the samples in X, one colour per class label in y.

    Column 0 of X goes on the horizontal axis and column 1 on the vertical
    axis. Samples labelled 0, 1 and 2 are drawn as red, green and blue
    circles respectively, and the figure is shown immediately.
    """
    alpha = 1.0
    # The position in the tuple is the class label, the entry is the
    # matplotlib format string used for that class
    for label, fmt in enumerate(('ro', 'go', 'bo')):
        points = X[y == label, :]
        plt.plot(points[:, 0], points[:, 1], fmt, alpha=alpha)
    plt.xlabel('Sepal length (cm)')
    plt.ylabel('Sepal width (cm)')
    plt.title('IRIS dataset')
    plt.show()


plot_iris(X_trimmed, y_trimmed)

### <center>Decision Tree Classifier</center>

You can find a full list of parameters here:

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
# The data is captured in a closure, so the returned callback only exposes
# the parameters that are meant to be tuned interactively
def create_plot_dt_iris(_X, _y):
    """Return a callback that fits and draws a decision tree on (_X, _y).

    The returned function takes ``max_depth``, ``min_samples_split`` and
    ``expand`` (extra margin added around the data range), fits a
    ``DecisionTreeClassifier`` and shows two panels: the predicted class
    probabilities over the feature plane with the samples on top, and the
    fitted tree itself.
    """
    X, y = _X, _y
    # Per-class subsets are computed once and reused on every redraw
    class_points = [X[y == label, :] for label in (0, 1, 2)]
    class_colors = ('r', 'g', 'b')

    def plot_dt_iris(max_depth=1, min_samples_split=2, expand=3.1):
        model = DecisionTreeClassifier(max_depth=max_depth,
                                       min_samples_split=min_samples_split,
                                       random_state=520)
        model.fit(X, y)

        _, (ax1, ax2) = plt.subplots(1, 2)

        # Fully transparent diagonal line: it only serves to make matplotlib
        # choose axis limits covering the data range widened by `expand`
        x_bounds = (np.min(X[:, 0]) - expand, np.max(X[:, 0]) + expand)
        y_bounds = (np.min(X[:, 1]) - expand, np.max(X[:, 1]) + expand)
        ax1.plot(x_bounds, y_bounds, alpha=0.0)
        xlim = ax1.get_xlim()
        ylim = ax1.get_ylim()

        # Evaluate the class probabilities on a 100 x 50 grid over the axes
        grid_x, grid_y = np.meshgrid(np.linspace(xlim[0], xlim[1], 100),
                                     np.linspace(ylim[0], ylim[1], 50))
        grid_points = np.vstack([grid_x.ravel(), grid_y.ravel()]).T
        proba = model.predict_proba(grid_points)
        # One probability vector per grid cell; with three classes the
        # vector is drawn by imshow as an RGB colour
        Z = proba.reshape((grid_x.shape[0], grid_x.shape[1], -1))
        ax1.imshow(Z, interpolation='bilinear', origin='lower',
                   extent=[xlim[0], xlim[1], ylim[0], ylim[1]])

        for points, color in zip(class_points, class_colors):
            ax1.scatter(points[:, 0], points[:, 1], s=80, linewidths=2,
                        color=color, edgecolors='w')

        ax1.set_xlabel('Sepal length (cm)')
        ax1.set_ylabel('Sepal width (cm)')
        title = ('Decision Tree Classifier: ' +
                 'maximum depth:%s, minimal samples for split:%s' %
                 (str(max_depth), str(min_samples_split)))
        ax1.set_title(title)

        # Recolour every tree node with its own class distribution: the
        # numbers inside the last [...] of the node label are normalised to
        # sum to one and used as the background colour
        for annotation in plot_tree(model, ax=ax2, feature_names=['x', 'y'],
                                    filled=True):
            text = annotation.get_text()
            tail = text[text.rfind('[') + 1:]
            inner = tail[:tail.find(']')]
            counts = [float(item) for item in inner.split(',')]
            total = sum(counts)
            annotation.set_color('w')
            annotation.set_backgroundcolor([c / total for c in counts])
        plt.show()

    return plot_dt_iris

In [None]:
# One slider per parameter of the plotting callback
max_depth_widget = widgets.IntSlider(
    description='Max depth:',
    value=1, min=1, max=15, step=1,
    continuous_update=False)
min_samples_split_widget = widgets.IntSlider(
    description='Min split:',
    value=2, min=2, max=15, step=1,
    continuous_update=False)
expand_widget = widgets.FloatSlider(
    description='Expand:',
    value=0.0, min=0, max=1, step=0.1,
    continuous_update=False)
# Wire the sliders to a callback bound to the trimmed iris data
interact(create_plot_dt_iris(X_trimmed, y_trimmed),
         expand=expand_widget,
         min_samples_split=min_samples_split_widget,
         max_depth=max_depth_widget)

## <center>Real-world dataset</center>

### <center>Load the dataset</center>

Let's consider the following dataset:
http://staff.pubhealth.ku.dk/~tag/Teaching/share/data/Bodyfat.html

The data contain estimates of the percentage of body fat determined by underwater weighing and various body circumference measurements for 252 men.

Accurate measurement of body fat is inconvenient/costly and it is desirable to have easy methods of estimating body fat that are not inconvenient/costly.

The variables in the data set are:

* Density determined from underwater weighing
* Percent body fat from Siri's (1956) equation
* Age (years)
* Weight (lbs)
* Height (inches)
* Neck circumference (cm)
* Chest circumference (cm)
* Abdomen 2 circumference (cm)
* Hip circumference (cm)
* Thigh circumference (cm)
* Knee circumference (cm)
* Ankle circumference (cm)
* Biceps (extended) circumference (cm)
* Forearm circumference (cm)
* Wrist circumference (cm)

In [None]:
# Read the body-fat CSV directly from its URL into a DataFrame
data = pd.read_csv(
    filepath_or_buffer='http://staff.pubhealth.ku.dk/~tag/Teaching/share/data/Bodyfat.csv',
)

In [None]:
# Quick look at the loaded table
print(data)

### <center>Decision Tree Regressor</center>

First, let's consider a very simple model: the input is density, the output is body fat.

In [None]:
# Work on the raw array behind the DataFrame
vals = data.values
# Column 0 is the only input (the slice keeps it 2-D); column 1 is the target.
# Presumably these are density and body fat, as in the variable list above.
X_simple, y_simple = vals[:, :1], vals[:, 1]

In [None]:
def rmse_loss(y_true, y_pred):
    """Return the root of the mean squared error between y_true and y_pred."""
    mse = metrics.mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)

# The scorer is deliberately declared with greater_is_better=True so that the
# reported values keep their sign and can be read directly as RMSE
rmse_score = metrics.make_scorer(rmse_loss, greater_is_better=True)

# Tree depths to evaluate, and the mean scores collected for each of them
max_depths = list(range(1, 15))
train_score = np.zeros(len(max_depths))
test_score = np.zeros(len(max_depths))

for i, depth in enumerate(max_depths):
    model = DecisionTreeRegressor(max_depth=depth, random_state=520)
    # 10-fold cross-validation, scored on both the training and held-out folds
    cv_results = cross_validate(model, X_simple, y_simple,
                                scoring=rmse_score, cv=10,
                                return_train_score=True)
    train_score[i] = cv_results['train_score'].mean()
    test_score[i] = cv_results['test_score'].mean()

# Train and test RMSE as a function of the maximum depth
plt.plot(max_depths, train_score, 'b', label='Train score')
plt.plot(max_depths, test_score, 'r', label='Test score')
plt.xticks(ticks=range(min(max_depths), max(max_depths) + 1))
plt.xlabel('Max depth')
plt.ylabel('RMSE')
plt.title('Learning curve')
plt.legend()
plt.show()

In [None]:
# The data and the fold splitter live in a closure, so the returned callback
# only needs the tree depth
def create_plot_bodyfat_simple(_X, _y):
    """Return a callback plotting cross-validated predictions for (_X, _y).

    The returned function takes ``max_depth``, predicts every sample with a
    ``DecisionTreeRegressor`` trained on the other folds of a shuffled
    5-fold split, and shows a scatter plot of predicted against actual
    values.
    """
    X, y = _X, _y
    kfold = KFold(n_splits=5, shuffle=True, random_state=520)

    def plot_bodyfat_simple(max_depth=1):
        # Out-of-fold predictions: each entry is filled in by the model
        # that did not see that sample during training
        y_hat = np.zeros(y.shape)
        for train_idx, test_idx in kfold.split(X, y):
            model = DecisionTreeRegressor(max_depth=max_depth,
                                          random_state=520)
            model.fit(X[train_idx], y[train_idx])
            y_hat[test_idx] = model.predict(X[test_idx])

        plt.scatter(y, y_hat)
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.title('Predicted vs actual, max depth: %s' % str(max_depth))
        plt.show()

    return plot_bodyfat_simple

In [None]:
# Depth slider for the single-feature body-fat model
max_depth_widget = widgets.IntSlider(
    description='Max depth:',
    value=1, min=1, max=15, step=1,
    continuous_update=False)
# Wire the slider to a callback bound to the single-feature data
interact(create_plot_bodyfat_simple(X_simple, y_simple),
         max_depth=max_depth_widget)

Second, let's consider a more realistic model: the input contains all the measurements, the output is body fat.

In [None]:
# Work on the raw array behind the DataFrame
vals = data.values
# Every column from index 2 onwards is an input; column 1 is the target
X_complex, y_complex = vals[:, 2:], vals[:, 1]

In [None]:
# Tree depths to evaluate, and the mean scores collected for each of them
max_depths = list(range(1, 15))
train_score = np.zeros(len(max_depths))
test_score = np.zeros(len(max_depths))

for i, depth in enumerate(max_depths):
    model = DecisionTreeRegressor(max_depth=depth, random_state=520)
    # 10-fold cross-validation, scored on both the training and held-out folds
    cv_results = cross_validate(model, X_complex, y_complex,
                                scoring=rmse_score, cv=10,
                                return_train_score=True)
    train_score[i] = cv_results['train_score'].mean()
    test_score[i] = cv_results['test_score'].mean()

# Train and test RMSE as a function of the maximum depth
plt.plot(max_depths, train_score, 'b', label='Train score')
plt.plot(max_depths, test_score, 'r', label='Test score')
plt.xticks(ticks=range(min(max_depths), max(max_depths) + 1))
plt.xlabel('Max depth')
plt.ylabel('RMSE')
plt.title('Learning curve')
plt.legend()
plt.show()

In [None]:
# The data and the fold splitter live in a closure, so the returned callback
# only needs the tree depth
def create_plot_bodyfat_complex(_X, _y):
    """Return a callback plotting cross-validated predictions for (_X, _y).

    The returned function takes ``max_depth``, predicts every sample with a
    ``DecisionTreeRegressor`` trained on the other folds of a shuffled
    5-fold split, and shows a scatter plot of predicted against actual
    values.
    """
    X, y = _X, _y
    kfold = KFold(n_splits=5, shuffle=True, random_state=520)

    def plot_bodyfat_complex(max_depth=1):
        # Out-of-fold predictions: each entry is filled in by the model
        # that did not see that sample during training
        y_hat = np.zeros(y.shape)
        for train_idx, test_idx in kfold.split(X, y):
            model = DecisionTreeRegressor(max_depth=max_depth,
                                          random_state=520)
            model.fit(X[train_idx], y[train_idx])
            y_hat[test_idx] = model.predict(X[test_idx])

        plt.scatter(y, y_hat)
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.title('Predicted vs actual, max depth: %s' % str(max_depth))
        plt.show()

    return plot_bodyfat_complex

In [None]:
# Depth slider for the all-measurements body-fat model
max_depth_widget = widgets.IntSlider(
    description='Max depth:',
    value=1, min=1, max=15, step=1,
    continuous_update=False)
# Wire the slider to a callback bound to the all-measurements data
interact(create_plot_bodyfat_complex(X_complex, y_complex),
         max_depth=max_depth_widget)