In [None]:
# Demonstration: IRIS dataset
# - fit decision tree classifier
# - inspect decision tree

# - pros and cons
# - tuning the decision tree

# Exercise: MNIST dataset

In [1]:
from sklearn.datasets import load_iris

iris = load_iris()

X = iris.data
y = iris.target

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=31)

In [41]:
X_train.shape

(112, 4)

In [42]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [43]:
import numpy as np

X_train[:, 3]

array([2.1, 1.3, 0.3, 1.9, 2. , 0.2, 1.2, 1. , 0.2, 0.2, 1.3, 1.8, 2.3,
       0.2, 0.5, 1. , 2.3, 0.2, 1.4, 0.2, 1.3, 2.1, 0.2, 0.1, 1.8, 0.4,
       2.2, 0.2, 1.5, 0.4, 1.8, 0.2, 1.5, 1.9, 0.2, 1. , 1.2, 0.2, 0.3,
       1.4, 1.3, 1.5, 2.3, 2. , 1.4, 1.6, 0.2, 2. , 1.1, 0.2, 0.1, 2.1,
       0.4, 0.2, 0.3, 1.3, 1.5, 2. , 2.3, 0.4, 2.5, 0.2, 0.2, 2.5, 1.4,
       0.2, 1.7, 1.8, 1.8, 0.1, 1.3, 2.4, 0.2, 1.8, 0.1, 1.4, 2.3, 1. ,
       0.3, 1.6, 1.3, 0.2, 0.2, 2.3, 1.8, 1.3, 1.9, 2.4, 1.5, 0.2, 1.7,
       2.3, 1.9, 1.8, 1.5, 1.3, 1.9, 2. , 1.8, 1. , 0.2, 1.2, 0.3, 0.2,
       2. , 1.2, 0.2, 1.3, 1.1, 1.8, 0.4, 1.3])

In [44]:
np.sort(X_train[:, 3])

array([0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
       0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
       0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.5,
       1. , 1. , 1. , 1. , 1. , 1.1, 1.1, 1.2, 1.2, 1.2, 1.2, 1.3, 1.3,
       1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.4, 1.4, 1.4, 1.4,
       1.4, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.7, 1.7, 1.8, 1.8,
       1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.8, 1.9, 1.9, 1.9, 1.9, 1.9,
       2. , 2. , 2. , 2. , 2. , 2. , 2.1, 2.1, 2.1, 2.2, 2.3, 2.3, 2.3,
       2.3, 2.3, 2.3, 2.3, 2.4, 2.4, 2.5, 2.5])

In [45]:
X_test.shape

(38, 4)

In [46]:
# - fit decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# 4: perfect score: investigate?
dtc_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=3)

dtc_entropy.fit(X_train, y_train)
dtc_entropy.score(X_test, y_test) # mean accuracy

0.9473684210526315

In [47]:
from sklearn.metrics import classification_report, confusion_matrix

pred_entropy = dtc_entropy.predict(X_test)

print(classification_report(y_test, pred_entropy))
print(confusion_matrix(y_test, pred_entropy))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.88      0.93        16
           2       0.85      1.00      0.92        11

   micro avg       0.95      0.95      0.95        38
   macro avg       0.95      0.96      0.95        38
weighted avg       0.96      0.95      0.95        38

[[11  0  0]
 [ 0 14  2]
 [ 0  0 11]]


In [48]:
# - inspect decision tree
# conda install python-graphviz

from sklearn.tree import export_graphviz
import graphviz

filename = 'iris_entropy.dot'
export_graphviz(dtc_entropy,
                out_file=filename,
                feature_names=iris.feature_names,
                class_names=iris.target_names, filled=True,
                rounded=True)

source = graphviz.Source.from_file(filename)
source.render(view=True)

'iris_entropy.dot.pdf'

In [72]:
dtc_gini = DecisionTreeClassifier(criterion='gini', max_depth=4,
                                  random_state=24)

dtc_gini.fit(X_train, y_train)
print(dtc_gini.score(X_test, y_test)) # mean accuracy

filename = 'iris_gini_depth4.dot'
export_graphviz(dtc_gini,
                out_file=filename,
                feature_names=iris.feature_names,
                class_names=iris.target_names, filled=True,
                rounded=True)

source = graphviz.Source.from_file(filename)
source.render(view=True)

dtc_gini = DecisionTreeClassifier(criterion='gini', max_depth=5,
                                  random_state=24)

dtc_gini.fit(X_train, y_train)
print(dtc_gini.score(X_test, y_test)) # mean accuracy

filename = 'iris_gini_depth5.dot'
export_graphviz(dtc_gini,
                out_file=filename,
                feature_names=iris.feature_names,
                class_names=iris.target_names, filled=True,
                rounded=True)

source = graphviz.Source.from_file(filename)
source.render(view=True)

1.0
1.0


'iris_gini_depth5.dot.pdf'

In [68]:
print(dtc_gini.score(X_test, y_test)) # mean accurac

1.0


In [73]:
dtc_entropy.feature_importances_

array([0.        , 0.03793136, 0.32291776, 0.63915088])

In [74]:
# Feature importance
for i in range(len(iris.feature_names)):
    print(iris.feature_names[i], dtc_entropy.feature_importances_[i])

sepal length (cm) 0.0
sepal width (cm) 0.037931356617116406
petal length (cm) 0.3229177588472077
petal width (cm) 0.639150884535676


In [None]:
# overfitting
#
# Tree-specific:
#  max depth
#  max leaf nodes
#  min_samples_leaf
#
# Machine learning:
#  Cross validation
#  Forests
#

In [79]:
# Tune decision tree using GridSearchCV (for larger datasets like MNIST)

from sklearn.model_selection import GridSearchCV
 
param_grid = {
    'max_depth': [5, 10, 15], # limit depth to avoid overfitting
    'max_leaf_nodes': [10, 15], # limit leaf nodes also for overfitting
    'min_samples_leaf': [5, 10, 15]  # sets minimum support for leaf
}

# Note: better to use RandomForestClassifier because result is more stable
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                 param_grid, verbose=True) # verbose: print more information
gs.fit(X_train, y_train)
print('best score', gs.score(X_test, y_test))
print('best parameters', gs.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
best score 1.0
best parameters {'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_leaf': 5}


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    0.0s finished


In [39]:
# https://github.com/scikit-learn/scikit-learn/blob/79bdc8f711d0af225ed6be9fdb708cea9f98a910/sklearn/tree/export.py
def export_dict(tree, feature_names=None, max_depth=None):
    """Export a decision tree in dict format.
    Parameters
    ----------
    decision_tree : decision tree classifier
        The decision tree to be exported
    feature_names : list of strings, optional (default=None)
        Names of each of the features.
    max_depth : int, optional (default=None)
        The maximum depth of the representation. If None, the tree is fully
        generated.
    Returns
    -------
    a dictionary of the format <tree> := {
        'feature' <int> | <string>,
        'threshold' : <float>,
        'impurity' : <float>,
        'n_node_samples' : <int>,
        'left' : <tree>,
        'right' : <tree>,
        'value' : [<int>],
    }
    if feature_names is provided, it is used to map feature indicies
    to feature names.  All types (including the value list) are native
    python types as opposed to numpy types to make exporting to json
    and other pythonic operations easier.
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn import tree
    >>> import json
    >>> clf = tree.DecisionTreeClassifier()
    >>> iris = load_iris()
    >>> clf = clf.fit(iris.data, iris.target)
    >>> d = export_dict(clf)
    >>> j = json.dumps(d, indent=4)
    """
    from sklearn.tree import _tree
    
    tree_ = tree.tree_

    # i is the element in the tree_ to create a dict for
    def recur(i, depth=0) :
        if max_depth is not None and depth > max_depth :
            return None
        if i == _tree.TREE_LEAF :
            return None

        feature = int(tree_.feature[i])
        threshold = float(tree_.threshold[i])
        if feature == _tree.TREE_UNDEFINED :
            feature = None
            threshold = None
            value = [map(int, l) for l in tree_.value[i].tolist()]
        else :
            value = None
            if feature_names :
                feature = feature_names[feature]

        return {
            'feature' : feature,
            'threshold' : threshold,
            'impurity' : float(tree_.impurity[i]),
            'n_node_samples' : int(tree_.n_node_samples[i]),
            'left'  : recur(tree_.children_left[i],  depth + 1),
            'right' : recur(tree_.children_right[i], depth + 1),
            'value' : value,
        }

    return recur(0)

In [40]:
d = export_dict(dtc_entropy, iris.feature_names)
d

{'feature': 'petal length (cm)',
 'threshold': 2.450000047683716,
 'impurity': 1.5834545859901241,
 'n_node_samples': 112,
 'left': {'feature': None,
  'threshold': None,
  'impurity': 0.0,
  'n_node_samples': 35,
  'left': None,
  'right': None,
  'value': [<map at 0x212fa8231d0>]},
 'right': {'feature': 'petal length (cm)',
  'threshold': 4.75,
  'impurity': 0.9998783322990061,
  'n_node_samples': 77,
  'left': {'feature': 'petal width (cm)',
   'threshold': 1.6500000953674316,
   'impurity': 0.18717625687320816,
   'n_node_samples': 35,
   'left': {'feature': None,
    'threshold': None,
    'impurity': 0.0,
    'n_node_samples': 34,
    'left': None,
    'right': None,
    'value': [<map at 0x212fa8232b0>]},
   'right': {'feature': None,
    'threshold': None,
    'impurity': 0.0,
    'n_node_samples': 1,
    'left': None,
    'right': None,
    'value': [<map at 0x212fa8230f0>]},
   'value': None},
  'right': {'feature': 'petal length (cm)',
   'threshold': 5.149999618530273,
   '

### Decision Tree Regressor



In [50]:
import pandas as pd
df = pd.read_csv('../data/auto-mpg/auto-mpg.data.txt', sep='\s+',
                names=['mpg', 'cylinders', 'displacement',
                       'horsepower', 'weight', 'acceleration', 'model_year',
                       'origin', 'car_name'],
                header=None,
                na_values=['?'])

# Attribute Information:
#    1. mpg:           continuous
#    2. cylinders:     multi-valued discrete
#    3. displacement:  continuous
#    4. horsepower:    continuous
#    5. weight:        continuous
#    6. acceleration:  continuous
#    7. model year:    multi-valued discrete
#    8. origin:        multi-valued discrete
#    9. car name:      string (mostly unique for each instance)

df.drop(['car_name'], axis=1, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [52]:
X = df.loc[:, df.columns != 'mpg']
y = df.mpg
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X_train.shape)
print(X_test.shape)

(294, 7)
(98, 7)


In [58]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=42, max_depth=5)

dtr.fit(X_train, y_train)
print(dtr.score(X_test, y_test)) # r2 score

filename = 'autompg_entropy_depth5.dot'
export_graphviz(dtr,
                feature_names=df.columns[df.columns != 'mpg'],
                out_file=filename,
                filled=True,
                rounded=True)

source = graphviz.Source.from_file(filename)
source.render(view=True)

0.8168506048851136


'autompg_entropy_depth5.dot.pdf'