## EXERCISE: Decision trees



Let's try a decision tree on Iris data.

### Train and view a tree

In [1]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

iris = load_iris()
key=', '.join(['{}={}'.format(i,name) for i,name in enumerate(iris.target_names)])

# First let's create a train and test split
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.33,
                                                    random_state=5) # so we get the same results

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# Let's fit a model
tree = DecisionTreeClassifier(max_depth=2)
_ = tree.fit(X_train, Y_train)

# Evaluate
print('Classification report ({}):\n'.format(key))
print(classification_report(Y_test, tree.predict(X_test)))

Classification report (0=setosa, 1=versicolor, 2=virginica):

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.89      0.94      0.91        17
           2       0.94      0.88      0.91        17

    accuracy                           0.94        50
   macro avg       0.94      0.94      0.94        50
weighted avg       0.94      0.94      0.94        50



In [3]:
#From http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html
def get_code(tree, feature_names, target_names, spacer_base="    "):
    """Produce psuedo-code for decision tree.

    Args
    ----
    tree -- scikit-leant DescisionTree.
    feature_names -- list of feature names.
    target_names -- list of target (class) names.
    spacer_base -- used for spacing code (default: "    ").

    Notes
    -----
    based on http://stackoverflow.com/a/30104792.
    """
    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features  = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, threshold, features, node, depth):
        spacer = spacer_base * depth
        if (threshold[node] != -2):
            print(spacer + "if ( " + features[node] + " <= " + \
                  str(threshold[node]) + " ) {")
            if left[node] != -1:
                    recurse(left, right, threshold, features,
                            left[node], depth+1)
            print(spacer + "}\n" + spacer +"else {")
            if right[node] != -1:
                    recurse(left, right, threshold, features,
                            right[node], depth+1)
            print(spacer + "}")
        else:
            target = value[node]
            for i, v in zip(np.nonzero(target)[1],
                            target[np.nonzero(target)]):
                target_name = target_names[i]
                target_count = int(v)
                print(spacer + "return " + str(target_name) + \
                      " ( " + str(target_count) + " examples )")

    recurse(left, right, threshold, features, 0, 0)
    
print('Decision tree:\n')
get_code(tree, iris.feature_names, iris.target_names)

Decision tree:

if ( petal length (cm) <= 2.449999988079071 ) {
    return setosa ( 34 examples )
}
else {
    if ( petal width (cm) <= 1.75 ) {
        return versicolor ( 33 examples )
        return virginica ( 3 examples )
    }
    else {
        return virginica ( 30 examples )
    }
}


## EXERCISE: Model selection on test data





### McNemar's test

McNemar's test is [recommended when we have a single test split](http://sci2s.ugr.es/keel/pdf/algorithm/articulo/dietterich1998.pdf).

Under H0, the two algorithms should have the same error rate.

In [4]:
from scipy.stats import chi2

def mcnemar(x, y):
    n1 = np.sum(x < y)
    n2 = np.sum(x > y)
    stat = (np.abs(n1-n2)-1)**2 / (n1+n2)
    df = 1
    pval = chi2.sf(stat,1)
    return stat, pval

### TODO Compare classifiers

- Choose the decision tree max_depth in [2..6], criterion in ['entropy', 'gini'] and splitter in ['best', 'random']. What are the best parameters? Print out all grid scores to sanity check the selection. Is there a unique best set of parameters?
- Use `np.array` create `l_yn` and `t_yn` arrays showing respectively for logistic regression and decision tree whether each test instance is predicted correctly (`1`) or incorrectly (`0`). Are the classifiers significantly different at p<=0.05 according to McNemar's test?(use the logistic regression code from previous week)
- Which classifier is significantly better at p<=0.05 using paired t-test?(use f-score measure)

In [6]:
from sklearn.model_selection import GridSearchCV

para = [
    {'max_depth': [2, 3, 4, 5, 6], 
     'criterion': ['entropy', 'gini'],
     'splitter': ['best', 'random']
    }
]

tree = DecisionTreeClassifier()
clf = GridSearchCV(tree, para, cv = 10, scoring = 'accuracy')
clf.fit(X_train, Y_train) 
print('Best Para for accuracy', clf.best_params_)
print(classification_report(Y_train, clf.predict(X_train)))

Best Para for accuracy {'criterion': 'entropy', 'max_depth': 2, 'splitter': 'best'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       0.92      1.00      0.96        33
           2       1.00      0.91      0.95        33

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



In [7]:
# compare log and decision tree
# output prediction based on two algorithms
# output the p value using McNemar test

from sklearn.linear_model import LogisticRegression 
# fit logistic regression 
logit = LogisticRegression()
_ = logit.fit(X_train, Y_train) 
l_pred = logit.predict(X_test) 
l_yn = [int(p==t) for p, t in zip(l_pred, Y_test)]

# fit tree
tree = DecisionTreeClassifier()
_ = tree.fit(X_train, Y_train) 
t_pred = tree.predict(X_test) 
t_yn = [int(p==t) for p, t in zip(t_pred, Y_test)]

stat, pval = mcnemar(l_yn, t_yn)

print(l_yn, t_yn)

print('P value is ', pval) 
print('stat', stat)

[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
P value is  1.0
stat 0.0


In [11]:
from sklearn.metrics import f1_score
l_predict = []
t_predict = []
for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size = 0.3, random_state = 0)
    logit = LogisticRegression(max_iter = 500, random_state = 0) 
    _ = logit.fit(X_train, Y_train) 
    l_predict.append(f1_score(Y_test, logit.predict(X_test), average = 'macro'))
    
    tree = DecisionTreeClassifier(max_iter = 500, random_state = 0) 
    _ = tree.fit(X_train, Y_train) 
    t_predict.append(f1_score(Y_test, tree.predict(X_test), average = 'macro'))
    
print(l_predict)
print(t_predict) 

import scipy.stats as stats
stats.ttest_rel(l_predict, t_predict)

TypeError: __init__() got an unexpected keyword argument 'max_iter'