## Worksheet 05
***

In [1]:
from sklearn.datasets import load_digits

import numpy as np

import pandas as pd

from decision_tree import DecisionTree
from density_tree import DensityTree
from generative_classifier import GenerativeClassifier

%load_ext autoreload
%autoreload 2

### Density Tree and Decision Tree

In [2]:
# Load the scikit-learn digits dataset and keep the sample matrix and label vector.
digits = load_digits()
data, target = digits['data'], digits['target']

# data_subsets[d] holds exactly the rows of `data` whose label equals digit d (d = 0..9).
data_subsets = [data[target == digit] for digit in range(10)]

In [3]:
# Stop-criterion values to compare; each one gets a fresh pair of classifiers.
n_min = [20, 10]
for n in n_min:
    # Generative classifier (built on density trees), fitted on the full data set.
    gc = GenerativeClassifier()
    gc.fit(data, target, n)

    # Discriminative classifier (decision tree), fitted on the same data.
    dt = DecisionTree()
    dt.fit(data, target, n)

    # Training-set confusion matrices in percent: row = true digit, column = predicted digit.
    # np.empty is safe here because every row is overwritten in the loop below.
    confusion_gc = np.empty((10, 10))
    confusion_dt = np.empty((10, 10))
    for i, subset in enumerate(data_subsets):
        # gc.predict is used directly as a class label.
        predictions = np.array([gc.predict(sample) for sample in subset])
        confusion_gc[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100
        # dt.predict is reduced with argmax -- presumably it returns per-class scores; confirm.
        predictions = np.array([np.argmax(dt.predict(sample)) for sample in subset])
        confusion_dt[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100

    # NOTE(review): the second title says "10 instances of DecisionTree", but only a single
    # DecisionTree is fitted above -- confirm the intended wording.
    reports = (
        ('Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min={}', confusion_gc),
        ('Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min={}', confusion_dt),
    )
    for title, matrix in reports:
        print('------------------------------------------------------------------------------------------------------------------')
        print(title.format(n))

        table = pd.DataFrame(data=matrix, index=range(10), columns=range(10))
        display(table.rename_axis('G-Truth/Predicted', axis='columns'))

------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,79.67033,1.648352,0.0,0.0,0.0,0.0,2.747253,15.934066,0.0
2,0.0,4.519774,57.062147,5.649718,0.0,0.564972,0.0,0.0,32.20339,0.0
3,0.0,3.278689,0.0,62.295082,0.0,4.918033,0.0,2.73224,24.043716,2.73224
4,0.0,1.657459,0.0,0.0,75.690608,0.552486,0.0,20.441989,0.552486,1.104972
5,0.0,4.395604,0.0,2.747253,0.549451,69.78022,0.0,9.340659,10.43956,2.747253
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.558659,0.558659,0.0,0.0,98.324022,0.558659,0.0
8,0.0,13.793103,1.149425,0.574713,0.0,1.724138,0.0,2.298851,80.45977,0.0
9,0.0,5.555556,1.111111,15.0,2.777778,3.888889,0.0,5.555556,7.222222,58.888889


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,97.752809,0.0,0.0,0.0,0.561798,0.0,0.0,0.0,0.561798,1.123596
1,0.0,93.406593,0.549451,1.648352,1.648352,0.549451,0.0,0.0,0.549451,1.648352
2,1.129944,2.259887,91.525424,0.564972,0.564972,0.0,0.0,0.564972,3.389831,0.0
3,0.0,2.185792,0.0,90.710383,0.546448,2.73224,0.0,0.0,1.639344,2.185792
4,1.104972,0.552486,1.657459,0.0,87.292818,0.552486,1.104972,5.524862,1.657459,0.552486
5,1.098901,2.197802,0.0,2.747253,0.549451,83.516484,0.549451,4.945055,1.648352,2.747253
6,0.552486,1.104972,0.552486,1.104972,0.552486,3.314917,90.607735,0.0,0.552486,1.657459
7,0.0,0.558659,0.0,2.234637,3.910615,1.675978,0.0,88.826816,1.675978,1.117318
8,0.574713,8.62069,0.574713,4.597701,2.298851,1.724138,1.149425,2.298851,74.137931,4.022989
9,0.0,2.777778,0.0,3.888889,0.555556,5.0,0.0,2.222222,3.888889,81.666667


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,86.813187,3.296703,0.0,0.0,0.0,0.0,0.0,8.791209,1.098901
2,0.0,11.299435,69.491525,2.259887,0.0,0.0,0.0,0.0,16.949153,0.0
3,0.0,1.092896,12.568306,65.57377,0.0,2.185792,0.0,2.185792,14.754098,1.639344
4,0.0,1.104972,0.0,0.0,83.977901,0.0,0.0,13.812155,1.104972,0.0
5,0.0,1.098901,0.549451,18.131868,0.549451,71.428571,0.0,0.549451,7.142857,0.549451
6,0.0,0.552486,0.552486,0.0,0.0,0.0,98.895028,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,4.469274,1.675978,0.0,92.73743,1.117318,0.0
8,0.0,6.321839,8.62069,0.0,0.0,0.0,0.0,1.149425,83.908046,0.0
9,0.0,2.222222,1.666667,10.555556,2.222222,3.888889,0.0,2.222222,12.777778,64.444444


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,98.314607,0.0,0.0,0.0,0.0,0.0,1.123596,0.0,0.561798,0.0
1,0.0,93.956044,1.648352,1.098901,1.098901,0.0,0.549451,0.549451,0.549451,0.549451
2,1.129944,0.564972,92.090395,2.824859,0.0,0.0,0.0,0.564972,2.259887,0.564972
3,0.546448,2.185792,1.092896,90.163934,0.0,1.092896,0.0,0.546448,2.185792,2.185792
4,0.552486,1.104972,0.552486,0.0,97.237569,0.0,0.0,0.0,0.0,0.552486
5,0.549451,0.0,0.549451,2.747253,2.197802,90.10989,0.549451,0.0,2.747253,0.549451
6,1.104972,0.552486,0.0,0.552486,0.552486,0.552486,96.685083,0.0,0.0,0.0
7,0.0,0.0,1.675978,1.675978,1.675978,0.0,0.0,93.296089,0.558659,1.117318
8,0.574713,2.298851,4.022989,1.724138,0.0,2.298851,0.574713,0.0,86.781609,1.724138
9,1.111111,1.111111,1.111111,3.333333,0.555556,1.111111,0.0,1.111111,2.222222,88.333333


The diagonals of the confusion matrices give the per-class accuracies (in percent) of the predicted targets; the off-diagonal elements are the misclassifications. The discriminative classifier with a decision tree performs better than the generative classifier with density trees for most digits (digits 0 and 6 are exceptions in the runs above), reflecting the fact that accurate generative modeling is harder than discriminative modeling.

Decreasing n_min increases the training accuracy of both classifiers, but this does not imply that the test accuracy would also increase.

Since the success rate depends strongly on the random selection of feature subsets when searching for the optimal split in each node, the results can change significantly when the experiments are repeated.

If all leaves of the decision tree are trained to purity (by setting n_min=0), it makes sense that the training accuracy is 100%. However, this does not mean that the test error decreases as well -- in fact, it will typically increase due to overfitting.

In [4]:
# Fit the discriminative classifier (decision tree) with n_min=0.
dt = DecisionTree()
dt.fit(data, target, n_min=0)

# Training-set confusion matrix in percent: row = true digit, column = predicted digit.
confusion_dt = np.zeros((10, 10))
for i, subset in enumerate(data_subsets):
    # Predictions of the decision tree; argmax turns each dt.predict output into a label.
    predictions = np.array([np.argmax(dt.predict(sample)) for sample in subset])
    confusion_dt[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100

# NOTE(review): the title says "10 instances of Decision Tree", but a single DecisionTree
# is fitted above -- confirm the intended wording.
print('------------------------------------------------------------------------------------------------------------------')
print('Confusion Matrix for Discriminative Classifier using 10 instances of Decision Tree')

table = pd.DataFrame(data=confusion_dt, index=range(10), columns=range(10))
display(table.rename_axis('G-Truth/Predicted', axis='columns'))

Confusion Matrix for Discriminative Classifier using 10 instances of Decision Tree
----------------------------------------------------------------------------------


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


### Density Forest and Decision Forest