## Worksheet 05
***

In [1]:
from sklearn.datasets import load_digits

import numpy as np

import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from decision_tree import DecisionTree
from density_tree import DensityTree
from decision_forest import DecisionForest
from generative_classifier import (GenerativeClassifier, GenerativeClassifierDensityForest)

%load_ext autoreload
%autoreload 2

### Density Tree and Decision Tree

In [2]:
# Load the scikit-learn digits dataset and split it into features and labels
digits = load_digits()
data, target = digits['data'], digits['target']

# Group the samples by label: data_subsets[d] holds every row whose target is d
data_subsets = []
for digit in range(10):
    data_subsets.append(data[target == digit])

In [3]:
# Stop-criterion values to compare; each one is passed to fit() as its third argument
n_min = [20, 10]
for n in n_min:
    # Generative model (density trees), fitted on the full labelled data set
    gcdt = GenerativeClassifier()
    gcdt.fit(data, target, n)

    # Discriminative model (decision tree), fitted on the same data
    dcdt = DecisionTree()
    dcdt.fit(data, target, n)

    # Training-set confusion matrices: row = true digit, column = predicted digit,
    # each entry is the percentage of that row's samples
    confusion_gcdt = np.empty((10, 10))
    confusion_dcdt = np.empty((10, 10))
    for i, subset in enumerate(data_subsets):
        # The generative classifier's predict() result is used as the label directly
        gen_labels = np.array([gcdt.predict(sample) for sample in subset])
        gen_counts = np.bincount(gen_labels, minlength=10)
        confusion_gcdt[i, :] = gen_counts / len(subset) * 100
        # The decision tree's predict() result is reduced with argmax to get the label
        disc_labels = np.array([np.argmax(dcdt.predict(sample)) for sample in subset])
        disc_counts = np.bincount(disc_labels, minlength=10)
        confusion_dcdt[i, :] = disc_counts / len(subset) * 100

    separator = '------------------------------------------------------------------------------------------------------------------'
    axis_label = 'G-Truth/Predicted'

    print(separator)
    print('Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min={}'.format(n))

    gcdt_table = pd.DataFrame(data=confusion_gcdt, index=range(10), columns=range(10))
    display(gcdt_table.rename_axis(axis_label, axis='columns'))

    print(separator)
    print('Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min={}'.format(n))

    dcdt_table = pd.DataFrame(data=confusion_dcdt, index=range(10), columns=range(10))
    display(dcdt_table.rename_axis(axis_label, axis='columns'))

------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,98.876404,0.0,0.0,0.0,1.123596,0.0,0.0,0.0,0.0,0.0
1,0.0,74.175824,0.549451,0.0,4.395604,0.0,0.0,1.098901,14.835165,4.945055
2,0.0,9.039548,48.022599,15.254237,0.0,1.694915,0.0,0.0,24.293785,1.694915
3,0.0,3.825137,1.639344,56.830601,0.0,2.185792,0.0,2.73224,26.229508,6.557377
4,0.0,2.762431,0.0,0.0,81.21547,1.104972,0.0,12.154696,0.0,2.762431
5,0.0,1.648352,0.0,3.846154,0.0,75.824176,0.0,4.945055,6.043956,7.692308
6,0.0,0.552486,0.0,0.0,0.0,0.0,99.447514,0.0,0.0,0.0
7,0.0,1.117318,0.0,0.0,1.675978,0.0,0.0,94.413408,0.0,2.793296
8,0.0,7.471264,0.574713,2.298851,0.0,1.724138,0.0,4.022989,83.333333,0.574713
9,0.0,1.666667,0.0,15.555556,0.0,0.0,0.0,6.111111,11.111111,65.555556


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,92.696629,0.561798,0.0,0.0,1.685393,3.370787,1.123596,0.0,0.0,0.561798
1,0.0,86.263736,1.648352,0.549451,2.747253,0.0,0.549451,2.197802,4.945055,1.098901
2,0.0,6.779661,80.79096,0.564972,1.129944,0.0,0.0,2.824859,6.779661,1.129944
3,0.0,0.0,2.185792,77.04918,0.546448,0.546448,0.0,3.278689,7.103825,9.289617
4,0.552486,0.552486,0.0,0.0,95.58011,1.104972,1.104972,1.104972,0.0,0.0
5,0.0,0.549451,0.0,2.197802,1.648352,81.318681,2.747253,3.296703,2.747253,5.494505
6,0.552486,3.867403,0.552486,0.0,1.104972,1.104972,92.265193,0.0,0.552486,0.0
7,0.558659,0.558659,0.0,0.0,1.117318,1.675978,0.0,92.178771,2.234637,1.675978
8,0.0,5.747126,3.448276,2.298851,1.724138,0.574713,3.448276,6.321839,75.862069,0.574713
9,1.111111,1.666667,1.111111,4.444444,1.111111,0.555556,0.555556,3.888889,7.222222,78.333333


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityTree with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,99.438202,0.0,0.0,0.0,0.561798,0.0,0.0,0.0,0.0,0.0
1,0.0,74.725275,1.098901,0.549451,1.648352,1.098901,0.0,0.0,17.032967,3.846154
2,0.0,4.519774,44.067797,22.59887,0.0,0.564972,0.0,0.0,28.248588,0.0
3,0.0,2.73224,2.185792,71.038251,0.0,3.825137,0.0,2.185792,14.754098,3.278689
4,0.0,0.0,0.0,0.0,92.265193,1.657459,0.0,4.972376,0.552486,0.552486
5,0.0,0.0,0.0,0.549451,0.0,89.010989,0.0,1.098901,4.945055,4.395604
6,0.0,0.0,0.0,0.0,0.0,0.552486,98.342541,0.0,1.104972,0.0
7,0.0,1.117318,0.0,0.0,4.469274,2.234637,0.0,90.502793,1.675978,0.0
8,0.0,5.747126,0.0,4.022989,1.149425,2.873563,0.0,0.574713,85.057471,0.574713
9,0.0,1.111111,0.0,14.444444,3.888889,1.666667,0.0,5.555556,7.222222,66.111111


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionTree with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,99.438202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561798
1,0.0,92.307692,2.197802,0.549451,0.0,1.098901,0.0,0.0,3.296703,0.549451
2,0.564972,3.389831,94.350282,0.564972,0.0,0.564972,0.0,0.0,0.564972,0.0
3,0.546448,1.092896,2.185792,90.163934,0.546448,1.092896,0.0,0.546448,1.092896,2.73224
4,0.0,1.657459,1.104972,1.104972,92.81768,0.552486,0.552486,1.657459,0.0,0.552486
5,0.549451,0.0,1.098901,0.0,0.549451,93.956044,2.197802,0.0,0.0,1.648352
6,0.552486,0.0,1.657459,0.0,0.0,2.209945,93.922652,0.0,1.657459,0.0
7,1.117318,0.0,2.793296,0.558659,0.0,0.0,0.0,94.413408,1.117318,0.0
8,1.724138,1.724138,2.298851,2.298851,1.724138,0.574713,0.574713,0.0,89.08046,0.0
9,0.0,0.0,2.777778,1.666667,0.0,0.0,0.0,0.555556,3.333333,91.666667


The diagonals of the confusion matrices give the per-digit prediction accuracies (in percent); the off-diagonal elements are the error cases. The discriminative classifiers with decision trees perform better than the generative classifiers with density trees on most digits (digits 0 and 6 are the main exceptions), reflecting the fact that accurate generative modeling is harder than discriminative modeling.

Decreasing n_min increases the training accuracy of both classifiers, but this does not imply that the test accuracy would also increase.

Since the success rate depends heavily on the random selection of feature subsets when searching for the optimal split in each node, the results can change significantly when the experiments are repeated.

If all leaves of the decision tree are trained to purity (by setting n_min=0), it makes sense that the training accuracy is 100%. However, this does not mean that the test error decreases as well -- in fact, it will typically increase due to overfitting.

In [4]:
# Train discriminative classifier
dcdt = DecisionTree()
dcdt.fit(data, target, n_min=0)

    # Predict and compute full training error confusion matrix
confusion_dcdt = np.zeros((10,10))
# For each target subset
for i in range(10):
    # Predictions generative classifier
    predictions = np.array([np.argmax(dcdt.predict(j)) for j in data_subsets[i]])
    confusion_dcdt[i, :] = np.bincount(predictions,minlength=10)/len(data_subsets[i])*100

print('------------------------------------------------------------------------------------------------------------------')
print('Confusion Matrix for Discriminative Classifier using 10 instances of Decision Tree')

display(
    pd.DataFrame(data = confusion_dcdt, index =range(10), columns =range(10) )
    .rename_axis('G-Truth/Predicted', axis = 'columns')
)

------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of Decision Tree


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


### Density Forest and Decision Forest

In [5]:
# Ensemble size handed to every constructor below. For scikit-learn this is
# n_estimators; for the project classes it is presumably the number of trees
# per forest -- TODO confirm against decision_forest / generative_classifier.
n_trees = 20

# n_min is the list of stop-criterion values defined in the single-tree cell.
for n in n_min:
    # Train generative classifier with density forest
    gcdf = GenerativeClassifierDensityForest(n_trees)
    gcdf.fit(data, target, n)

    # Train discriminative classifier
    dcdf = DecisionForest(n_trees)
    dcdf.fit(data, target, n)

    # Train sklearn's random forest as a reference; n_estimators is passed by
    # keyword so the meaning of the first argument is explicit
    rfc = RandomForestClassifier(n_estimators=n_trees, min_samples_split=n)
    rfc.fit(data, target)

    # Training-set confusion matrices: row = true digit, column = predicted digit,
    # each entry is the percentage of that row's samples
    confusion_gcdf = np.empty((10, 10))
    confusion_dcdf = np.empty((10, 10))
    confusion_rfc = np.empty((10, 10))
    # For each target subset
    for i in range(10):
        subset = data_subsets[i]
        # Predictions generative classifier (predict() result is the label itself)
        predictions = np.array([gcdf.predict(sample) for sample in subset])
        confusion_gcdf[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100
        # Predictions discriminative classifier (argmax over predict() gives the label).
        # The comprehension variable is deliberately NOT named `i`, so it cannot be
        # confused with (or, outside Python 3 scoping rules, clobber) the row index.
        predictions = np.array([np.argmax(dcdf.predict(sample)) for sample in subset])
        confusion_dcdf[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100
        # Predictions for sklearn random forest (vectorised over the whole subset)
        predictions = rfc.predict(subset)
        confusion_rfc[i, :] = np.bincount(predictions, minlength=10) / len(subset) * 100

    print('------------------------------------------------------------------------------------------------------------------')
    print('Confusion Matrix for Generative Classifier using 10 instances of DensityForest with stop criterion n_min={}'.format(n))

    display(
        pd.DataFrame(data=confusion_gcdf, index=range(10), columns=range(10))
        .rename_axis('G-Truth/Predicted', axis='columns')
    )
    print('------------------------------------------------------------------------------------------------------------------')
    print('Confusion Matrix for Discriminative Classifier using 10 instances of DecisionForest with stop criterion n_min={}'.format(n))

    display(
        pd.DataFrame(data=confusion_dcdf, index=range(10), columns=range(10))
        .rename_axis('G-Truth/Predicted', axis='columns')
    )
    print('------------------------------------------------------------------------------------------------------------------')
    print('Confusion Matrix for sklearn\'s RandomForest with stop criterion n_min={}'.format(n))

    display(
        pd.DataFrame(data=confusion_rfc, index=range(10), columns=range(10))
        .rename_axis('G-Truth/Predicted', axis='columns')
    )

------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityForest with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,94.505495,2.197802,0.0,0.0,0.0,0.0,0.549451,2.197802,0.549451
2,0.0,4.519774,84.745763,3.389831,0.0,0.0,0.0,0.0,7.344633,0.0
3,0.0,0.546448,0.0,86.885246,0.0,0.546448,0.0,2.185792,9.836066,0.0
4,0.0,2.209945,0.0,0.0,96.132597,0.0,0.0,1.657459,0.0,0.0
5,0.0,0.0,0.0,7.692308,0.0,85.164835,0.0,1.648352,4.945055,0.549451
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,1.117318,0.558659,0.0,98.324022,0.0,0.0
8,0.0,2.873563,0.574713,1.149425,0.0,1.149425,0.0,1.724138,92.528736,0.0
9,0.0,1.111111,0.0,23.888889,0.0,0.555556,0.0,5.555556,8.333333,60.555556


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionForest with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,98.895028,0.0,0.0,1.104972,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,99.450549,0.0,0.0,0.0,0.549451
6,0.0,0.0,0.552486,0.0,0.0,0.552486,98.342541,0.0,0.552486,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.441341,0.0,0.558659
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for sklearn's RandomForest with stop criterion n_min=20


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,99.438202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561798,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,96.721311,0.0,0.546448,0.0,0.546448,2.185792,0.0
4,0.0,0.0,0.0,0.0,98.895028,0.0,0.0,1.104972,0.0,0.0
5,0.0,0.0,0.0,0.549451,0.0,98.351648,0.0,0.0,0.0,1.098901
6,1.104972,0.0,0.0,0.0,0.552486,0.0,98.342541,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
8,0.0,1.149425,0.0,0.0,0.0,0.0,0.0,0.574713,97.126437,1.149425
9,0.0,0.0,0.0,0.555556,0.0,1.111111,0.0,1.111111,1.666667,95.555556


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Generative Classifier using 10 instances of DensityForest with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,99.438202,0.561798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,93.406593,2.747253,0.0,0.549451,0.0,0.0,0.0,2.747253,0.549451
2,0.0,0.564972,84.180791,3.389831,0.0,0.0,0.0,0.0,11.864407,0.0
3,0.0,0.0,0.0,83.606557,0.0,2.185792,0.0,2.185792,8.743169,3.278689
4,0.0,1.104972,0.0,0.0,95.58011,0.0,0.0,2.762431,0.0,0.552486
5,0.0,0.549451,0.0,2.747253,0.0,90.659341,0.0,0.549451,3.846154,1.648352
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
8,0.0,6.896552,0.574713,1.724138,0.0,0.0,0.0,1.724138,87.356322,1.724138
9,0.0,2.222222,0.0,10.555556,1.111111,1.666667,0.0,5.0,5.555556,73.888889


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for Discriminative Classifier using 10 instances of DecisionForest with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.552486,99.447514,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.441341,0.0,0.558659
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


------------------------------------------------------------------------------------------------------------------
Confusion Matrix for sklearn's RandomForest with stop criterion n_min=10


G-Truth/Predicted,0,1,2,3,4,5,6,7,8,9
0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,99.453552,0.0,0.546448,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,99.450549,0.0,0.0,0.0,0.549451
6,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
8,0.0,0.574713,0.0,0.0,0.0,0.0,0.0,0.0,99.425287,0.0
9,0.0,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,99.444444


The ensembles improve 