# Chapter 7 Exercises

This notebook holds the code for the chapter 7 exercises that require coding

Question 8:
- Load MNIST, split into: train, test, validation (40,000, 10,000, 10,000)
- Train various classifiers (RF, Extra-Trees, SVM)
- Combine the above classifiers into an ensemble that outperforms on validation set, use hard or soft voting classifier. 
- Try the ensemble on the test set, how much better is it than individual classifiers? 

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import itertools

from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook as tqdm

In [2]:
TRAIN_SIZE = 40000
VALID_SIZE = 10000
TEST_SIZE = 10000

RANDOM_STATE = 42
ROUND_TO = 4
MAX_ITER = 500

In [3]:
def get_data(type_of_data):
    mnist_data = fetch_mldata(type_of_data)

    X_train, X_test_valid, y_train, y_test_valid = train_test_split(mnist_data.data, 
                                                    mnist_data.target, 
                                                    test_size=TEST_SIZE + VALID_SIZE,
                                                    train_size=TRAIN_SIZE,
                                                    random_state=RANDOM_STATE)
 
    X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, 
                                                    y_test_valid, 
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE)
    
    all_data = {
        'X_train': X_train, 
        'X_valid': X_valid, 
        'X_test': X_test,
        'y_train': y_train, 
        'y_valid': y_valid, 
        'y_test': y_test
    }
    
    
    return all_data

In [4]:
def plot_confusion_matrix(cm, 
                          classes,
                          normalize=False,
                          title='Confusion Matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [5]:
def fit_classifier(clf, data, round_to=4, classifier_name='', plot_cnf_mat=False):
    X_train = data['X_train']
    y_train = data['y_train']
    X_valid = data['X_valid']
    y_valid = data['y_valid']
    X_test = data['X_test']
    y_test = data['y_test']
    
    if classifier_name == 'svm':
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
        X_test = scaler.transform(X_test)
    
    clf.fit(X_train, y_train)

    valid_predictions = clf.predict(X_valid)
    valid_cnf_mat = confusion_matrix(y_valid, valid_predictions) 
    valid_accuracy = sum(np.diag(valid_cnf_mat)) / sum(valid_cnf_mat.flatten())
    valid_accuracy_r = round(valid_accuracy, round_to)
    
    test_predictions = clf.predict(X_test)
    test_cnf_mat = confusion_matrix(y_test, test_predictions)
    test_accuracy = sum(np.diag(test_cnf_mat)) / sum(test_cnf_mat.flatten())
    test_accuracy_r = round(test_accuracy, round_to)
    
    title = 'The {0} accuracy is {1:.3f}'.format(classifier_name, test_accuracy)
    
    if plot_cnf_mat:
        plot_confusion_matrix(svm_cnf_mat, classes=np.unique(y_train), title=svm_title, normalize=True)
    
    return clf, valid_predictions, valid_accuracy_r, test_accuracy_r

In [6]:
%%time
data = get_data('MNIST original')
classifiers = {
    'rf': RandomForestClassifier(), 
    'et': ExtraTreesClassifier(), 
    'svm': SVC(max_iter=MAX_ITER)
}

validation_accuracies = dict(zip(list(classifiers.keys()), [None] * len(classifiers)))
test_accuracies = dict(zip(list(classifiers.keys()), [None] * len(classifiers)))
valid_predictions = dict(zip(list(classifiers.keys()), [None] * len(classifiers)))
vtg_valid_accuracy_diffs = dict(zip(list(classifiers.keys()), [None] * len(classifiers)))
vtg_test_accuracy_diffs = dict(zip(list(classifiers.keys()), [None] * len(classifiers)))

for key, classifier in classifiers.items():
    clf, valid_prediction, validation_accuracy, test_accuracy = fit_classifier(classifier, 
                                                                               data, 
                                                                               round_to=ROUND_TO, 
                                                                               classifier_name=key)
    validation_accuracies[key] = validation_accuracy
    test_accuracies[key] = test_accuracy
    valid_predictions[key] = valid_prediction
    classifiers[key] = clf



CPU times: user 9min 35s, sys: 744 ms, total: 9min 36s
Wall time: 9min 40s


In [8]:
vtg_clf = VotingClassifier(list(classifiers.items()), voting='hard')
vtg_clf, vtg_valid_prediction, vtg_validation_accuracy, vtg_test_accuracy = fit_classifier(vtg_clf, 
                                                                                           data, 
                                                                                           round_to=ROUND_TO, 
                                                                                           classifier_name='vtg')
for key in classifiers.keys():    
    vtg_valid_accuracy_diffs[key] = vtg_validation_accuracy - validation_accuracies[key]
    vtg_test_accuracy_diffs[key] = vtg_test_accuracy - test_accuracies[key]




NameError: name 'vtg_valid_accuray_diffs' is not defined

In [9]:
print(vtg_valid_accuracy_diffs)
print(vtg_test_accuracy_diffs)
print('Voting test acc: {}'.format(vtg_test_accuracy))

{'rf': -0.0014999999999999458, 'et': -0.0057999999999999163, 'svm': -0.020299999999999985}
{'rf': -0.0041999999999999815, 'et': -0.0097999999999999199, 'svm': -0.021299999999999986}
Voting test acc: 0.937


## Exercise 9

- Run above clfs on validation set, create a new training set, with the resulting predictions: each training instance is a vector with the predictions from each classifier
- Perform evaluation of the test set with the new ensemble

In [15]:
testing_preds = []

for clf in classifiers.values():
    testing_preds_clf = clf.predict(data['X_test'])
    testing_preds.append(testing_preds_clf)
    
new_training_data = np.hstack(tuple([values for values in list(valid_predictions.values())])).reshape(TEST_SIZE, -1)
new_testing_data = np.hstack(tuple(testing_preds)).reshape(TEST_SIZE, -1)

blender = RandomForestClassifier()
blender.fit(new_training_data, data['y_valid'])

blender_predictions = blender.predict(new_testing_data)
blender_cnf_mat = confusion_matrix(data['y_test'], blender_predictions)
blender_accuracy = sum(np.diag(blender_cnf_mat)) / sum(blender_cnf_mat.flatten())
blender_accuracy_r = round(test_accuracy, ROUND_TO)

print('The blender accuracy is: {0:.3f}'.format(blender_accuracy_r))
print('Therefore the difference in the performanace is: {0:.3f}'.format(blender_accuracy_r - vtg_test_accuracy))


{'rf': array([ 8.,  0.,  8., ...,  8.,  4.,  4.]), 'et': array([ 8.,  0.,  8., ...,  8.,  4.,  4.]), 'svm': array([ 8.,  0.,  8., ...,  8.,  4.,  4.])}
(10000, 3)
(10000, 3)
The blender accuracy is: 0.9583
Therefore the difference in the performanace is: 0.021299999999999986


Therefore the difference in the performanace is: 0.021
