## Imports

In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import wget
import os, struct
import numpy as np
import gzip
import sklearn
import sklearn.decomposition
import array

from collections import namedtuple
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import KFold
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

In [2]:
def download_data(directory='./data/'):
    """Download the four gzipped MNIST archives into ``directory``.

    The directory is created when it does not exist yet.  Every archive is
    fetched only if a file with its name is not already present, so a run that
    was interrupted (or a pre-created empty directory) is completed instead of
    being silently skipped.

    Parameters
    ----------
    directory : str
        Destination folder for the ``*.gz`` files.
    """

    base_url = "http://yann.lecun.com/exdb/mnist/"
    filenames = ("train-images-idx3-ubyte.gz",
                 "train-labels-idx1-ubyte.gz",
                 "t10k-images-idx3-ubyte.gz",
                 "t10k-labels-idx1-ubyte.gz")

    if not os.path.isdir(directory):
        os.makedirs(directory)

    for filename in filenames:
        # Decide per file, not per directory: the mere existence of the folder
        # says nothing about whether all four archives made it to disk.
        if os.path.exists(os.path.join(directory, filename)):
            continue
        wget.download(base_url + filename, directory, bar=None)

In [3]:
def load_mnist(dataset="training", digits=np.arange(10), path="./data/"):
    """Load one MNIST split from its gzipped IDX files into numpy arrays.

    Adapted from: http://abel.ee.ucla.edu/cvxopt/_downloads/mnist.py

    Parameters
    ----------
    dataset : {"training", "testing"}
        Which pair of files to read (``train-*`` or ``t10k-*``).
    digits : iterable of int
        Only samples whose label is one of these values are returned.
    path : str
        Directory containing the ``*.gz`` files.

    Returns
    -------
    images : numpy.ndarray, dtype uint8, shape (N, rows, cols)
        The selected images, in file order.
    labels : numpy.ndarray, dtype int8, shape (N, 1)
        The matching labels.

    Raises
    ------
    ValueError
        If ``dataset`` is not recognised, if a file does not start with the
        expected IDX magic number, or if the image and label counts disagree.
    """

    if dataset == "training":
        prefix = 'train'
    elif dataset == "testing":
        prefix = 't10k'
    else:
        raise ValueError("dataset must be 'testing' or 'training'")

    fname_img = os.path.join(path, prefix + '-images-idx3-ubyte.gz')
    fname_lbl = os.path.join(path, prefix + '-labels-idx1-ubyte.gz')

    # ``with`` guarantees the archives are closed even when parsing fails.
    with gzip.open(fname_lbl, 'rb') as flbl:
        magic_nr, n_labels = struct.unpack(">II", flbl.read(8))
        if magic_nr != 2049:  # IDX: unsigned-byte data, 1 dimension
            raise ValueError("%s is not an IDX label file (magic number %d)"
                             % (fname_lbl, magic_nr))
        lbl = np.frombuffer(flbl.read(), dtype=np.int8)

    with gzip.open(fname_img, 'rb') as fimg:
        magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        if magic_nr != 2051:  # IDX: unsigned-byte data, 3 dimensions
            raise ValueError("%s is not an IDX image file (magic number %d)"
                             % (fname_img, magic_nr))
        img = np.frombuffer(fimg.read(), dtype=np.uint8)

    if n_labels != size or lbl.size != size:
        raise ValueError("label count (%d) does not match image count (%d)"
                         % (lbl.size, size))

    # Raises ValueError if the pixel payload is not exactly size*rows*cols.
    img = img.reshape(size, rows, cols)

    # Membership mask via broadcasting; ``list(...)`` keeps any iterable of
    # digits (list, set, range, ndarray) working as before.
    wanted = np.asarray(list(digits)).ravel()
    keep = (lbl[:, np.newaxis] == wanted[np.newaxis, :]).any(axis=1)

    # Boolean indexing copies, so the results are ordinary writeable arrays.
    images = img[keep]
    labels = lbl[keep].reshape(-1, 1)

    return images, labels

In [4]:
def split_data():
    """Load both MNIST splits, flatten the images and normalise each sample.

    Images are reshaped from (n_samples, rows, cols) to
    (n_samples, rows * cols), cast to float64 and scaled row-wise with
    ``sklearn.preprocessing.normalize`` (its default norm).  Labels are
    flattened to 1-D.

    Returns
    -------
    images : MNIST_images
        Named tuple with fields ``train`` and ``test`` holding the
        preprocessed image matrices.
    labels : MNIST_labels
        Named tuple with fields ``train`` and ``test`` holding the 1-D label
        vectors.
    """

    # Imported locally: the notebook only imports ``sklearn`` and
    # ``sklearn.decomposition`` explicitly, so ``sklearn.preprocessing`` is not
    # guaranteed to be loaded as an attribute of the package.
    from sklearn.preprocessing import normalize

    MNISTImages = namedtuple('MNIST_images', ['train', 'test'])
    MNISTLabels = namedtuple('MNIST_labels', ['train', 'test'])

    def _flatten_and_normalize(raw):
        """Turn (n, rows, cols) uint8 images into row-normalised float vectors."""
        flat = raw.reshape(raw.shape[0], -1)
        # ``np.float`` was only an alias of the builtin float (float64) and has
        # been removed from NumPy; spell the dtype out.
        return normalize(flat.astype(np.float64), axis=1)

    train_images, train_labels = load_mnist('training')
    test_images, test_labels = load_mnist('testing')

    # Return real tuple instances instead of attaching attributes to the
    # namedtuple classes; ``.train`` / ``.test`` access is unchanged.
    images = MNISTImages(train=_flatten_and_normalize(train_images),
                         test=_flatten_and_normalize(test_images))
    labels = MNISTLabels(train=train_labels.ravel(),
                         test=test_labels.ravel())

    return images, labels

In [5]:
# Fetch the four MNIST archives into the default ./data/ directory.
download_data()

In [6]:
# Load, flatten and normalise both splits; `images` and `labels` each expose
# `.train` and `.test`.
images, labels = split_data()

# Experiments

## Single classifier 

In [7]:
# Fit a 50-component PCA on the training images only, then project both
# splits onto those components so the test set never influences the basis.
pca = sklearn.decomposition.PCA(50)
pca.fit(images.train)
pca_images_train, pca_images_test = (
    pca.transform(split) for split in (images.train, images.test)
)

In [8]:
# Attribute containers for the fitted single-model classifiers and for their
# test-set predictions; the cells below assign `.logit`, `.svm`,
# `.random_forest` and `.extra_trees` directly on these objects.
# NOTE(review): these are namedtuple *classes* used as plain namespaces - they
# are never instantiated in the visible cells.
models = namedtuple('models', ['logit', 'svm', 'random_forest', 'extra_trees'])
predictions = namedtuple('predictions', ['logit', 'svm', 'random_forest', 'extra_trees'])

### Logistic regression

In [9]:
# Baseline: L2-penalised logistic regression on the PCA features.
# Hyper-parameters are passed by keyword: recent scikit-learn releases accept
# only `penalty` positionally and raise a TypeError for a positional `dual`.
models.logit = LogisticRegression(penalty='l2', dual=False).fit(pca_images_train, labels.train)
predictions.logit = models.logit.predict(pca_images_test)

In [10]:
# Test accuracy of logistic regression: share of predictions equal to the true labels.
np.mean(predictions.logit == labels.test)

0.90190000000000003

### SVM

In [11]:
# Linear-kernel support vector classifier trained on the PCA features.
models.svm = SVC(kernel='linear')
models.svm.fit(pca_images_train, labels.train)
predictions.svm = models.svm.predict(pca_images_test)

In [12]:
# Test accuracy of the linear SVM: share of predictions equal to the true labels.
np.mean(predictions.svm == labels.test)

0.93759999999999999

### Random forest classifier

In [13]:
# Random forest with library-default hyper-parameters on the PCA features.
# NOTE(review): no random_state is passed, so the fitted forest (and the
# accuracy below) can differ from run to run.
models.random_forest = RandomForestClassifier().fit(pca_images_train, labels.train)
predictions.random_forest = models.random_forest.predict(pca_images_test)

In [14]:
# Test accuracy of the random forest: share of predictions equal to the true labels.
np.mean(predictions.random_forest == labels.test)

0.93259999999999998

### Extra trees classifier

In [15]:
# Extra-trees ensemble with library-default hyper-parameters on the PCA features.
# NOTE(review): no random_state is passed, so results can differ from run to run.
models.extra_trees = ExtraTreesClassifier().fit(pca_images_train, labels.train)
predictions.extra_trees = models.extra_trees.predict(pca_images_test)

In [16]:
# Test accuracy of extra trees: share of predictions equal to the true labels.
np.mean(predictions.extra_trees == labels.test)

0.93030000000000002

## Stacking classifier

In [50]:
from stacking import Stacking

### SVM + Logistic regression

In [18]:
# Stacked model with a single base learner: a degree-2 polynomial SVC, built
# fresh on every call of the first lambda, whose hard `predict` labels are the
# input of an L2 logistic-regression meta-fitter (n_folds=5).
# NOTE(review): how Stacking uses the folds is defined in stacking.py, which is
# not shown here. The meta-learner only sees predicted class labels, not
# probabilities - compare with the `predict_proba` variants further down.
svm_logit = Stacking(base_estimators=[(lambda X, y: SVC(C=5, kernel='poly', degree=2, gamma=1.).fit(X, y),
                                       lambda clf, X: clf.predict(X))],
                     meta_fitter=LogisticRegression('l2').fit,
                     n_folds=5)
svm_logit_predictions = svm_logit.fit(pca_images_train, labels.train).predict(pca_images_test)

In [19]:
# Test accuracy of the SVM + logistic-regression stack.
np.mean(svm_logit_predictions == labels.test)

0.60519999999999996

### Random forest + Logistic regression

In [20]:
# Stacked model: a default random forest (new instance per fit call) feeds its
# hard `predict` labels to an L2 logistic-regression meta-fitter (n_folds=3).
# NOTE(review): the forest is unseeded, so this result is not exactly reproducible.
rf_logit = Stacking(base_estimators=[(lambda X, y: RandomForestClassifier().fit(X, y),
                                      lambda clf, X: clf.predict(X))],
                    meta_fitter=LogisticRegression('l2').fit,
                    n_folds=3)
rf_logit_predictions = rf_logit.fit(pca_images_train, labels.train).predict(pca_images_test)

In [21]:
# Test accuracy of the random forest + logistic-regression stack.
np.mean(rf_logit_predictions == labels.test)

0.48720000000000002

### Extra trees + Logistic regression

In [22]:
# Stacked model: default extra trees (new instance per fit call) as base
# learner. Unlike the two stacks above, the base output is `predict_proba`,
# so the L2 logistic-regression meta-fitter receives class probabilities
# rather than hard labels (n_folds=3).
et_logit = Stacking(base_estimators=[(lambda X, y: ExtraTreesClassifier().fit(X, y),
                                      lambda clf, X: clf.predict_proba(X))],
                    meta_fitter=LogisticRegression('l2').fit,
                    n_folds=3)
et_logit_predictions = et_logit.fit(pca_images_train, labels.train).predict(pca_images_test)

In [23]:
# Test accuracy of the extra trees + logistic-regression stack.
np.mean(et_logit_predictions == labels.test)

0.93230000000000002

### SVM + SVM

In [30]:
# Stacked model: a linear SVC base learner (new instance per fit call, hard
# `predict` labels) under a degree-2 polynomial SVC meta-fitter (n_folds=5).
# The meta-fitter is the bound `fit` of one SVC instance created here.
svm_svm = Stacking(base_estimators=[(lambda X, y: SVC(kernel='linear').fit(X, y),
                                     lambda clf, X: clf.predict(X))],
                   meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                   n_folds=5)
svm_svm_predictions = svm_svm.fit(pca_images_train, labels.train).predict(pca_images_test)

In [31]:
# Test accuracy of the SVM + SVM stack.
np.mean(svm_svm_predictions == labels.test)

0.93759999999999999

### Random forest + SVM

In [25]:
# Stacked model: a default random forest (new instance per fit call, hard
# `predict` labels) under a degree-2 polynomial SVC meta-fitter.
# Note C=10 here, whereas the other SVC meta-fitters in this notebook use C=5.
rf_svm = Stacking(base_estimators=[(lambda X, y: RandomForestClassifier().fit(X, y),
                                    lambda clf, X: clf.predict(X))],
                  meta_fitter=SVC(C=10, kernel='poly', degree=2, gamma=1.).fit,
                  n_folds=5)
rf_svm_predictions = rf_svm.fit(pca_images_train, labels.train).predict(pca_images_test)

In [26]:
# Test accuracy of the random forest + SVM stack.
np.mean(rf_svm_predictions == labels.test)

0.9335

### Extra trees + SVM

In [79]:
# Stacked model: extra-trees class probabilities under a degree-2 polynomial
# SVC meta-fitter (n_folds=5); the cell displays the test-set score.
# NOTE(review): unlike the lambda-based cells above, the base fitter is the
# bound `fit` of ONE ExtraTreesClassifier instance, so every call refits that
# same object - confirm Stacking does not keep earlier fitted models around.
wildfowl = Stacking(base_estimators=[(ExtraTreesClassifier().fit, lambda clf, X: clf.predict_proba(X))],
                    meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                    n_folds=5)
wildfowl.fit(pca_images_train, labels.train).score(pca_images_test, labels.test)

0.9345

In [80]:
# Three base learners - extra trees and random forest (class probabilities)
# plus logistic regression (hard labels) - under a polynomial SVC meta-fitter,
# with extend_meta=True.
# NOTE(review): the meaning of `extend_meta` is defined in stacking.py;
# presumably it appends the original features to the meta-features - confirm.
# Each base fitter is the bound `fit` of a single estimator instance.
wildfowl = Stacking(base_estimators=[(ExtraTreesClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (RandomForestClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (LogisticRegression('l2').fit, lambda clf, X: clf.predict(X))],
                    meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                    n_folds=5, extend_meta=True)
wildfowl.fit(pca_images_train, labels.train).score(pca_images_test, labels.test)

0.97430000000000005

In [81]:
# Same three-learner stack as the previous cell, but with extend_meta=False,
# to isolate the effect of that flag on the test score.
wildfowl = Stacking(base_estimators=[(ExtraTreesClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (RandomForestClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (LogisticRegression('l2').fit, lambda clf, X: clf.predict(X))],
                    meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                    n_folds=5, extend_meta=False)
wildfowl.fit(pca_images_train, labels.train).score(pca_images_test, labels.test)

0.9486

In [82]:
# Three-learner stack with extend_meta=True in which logistic regression also
# contributes `predict_proba` output, so every base learner passes class
# probabilities to the SVC meta-fitter.
wildfowl = Stacking(base_estimators=[(ExtraTreesClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (RandomForestClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                     (LogisticRegression('l2').fit, lambda clf, X: clf.predict_proba(X))],
                    meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                    n_folds=5, extend_meta=True)
wildfowl.fit(pca_images_train, labels.train).score(pca_images_test, labels.test)

0.97760000000000002

In [83]:
# Sweep the number of folds for the all-probabilities three-learner stack and
# print the test score for each setting. Fresh estimator instances are created
# on every iteration.
# NOTE(review): this rebinds `et_logit`, the name used earlier for the
# "Extra trees + Logistic regression" stack, although the meta-learner here is
# an SVC; after the loop it refers to the n_folds=20 model.
for n_folds in [2, 3, 5, 10, 15, 20]:
    et_logit = Stacking(base_estimators=[(ExtraTreesClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                         (RandomForestClassifier().fit, lambda clf, X: clf.predict_proba(X)),
                                         (LogisticRegression('l2').fit, lambda clf, X: clf.predict_proba(X))],
                        meta_fitter=SVC(C=5, kernel='poly', degree=2, gamma=1.).fit,
                        n_folds=n_folds, extend_meta=True)
    print(et_logit.fit(pca_images_train, labels.train).score(pca_images_test, labels.test))

0.9781
0.9763
0.9787
0.9768
0.977
0.9749
