## Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import wget
import os, struct
import numpy as np
import gzip
import sklearn
import sklearn.decomposition
import array

from collections import namedtuple
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import KFold

## Load data

In [2]:
def download_data(directory='./data/'):
    """Download the four gzipped MNIST files into `directory`.

    A file that is already present is not downloaded again, so a directory
    that exists but is empty or incomplete gets filled in instead of being
    mistaken for a finished download.

    Parameters
    ----------
    directory : str
        Target directory; it is created when it does not exist yet.
    """
    base_url = "http://yann.lecun.com/exdb/mnist/"
    filenames = (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )

    if not os.path.isdir(directory):
        os.makedirs(directory)

    for filename in filenames:
        # Check per file rather than per directory so missing files are fetched.
        if os.path.exists(os.path.join(directory, filename)):
            continue
        wget.download(base_url + filename, directory, bar=None)

In [3]:
def load_mnist(dataset="training", digits=np.arange(10), path="./data/"):
    """Load gzipped MNIST IDX files into numpy arrays.

    Adapted from: http://abel.ee.ucla.edu/cvxopt/_downloads/mnist.py

    Parameters
    ----------
    dataset : {"training", "testing"}
        Which pair of image/label files to read.
    digits : array-like of int
        Only samples whose label is contained in `digits` are returned.
    path : str
        Directory that contains the ``*-ubyte.gz`` files.

    Returns
    -------
    images : numpy.ndarray, dtype uint8, shape (N, rows, cols)
    labels : numpy.ndarray, dtype int8, shape (N, 1)

    Raises
    ------
    ValueError
        If `dataset` is neither "training" nor "testing".
    """

    if dataset == "training":
        fname_img = os.path.join(path, 'train-images-idx3-ubyte.gz')
        fname_lbl = os.path.join(path, 'train-labels-idx1-ubyte.gz')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images-idx3-ubyte.gz')
        fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte.gz')
    else:
        raise ValueError("dataset must be 'testing' or 'training'")

    # `with` closes the gzip handles even when reading or unpacking fails.
    with gzip.open(fname_lbl, 'rb') as flbl:
        # Header: magic number and item count (big-endian uint32 each).
        _, _ = struct.unpack(">II", flbl.read(8))
        lbl = np.frombuffer(flbl.read(), dtype=np.int8)

    with gzip.open(fname_img, 'rb') as fimg:
        # Header: magic number, image count, rows, cols.
        _, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.frombuffer(fimg.read(), dtype=np.uint8)

    # The image header's count decides how many samples are considered.
    img = img[:size * rows * cols].reshape(size, rows, cols)
    lbl = lbl[:size]

    # Boolean-mask indexing copies, so the results are writable arrays
    # even though np.frombuffer returns read-only views.
    keep = np.isin(lbl, digits)
    images = img[keep]
    labels = lbl[keep].reshape(-1, 1)

    return images, labels

In [4]:
def split_data():
    """Load MNIST and return flattened, row-normalised train/test sets.

    Returns
    -------
    images : MNIST_images namedtuple with fields (train, test)
        Float arrays of shape (n_samples, rows * cols); every row is scaled
        by `sklearn.preprocessing.normalize` (default L2 norm).
    labels : MNIST_labels namedtuple with fields (train, test)
        1-D label arrays.
    """
    # Local import: the notebook imports only `sklearn` and
    # `sklearn.decomposition`, which does not guarantee that the
    # `sklearn.preprocessing` submodule is available as an attribute.
    from sklearn.preprocessing import normalize

    MNISTImages = namedtuple('MNIST_images', ['train', 'test'])
    MNISTLabels = namedtuple('MNIST_labels', ['train', 'test'])

    train_images, train_labels = load_mnist('training')
    test_images, test_labels = load_mnist('testing')

    def flatten_and_normalize(batch):
        # One row per image, then unit-norm rows. np.float64 is the type the
        # removed `np.float` alias used to resolve to.
        flat = batch.reshape(batch.shape[0], -1)
        return normalize(flat.astype(np.float64), axis=1)

    # Real namedtuple instances instead of attributes set on the classes.
    images = MNISTImages(train=flatten_and_normalize(train_images),
                         test=flatten_and_normalize(test_images))
    labels = MNISTLabels(train=train_labels.ravel(),
                         test=test_labels.ravel())

    return images, labels

In [5]:
# Make sure the MNIST archives are available under the default ./data/ directory.
download_data()

In [6]:
# Flattened, row-normalised images and 1-D labels; both expose .train / .test.
images, labels = split_data()

## Stacking

In [7]:
def reverse_splits(splits):
    """Return a list in which every split has its elements in reverse order.

    Example: an (a, b) pair becomes (b, a).
    """
    flipped = []
    for pair in splits:
        flipped.append(pair[::-1])
    return flipped

In [8]:
from stacking import Classifier, Stacking

ImportError: cannot import name 'Kfold'

In [None]:
def get_SVM_fitter(C=1., kernel='linear', degree=3, gamma=1.):
    """Build a fitter that trains an SVC with the given hyper-parameters.

    The returned callable takes (X, y), fits a fresh SVC on them and returns
    a `Classifier` wrapping the fitted model's `predict` method.
    """
    def fit_svm(X, y):
        svm = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma)
        fitted = svm.fit(X, y)
        return Classifier(fitted.predict)

    return fit_svm

def SVM_fitter(X, y, kernel='linear', degree=3, gamma=1.):
    """Fit an SVC on (X, y) and wrap its `predict` method in a `Classifier`.

    `kernel`, `degree` and `gamma` used to be read from undefined names,
    which raised NameError on every call. They are now keyword parameters
    whose defaults match those of `get_SVM_fitter`.
    """
    classifier = SVC(kernel=kernel, degree=degree, gamma=gamma).fit(X, y)
    return Classifier(classifier.predict)

def logit_fitter(X, y):
    """Fit an L2-penalised logistic regression and wrap its `predict` method.

    The constructor settings are passed by keyword: recent scikit-learn
    releases reject positional constructor parameters, and the keywords
    work on older releases too.
    """
    classifier = LogisticRegression(penalty='l2', dual=False).fit(X, y)
    return Classifier(classifier.predict)

def random_forest_fitter(X, y):
    """Fit a default RandomForestClassifier on (X, y); wrap its `predict`."""
    forest = RandomForestClassifier()
    fitted = forest.fit(X, y)
    return Classifier(fitted.predict)

def random_forest_proba_fitter(X, y):
    """Fit a default RandomForestClassifier on (X, y); wrap its `predict_proba`."""
    forest = RandomForestClassifier()
    fitted = forest.fit(X, y)
    return Classifier(fitted.predict_proba)

def extra_trees_fitter(X, y):
    """Fit a default ExtraTreesClassifier on (X, y); wrap its `predict`."""
    trees = ExtraTreesClassifier()
    fitted = trees.fit(X, y)
    return Classifier(fitted.predict)

def extra_trees_proba_fitter(X, y):
    """Fit a default ExtraTreesClassifier on (X, y); wrap its `predict_proba`."""
    trees = ExtraTreesClassifier()
    fitted = trees.fit(X, y)
    return Classifier(fitted.predict_proba)

# Experiments

## Single classifier 

In [9]:
# Fit a 50-component PCA on the training images only, then project both the
# training and the test images with that same fitted transform.
pca = sklearn.decomposition.PCA(50).fit(images.train)
pca_images_train = pca.transform(images.train)
pca_images_test = pca.transform(images.test)

In [10]:
# Attribute containers for the single-classifier experiments: fitted models
# and their test-set predictions are attached as attributes in later cells.
models = namedtuple('models', 'logit svm random_forest extra_trees')
predictions = namedtuple('predictions', 'logit svm random_forest extra_trees')

### Logistic regression

In [11]:
# L2-penalised logistic regression on the PCA features. The settings are
# passed by keyword because recent scikit-learn releases reject positional
# constructor parameters.
models.logit = LogisticRegression(penalty='l2', dual=False).fit(pca_images_train, labels.train)
predictions.logit = models.logit.predict(pca_images_test)

In [12]:
# Test-set misclassification rate of the logistic regression model.
np.mean(predictions.logit != labels.test)

0.098100000000000007

### SVM

In [14]:
# Linear-kernel SVC trained on the PCA features, evaluated on the PCA test set.
models.svm = SVC(kernel = 'linear').fit(pca_images_train, labels.train)
predictions.svm = models.svm.predict(pca_images_test)

In [15]:
# Test-set misclassification rate of the linear SVC.
np.mean(predictions.svm != labels.test)

0.062399999999999997

### Random forest classifier

In [16]:
# Random forest with default hyper-parameters on the PCA features.
# No random_state is set, so results vary between runs.
models.random_forest = RandomForestClassifier().fit(pca_images_train, labels.train)
predictions.random_forest = models.random_forest.predict(pca_images_test)

In [17]:
# Test-set misclassification rate of the random forest.
np.mean(predictions.random_forest != labels.test)

0.069000000000000006

### Extra trees classifier

In [18]:
# Extra-trees ensemble with default hyper-parameters on the PCA features.
# No random_state is set, so results vary between runs.
models.extra_trees = ExtraTreesClassifier().fit(pca_images_train, labels.train)
predictions.extra_trees = models.extra_trees.predict(pca_images_test)

In [19]:
# Test-set misclassification rate of the extra-trees ensemble.
np.mean(predictions.extra_trees != labels.test)

0.071999999999999995

## Stacking classifier

### Logistic regression + SVM

In [44]:
# Base fitter: logistic regression (hard predictions via `predict`).
# Meta fitter: degree-2 polynomial SVC with C=5.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
logit_svm = Stacking(base_fitter=logit_fitter, meta_fitter=get_SVM_fitter(C=5, kernel='poly', degree = 2), 
                     split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
logit_svm_predictions = logit_svm.fit(pca_images_train, labels.train).predict(pca_images_test)

In [45]:
# Test-set misclassification rate of the logistic regression + SVM stack.
np.mean(logit_svm_predictions != labels.test)

0.42509999999999998

### Logistic regression + Random forest

In [22]:
# Base fitter: logistic regression (hard predictions via `predict`).
# Meta fitter: random forest with default hyper-parameters.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
logit_rf = Stacking(base_fitter=logit_fitter, meta_fitter=random_forest_fitter, 
                    split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
logit_rf_predictions = logit_rf.fit(pca_images_train, labels.train).predict(pca_images_test)

In [23]:
# Test-set misclassification rate of the logistic regression + random forest stack.
np.mean(logit_rf_predictions != labels.test)

0.0746

### Extra trees + Logistic regression

In [46]:
# Base fitter: extra trees exposing class probabilities (`predict_proba`).
# Meta fitter: logistic regression.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
et_logit = Stacking(base_fitter=extra_trees_proba_fitter, meta_fitter=logit_fitter, 
                    split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
et_logit_predictions = et_logit.fit(pca_images_train, labels.train).predict(pca_images_test)

In [47]:
# Test-set misclassification rate of the extra trees + logistic regression stack.
np.mean(et_logit_predictions != labels.test)

0.47089999999999999

### SVM + SVM

In [26]:
# Base fitter: SVC with get_SVM_fitter's defaults (C=1, linear kernel).
# Meta fitter: degree-2 polynomial SVC with C=5.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
svm_svm = Stacking(base_fitter=get_SVM_fitter(), meta_fitter=get_SVM_fitter(C=5, kernel='poly', degree = 2), 
                   split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
svm_svm_predictions = svm_svm.fit(pca_images_train, labels.train).predict(pca_images_test)

In [27]:
# Test-set misclassification rate of the SVM + SVM stack.
np.mean(svm_svm_predictions != labels.test)

0.038399999999999997

### Random forest + SVM

In [28]:
# Base fitter: random forest (hard predictions via `predict`).
# Meta fitter: degree-2 polynomial SVC with C=10.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
rf_svm = Stacking(base_fitter=random_forest_fitter, meta_fitter=get_SVM_fitter(C=10, kernel='poly', degree = 2), 
                  split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
rf_svm_predictions = rf_svm.fit(pca_images_train, labels.train).predict(pca_images_test)

In [29]:
# Test-set misclassification rate of the random forest + SVM stack.
np.mean(rf_svm_predictions != labels.test)

0.0339

### Extra trees + SVM

In [30]:
# Base fitter: extra trees exposing class probabilities (`predict_proba`).
# Meta fitter: degree-2 polynomial SVC with C=5.
# `split` returns a list of shuffled 5-fold KFold splits over I.size items;
# no random_state is passed, so the folds differ between runs.
wildfowl = Stacking(base_fitter=extra_trees_proba_fitter, meta_fitter=get_SVM_fitter(C=5, kernel='poly', degree = 2), 
                    split=lambda I: list(KFold(n=I.size, n_folds=5, shuffle=True)))
wildfowl_predictions = wildfowl.fit(pca_images_train, labels.train).predict(pca_images_test)

In [31]:
# Test-set misclassification rate of the extra trees + SVM stack.
np.mean(wildfowl_predictions != labels.test)

0.024199999999999999

In [35]:
# Sensitivity of the extra trees + SVM stack to the number of KFold folds.
# The loop uses its own variable names: it previously rebound `et_logit` and
# `et_logit_predictions`, replacing the extra trees + logistic regression
# results with an extra trees + SVM model.
for n_folds in [2, 3, 5, 10, 15, 20]:
    # Binding n_folds as a default argument freezes this iteration's value
    # inside the split function instead of looking it up when it is called.
    et_svm_sweep = Stacking(base_fitter=extra_trees_proba_fitter,
                            meta_fitter=get_SVM_fitter(C=5, kernel='poly', degree=2),
                            split=lambda I, n_folds=n_folds: list(KFold(n=I.size, n_folds=n_folds, shuffle=True)))
    et_svm_sweep_predictions = et_svm_sweep.fit(pca_images_train, labels.train).predict(pca_images_test)
    print(np.mean(et_svm_sweep_predictions != labels.test))

0.0237
0.0215
0.0249
0.0269
0.0275
0.0282


## Multi-stacking

In [17]:
from stacking import MultiStacking

# MultiStacking over three fitters, in this order: random forest, extra trees
# (both via `predict`), and a degree-2 polynomial SVC with C=5.
# NOTE(review): how MultiStacking chains the fitters is defined in the
# `stacking` module, which is not visible here.
multi_wildfowl = MultiStacking(
    fitters=[
        random_forest_fitter,
        extra_trees_fitter,
        get_SVM_fitter(C=5, kernel='poly', degree=2)
    ]
)
# Only the first 100 training samples are used for fitting; the predictions
# and the error rate below cover the full test set.
multi_wildfowl_predictions = multi_wildfowl.fit(pca_images_train[:100], labels.train[:100]).predict(pca_images_test)
np.mean(multi_wildfowl_predictions != labels.test)

0.68210000000000004