In [1]:
# Support for maths
import numpy as np
# Plotting tools
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelBinarizer
# we use the following for plotting figures in jupyter
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# GPy: Gaussian processes library
import GPy

import time

In [2]:
# Number of training samples to keep; the loading cell slices the training
# images and labels with [:N].
N = 60000

In [3]:
# (train_data, train_labels), (test_data, test_labels) = FashionMNIST()

# Images: read the exported arrays and store them as uint8
train_data = np.load('../../datasets/export/fashion_mnist/numpy/train_data_fashion_mnist.npy').astype('uint8')
test_data = np.load('../../datasets/export/fashion_mnist/numpy/test_data_fashion_mnist.npy').astype('uint8')

# Labels: read as uint8, then collapse each one-hot row to its integer class id.
# Only the first N training labels are kept.
train_labels = np.argmax(
    np.load('../../datasets/export/fashion_mnist/numpy/train_targets_fashion_mnist.npy').astype('uint8'),
    axis=1,
)[:N]
test_labels = np.argmax(
    np.load('../../datasets/export/fashion_mnist/numpy/test_targets_fashion_mnist.npy').astype('uint8'),
    axis=1,
)

# Number of values in a single image once it is flattened
D = train_data[0].size

# Flatten every image into a length-D row; keep only the first N training rows
train_data = train_data.reshape(-1, D)[:N]
test_data = test_data.reshape(-1, D)

In [4]:
def threshold_binarize(data, threshold):
    """Binarize `data` against `threshold`.

    Entries strictly greater than `threshold` become 1, all other entries
    become 0. The result is returned as a uint8 array.
    """
    above = np.asarray(data > threshold)
    return above.astype('uint8')

In [5]:
# fashion mnist has values between 0 and 255;
# every pixel is compared against this cut-off by threshold_binarize
threshold = 10

train_data_bin, test_data_bin = (
    threshold_binarize(images, threshold) for images in (train_data, test_data)
)

In [6]:
# like one-hot encoding with 0 corresponding to -1:
# one column per class, +1 for the sample's class and -1 everywhere else
label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
train_labels_bin = label_binarizer.fit_transform(train_labels)
# Reuse the encoding learned from the training labels. Re-fitting on the test
# labels would rebuild the class-to-column mapping from the test set, which can
# differ from the training mapping (e.g. if a class is absent from one split).
test_labels_bin = label_binarizer.transform(test_labels)

In [None]:
# => alpha = 30 seems to be a good regularization strength for the linear kernel!

In [12]:
# Create an RBF kernel over all D input dimensions (one per flattened pixel)
# with default parameters. D comes from the data-loading cell, so the kernel
# always matches the width of the design matrix.
k = GPy.kern.RBF(input_dim=D)
# Preview the kernel's parameters
k

rbf.,value,constraints,priors
variance,1.0,+ve,
lengthscale,1.0,+ve,


In [7]:
# Number of inducing points: floor(sqrt(number of training points)).
# Derived from the data actually kept (the training set is sliced to N rows)
# instead of a hard-coded 60000, so it stays consistent when N is changed.
M = int(np.sqrt(len(train_data_bin)))

In [8]:
# Uniform sampling of M inducing points from the training set, WITHOUT
# replacement: randint can draw the same row more than once, and duplicated
# inducing inputs make the inducing-point kernel matrix rank-deficient.
idxs = np.random.choice(len(train_data_bin), size=M, replace=False)
Z = train_data_bin[idxs]

In [24]:
# Sanity check: one row per sampled inducing point, one column per pixel
Z.shape

(244, 784)

In [30]:
from scipy.cluster.vq import kmeans

# Our initial inducing points: the k-means codebook (centroids) of the
# binarized training data. kmeans returns (codebook, distortion); only the
# codebook is used.
Z = 1. * kmeans(
    obs=train_data_bin.astype('float32'),
    k_or_guess=M,
)[0]

In [31]:
# Shape check of Z after the k-means initialisation
Z.shape

(244, 784)

In [13]:
# choice of inducing points does not influence kernel inversion!

# Wall-clock timer for building the model
since = time.time()

# Sparse GP regression on the binarized training data with inducing inputs Z and kernel k.
# NOTE(review): Z is whatever the most recently executed cell left in it
# (random subset or k-means centroids) — confirm which initialisation is intended.
m = GPy.models.SparseGPRegression(train_data_bin.astype('float32'), train_labels_bin.astype('float32'), Z=Z, kernel=k)
# m.Gaussian_noise.variance = 0.01
m.Gaussian_noise.unfix() # Leave the Gaussian observation noise unfixed (it is NOT held constant here)

print('Time taken: {}'.format(time.time() - since))

KeyboardInterrupt: 

In [None]:
# Time the optimisation of the model currently bound to `m`
since = time.time()

m.optimize()

# Elapsed wall-clock time as measured with time.time()
print('Time taken: {}'.format(time.time() - since))

In [None]:
# We will not optimise any of the hyperparameters again.
# Predict on the binarized test set; full_cov=False requests per-point
# predictive variances rather than a full covariance matrix.
mean_sparse_rand, Cov_sparse_rand = m.predict(test_data_bin, full_cov=False)

In [9]:
# Gaussian likelihood object (the SVGP cell creates it again under the same name)
g_likelihood = GPy.likelihoods.Gaussian()

In [14]:
# Gaussian likelihood (the targets are the +1/-1 encoded labels, treated as regression outputs)
g_likelihood = GPy.likelihoods.Gaussian()

# size of minibatch
batchsize = 10

# Fit a sparse variational Gaussian process to our data, using the inducing
# points Z, an RBF + white-noise kernel and the Gaussian likelihood above
m = GPy.core.SVGP(
    train_data_bin.astype('float32'), train_labels_bin.astype('float32'), Z,
    kernel=GPy.kern.RBF(784) + GPy.kern.White(784, variance=1.e-6),
    likelihood=g_likelihood,
    batchsize=batchsize
)

# The white-noise variance is initialised to 10^-6 in the kernel above.
# NOTE(review): unfix() leaves that variance free rather than fixing it at
# 10^-6 — confirm whether fix() was intended.
m.kern.white.unfix();

# Fix the inducing inputs at the locations passed in as Z
m.Z.fix();

In [17]:
from climin import Adadelta

In [15]:
## This is a small utility for giving realtime output of the optimiser
import ipywidgets

# Text label that displays the current objective value (written by write_out)
logpy = ipywidgets.Label(align='right')
# Progress bar over the range [0, 1]
prog  = ipywidgets.FloatProgress(value=0., min=0., max=1.)
# Container holding the bar, a static "obj : " caption and the value label
progress_widget = ipywidgets.HBox([prog, ipywidgets.Label("obj : "), logpy])

def write_out(*v):
    ''' Push a (progress, loss) pair to the widget: progress goes to the bar, loss to the label '''
    progress, loss = v
    prog.value = progress
    logpy.value = loss

# Iteration budget used by the stopping rule below
max_iter = 2000
# callback during each optimisation step
def callback(i):
    ''' Called on every optimisation step with the optimiser's info dict.

    Writes the fraction of max_iter completed and the current negative
    log-likelihood of the model `m` to the widget, then returns True (stop)
    once i['n_iter'] is greater than max_iter, and False otherwise.
    '''
    step = i['n_iter']
    write_out(step/max_iter, str(-m.log_likelihood()))
    # when True is returned, the optimiser will stop
    # NOTE(review): the strict `>` lets the step with n_iter == max_iter continue — confirm this is intended
    return bool(step > max_iter)

In [18]:
# Display the progress widget
display(progress_widget)

# Initialise optimiser with the model's parameter array and its stochastic gradient function
opt = Adadelta(m.optimizer_array, m.stochastic_grad)

# Run the optimiser until callback returns True (i.e. once n_iter exceeds max_iter)
_ = opt.minimize_until(callback)

# Print final model
display(m)

HBox(children=(FloatProgress(value=0.0, max=1.0), Label(value='obj : '), Label(value='')))

KeyboardInterrupt: 

In [8]:
# GP regression on the binarized training data with kernel k.
# This rebinds `m`, so any model previously stored under that name is replaced.
m = GPy.models.GPRegression(train_data_bin, train_labels_bin, k)
# Display the model's parameter table
m

GP_regression.,value,constraints,priors
rbf.variance,1.0,+ve,
rbf.lengthscale,1.0,+ve,
Gaussian_noise.variance,1.0,+ve,


In [10]:
# Constrain the regression parameters to be positive only
m.constrain_positive()

# Starting values for the kernel hyperparameters before optimisation
m.kern.variance = 0.1
m.kern.lengthscale = np.sqrt(D)

# Fix the Gaussian noise variance at 1e-6 so it is excluded from optimisation
m.Gaussian_noise.variance = 1e-6 # (Reset the parameter first)
m.Gaussian_noise.variance.fix()
# m.Gaussian_noise.variance.unfix()

# Optimise the remaining (kernel) hyperparameters and display the fitted model
m.optimize()
m

reconstraining parameters GP_regression


GP_regression.,value,constraints,priors
rbf.variance,0.3248396771646316,+ve,
rbf.lengthscale,6.8153834810410965,+ve,
Gaussian_noise.variance,1e-06,+ve fixed,


In [9]:
# Constrain the regression parameters to be positive only
m.constrain_positive()

# Starting values for the kernel hyperparameters before optimisation
m.kern.variance = 0.1
m.kern.lengthscale = np.sqrt(D)

# Fix the Gaussian noise variance at 0.01 so it is excluded from optimisation
m.Gaussian_noise.variance = 1e-2 # (Reset the parameter first)
m.Gaussian_noise.variance.fix()
# m.Gaussian_noise.variance.unfix()

# Optimise the remaining (kernel) hyperparameters and display the fitted model
m.optimize()
m

reconstraining parameters GP_regression


GP_regression.,value,constraints,priors
rbf.variance,0.306347886211182,+ve,
rbf.lengthscale,7.25549786847386,+ve,
Gaussian_noise.variance,0.01,fixed +ve,


In [11]:
# Constrain the regression parameters to be positive only
m.constrain_positive()

# Starting values for the kernel hyperparameters before optimisation
m.kern.variance = 0.1
m.kern.lengthscale = np.sqrt(D)

# Fix the Gaussian noise variance at 10 so it is excluded from optimisation
m.Gaussian_noise.variance = 10 # (Reset the parameter first)
m.Gaussian_noise.variance.fix()
# m.Gaussian_noise.variance.unfix()

# Optimise the remaining (kernel) hyperparameters and display the fitted model
m.optimize()
m

reconstraining parameters GP_regression


GP_regression.,value,constraints,priors
rbf.variance,0.6838136837863136,+ve,
rbf.lengthscale,50.92407055717207,+ve,
Gaussian_noise.variance,10.0,+ve fixed,


In [9]:
# Constrain the regression parameters to be positive only
m.constrain_positive()

# Starting values for the kernel hyperparameters before optimisation
m.kern.variance = 0.1
m.kern.lengthscale = np.sqrt(D)

# Start the Gaussian noise variance at 10 and leave it unfixed, so it is
# optimised together with the kernel hyperparameters
m.Gaussian_noise.variance = 10 # (Reset the parameter first)
# m.Gaussian_noise.variance.fix()
m.Gaussian_noise.variance.unfix()

# Optimise all hyperparameters and display the fitted model
m.optimize()
m

reconstraining parameters GP_regression


GP_regression.,value,constraints,priors
rbf.variance,0.1789339259732666,+ve,
rbf.lengthscale,7.177267570983543,+ve,
Gaussian_noise.variance,0.0414465985822242,+ve,


In [10]:
# Noise-free predictions of the optimised GP on the training inputs.
# full_cov=False requests per-point variances, so `train_cov` is not a full
# covariance matrix despite its name.
train_mean, train_cov = m.predict_noiseless(train_data_bin, full_cov=False)

In [11]:
# Noise-free predictions of the optimised GP on the test inputs.
# full_cov=False requests per-point variances, so `test_cov` is not a full
# covariance matrix despite its name.
test_mean, test_cov = m.predict_noiseless(test_data_bin, full_cov=False)

In [26]:
# Shape check of the test predictions: one row per test sample, one column per class.
# (`mean` is not defined anywhere in the notebook; the test predictions live in `test_mean`.)
test_mean.shape

(10000, 10)

In [12]:
# Training accuracy in percent: the predicted class is the column with the
# largest predictive mean, the true class the column with the largest target
train_score = (
    (np.argmax(train_mean, axis=1) == np.argmax(train_labels_bin, axis=1)).sum()
    / len(train_data) * 100
)

In [13]:
# Test accuracy in percent: the predicted class is the column with the
# largest predictive mean, the true class the column with the largest target
test_score = (
    (np.argmax(test_mean, axis=1) == np.argmax(test_labels_bin, axis=1)).sum()
    / len(test_data) * 100
)

In [14]:
# Result log — 5K: all parameters optimized
# (NOTE(review): "5K" presumably refers to N = 5000 training samples — confirm)
# rbf.variance 0.1789339259732666
# rbf.lengthscale 7.177267570983543
# Gaussian_noise.variance 0.04144659858222422
test_score

85.83

In [16]:
# Result log — 1K: all parameters optimized
# (NOTE(review): "1K" presumably refers to N = 1000 training samples — confirm)
# rbf.variance 0.2774811544636858
# rbf.lengthscale 8.578422490836331
# Gaussian_noise.variance 0.03759184663277108
test_score

81.55

In [26]:
# Result log — 1K: no optimized kernel (0.1 variance)
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

76.59

In [22]:
# Result log — 1K: no optimized kernel (1000 variance)
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

83.67

In [45]:
# Result log — 1K: no optimized kernel
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

76.55999999999999

In [30]:
# Result log — 1K: optimized kernel
# rbf.variance 0.32483967716463163
# rbf.lengthscale 6.8153834810410965
# test score
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

80.93

In [37]:
# Result log — train score
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

100.0

In [44]:
# Result log — train score 0.01 noise
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

100.0

In [12]:
# Result log — 10K: no optimized kernel
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

83.67

In [28]:
# Result log — 10K: optimized kernel
# NOTE(review): `score` is not assigned in any visible cell (the scoring cells
# write train_score / test_score); this looks like a result kept from an
# earlier revision — confirm before re-running.
score

86.65