In [1]:
import numpy as np
from scipy.spatial.distance import cdist
import time
from sklearn.preprocessing import LabelBinarizer
import random as ran
import time
import torch

In [2]:
"""
Superclass for classes of Preconditioners.

"""
class Preconditioner(object):

    def __init__(self, name = ""):
        self.name = name

In [3]:
class InducingPointsHelper(object):

	def __init__(self, seed):
		ran.seed(seed)
		self.name = "InducingPointsHelper"

	"""
	Returns a random selection of points from the given dataset
		X - Dataset
		M - Number of points to be selected
	"""
	def get_random_inducing_points(self, X, M):
		rand = ran.sample(range(0, X.shape[0]), M)
		return X[rand]

In [4]:
class RBF(object):

    def __init__(self, lengthscale=1., variance=1., noise=1.):
        super(RBF, self).__init__()
        self.lengthscale = lengthscale
        self.variance = variance
        self.jitter = 1e-9
        self.noise = noise / self.variance + self.jitter# dividing by variance for new strategy

    def K(self, X1, X2):
        """ GP squared exponential kernel """
        pairwise_dists = cdist(X1, X2, 'euclidean')
        return self.variance*np.exp(-0.5 * (pairwise_dists ** 2) / self.lengthscale ** 2)
        # return pairwise_dists

In [23]:
class RegularPcgPyTorch(object):

    def __init__(self, K, Y, P, init=None, threshold=1e-9, preconInv=None):
        N = np.shape(K)[0]
        if init is None:
            init = np.zeros((N,1))

        if preconInv is None:
            preconInv = np.linalg.inv(P)

        self.K = K
        self.P = P
        self.Y = Y.flatten()

        x = init
        r = Y - np.dot(K, x) #initialise residual gradient
        z = np.dot(preconInv, r)
        p = z

        outerC = 0
        
        # move data to pytorch / cuda
        x = torch.from_numpy(x)
        r = torch.from_numpy(r)
        z = torch.from_numpy(z)
        p = torch.from_numpy(p)
        K = torch.from_numpy(K)
        preconInv = torch.from_numpy(preconInv)
        
        with torch.no_grad():

            while True:
                alpha = r.t().mm(z) / p.t().mm(K).mm(p)
                x = x + alpha*p
                r_prev = r
                r = r - alpha*K.mm(p)
                # norm(residual) <= max(tol*norm(b), atol) might also be an option
                if r.t().mm(r) < threshold*N or outerC>10000:
                    break
                z_prev = z
                z = preconInv.mm(r)
                beta = z.t().mm(r) / z_prev.t().mm(r_prev)
                p = z + beta*p
                outerC = outerC + 1

        self.iterations = outerC
        self.result = x.numpy()

In [6]:
class RegularPcg(object):

	def __init__(self, K, Y, P, init=None, threshold=1e-9, preconInv=None):
		N = np.shape(K)[0]
		if init is None:
			init = np.zeros((N,1))

		if preconInv is None:
			preconInv = np.linalg.inv(P)

		self.K = K
		self.P = P
		self.Y = Y.flatten()

		x = init
		r = Y - np.dot(K, x) #initialise residual gradient
		z = np.dot(preconInv, r)
		p = z

		outerC = 0

		while True:
			alpha = np.dot(r.T, z) / np.dot(p.T,np.dot(K, p))
			x = x + alpha*p
			r_prev = r
			r = r - alpha*np.dot(K,p)
			# norm(residual) <= max(tol*norm(b), atol) might also be an option
			if (np.dot(r.T, r).flatten() < threshold*N or outerC>10000):
				break
			z_prev = z
			z = np.dot(preconInv, r)
			beta = np.dot(z.T, r) / np.dot(z_prev.T, r_prev)
			p = z + beta*p
			outerC = outerC + 1
		
		self.iterations = outerC
		self.result = x

In [7]:
"""
Nystrom Preconditioner
"""
class Nystrom(Preconditioner):

	"""
	Construct preconditioning matrix
		X - Training data
		kern - Class of kernel function
		Xm - Inducing points
		addNoise - Flag indicating whether to add likelihood variance to kernel matrix
	"""
	def __init__(self, X, kern, Xm, addNoise=True):
		super(Nystrom, self).__init__("Nystrom")

		start = time.time()

		self.kern = kern
		self.X = X
		N = np.shape(X)[0]
		M = np.shape(Xm)[0]
		self.M = M
		self.N = N

		Kxm = kern.K(X, Xm)
		Km = kern.K(Xm, Xm)

		self.Kxm = Kxm
		self.Km = Km + 1e-6*np.identity(M) # jitter
		self.KmInv = np.linalg.inv(self.Km)
        
		print('Type:', type(Kxm[0,0]))
		print('Size:', Kxm.shape, self.KmInv.shape)

		if addNoise:
			self.precon = np.dot(np.dot(Kxm,self.KmInv),Kxm.T) + self.kern.noise*np.identity(N)
		else:
			self.precon = np.dot(np.dot(Kxm,self.KmInv),Kxm.T)

		self.duration = time.time() - start

	"""
	Compute inversion of the preconditioner.
	"""
	def get_inversion(self):
		N = np.shape(self.X)[0]
		M = np.shape(self.Km)[0]
		noise = self.kern.noise
		inv_noise = float(1) / noise
		noise_matrix = noise*np.identity(M)

		eigs, eigv = np.linalg.eig(self.KmInv)
		for i in range(len(eigv)):
			if (eigs[i] < self.kern.jitter):
				eigs[i] = self.kern.jitter
			eigs[i] = np.sqrt(eigs[i])

		eigsD = np.diag(eigs)
		left = np.dot(self.Kxm, np.dot(eigv, eigsD))
		right = np.dot(eigsD, np.dot(eigv.T, self.Kxm.T))

		return inv_noise*self.woodbury_inversion(np.identity(N), left, noise_matrix, right)

	"""
	Implementation of Woodbury's matrix inversion lemma.
	"""
	def woodbury_inversion(self, Ainv, U, Cinv, V):
		left_outer = np.dot(Ainv, U)
		right_outer = np.dot(V, Ainv)
		inner = np.linalg.inv(Cinv + np.dot(V, np.dot(Ainv, U)))
		return Ainv - np.dot(left_outer, np.dot(inner, right_outer))

In [8]:
# (train_data, train_labels), (test_data, test_labels) = FashionMNIST()

train_data = np.load('../datasets/export/fashion_mnist/numpy/train_data_fashion_mnist.npy').astype('float32')
test_data = np.load('../datasets/export/fashion_mnist/numpy/test_data_fashion_mnist.npy').astype('float32')
train_labels = np.load('../datasets/export/fashion_mnist/numpy/train_targets_fashion_mnist.npy').astype('float32')
test_labels = np.load('../datasets/export/fashion_mnist/numpy/test_targets_fashion_mnist.npy').astype('float32')

train_data = train_data[:1000]
train_labels = train_labels[:1000]

# Convert one-hot to integers
train_labels = np.argmax(train_labels, axis=1)
test_labels = np.argmax(test_labels, axis=1)

D = train_data[0].reshape(-1).shape[0]
N = len(train_data)

# Flatten the images
train_data = train_data.reshape(-1, D)
test_data = test_data.reshape(-1, D)

In [9]:
def threshold_binarize(data, threshold):
    data_bin = np.where(data>threshold, 1, 0)
    return data_bin

threshold = 10

train_data_bin = threshold_binarize(train_data, threshold).astype('float32')
test_data_bin = threshold_binarize(test_data, threshold).astype('float32')

In [10]:
label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
train_labels_bin = label_binarizer.fit_transform(train_labels).astype('float32')
test_labels_bin = label_binarizer.fit_transform(test_labels).astype('float32')

In [11]:
M = int(np.sqrt(N))
ipHelper = InducingPointsHelper(0)
XmRandom = ipHelper.get_random_inducing_points(train_data,M)

In [12]:
XmRandom.shape

(31, 784)

In [13]:
kernel = RBF(lengthscale=np.sqrt(D/2), variance=1., noise=1.)

In [14]:
# originally adding noise
K = kernel.K(train_data_bin,train_data_bin) + kernel.jitter*np.identity(N)

In [15]:
K

array([[1.        , 0.64482552, 0.68118008, ..., 0.67771352, 0.91575153,
        0.76209529],
       [0.64482552, 1.        , 0.81852192, ..., 0.79587246, 0.63422142,
        0.64895107],
       [0.68118008, 0.81852192, 1.        , ..., 0.89382533, 0.66487032,
        0.64812385],
       ...,
       [0.67771352, 0.79587246, 0.89382533, ..., 1.        , 0.66148676,
        0.63990947],
       [0.91575153, 0.63422142, 0.66487032, ..., 0.66148676, 1.        ,
        0.76697113],
       [0.76209529, 0.64895107, 0.64812385, ..., 0.63990947, 0.76697113,
        1.        ]])

In [16]:
prec = Nystrom(train_data, kernel, XmRandom)

Type: <class 'numpy.float64'>
Size: (1000, 31) (31, 31)


In [17]:
prec.precon.shape

(1000, 1000)

In [18]:
inv = prec.get_inversion()

In [19]:
inv.shape

(1000, 1000)

In [20]:
train_labels[:, None].shape

(1000, 1)

In [21]:
from scipy.sparse.linalg import cg

In [96]:
?? cg

In [24]:
dual_cofs = []
for dim in range(train_labels_bin.shape[1]):
    since = time.time()
    print('Running CG for dim', dim)
    pcg = RegularPcgPyTorch(K, train_labels_bin[:, dim][:, None], prec.precon, threshold=1e-9, preconInv=inv)
    dual_cofs.append(pcg.result)
    # coef, info = cg(K, train_labels_bin[:, dim], tol=1e-5, M=inv) # M=inv
    # dual_cofs.append(coef.reshape((-1, 1)))
    # print('Info:', info)
    print('Done. Iterations:', pcg.iterations)
    print('Time:', time.time() - since)
    
dual_coef = np.hstack(dual_cofs)

Running CG for dim 0
Done. Iterations: 368
Time: 0.7589826583862305
Running CG for dim 1
Done. Iterations: 371
Time: 0.7356622219085693
Running CG for dim 2
Done. Iterations: 380
Time: 0.7248811721801758
Running CG for dim 3
Done. Iterations: 369
Time: 0.7232060432434082
Running CG for dim 4
Done. Iterations: 380
Time: 0.8736495971679688
Running CG for dim 5
Done. Iterations: 319
Time: 0.6026642322540283
Running CG for dim 6
Done. Iterations: 389
Time: 0.7373867034912109
Running CG for dim 7
Done. Iterations: 321
Time: 0.6077091693878174
Running CG for dim 8
Done. Iterations: 333
Time: 0.8163228034973145
Running CG for dim 9
Done. Iterations: 327
Time: 0.6170711517333984


In [25]:
dual_coef.shape

(1000, 10)

In [26]:
K_test = kernel.K(test_data_bin, train_data_bin)
prediction = np.dot(K_test, dual_coef)

In [27]:
prediction.shape

(10000, 10)

In [28]:
score = np.sum(np.equal(np.argmax(prediction, 1), np.argmax(test_labels_bin, 1))) / len(test_data) * 100

In [29]:
score

78.02

In [89]:
score

78.01