In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_kernels
from sklearn.metrics import mean_squared_error
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import fetch_mldata
from sklearn.utils import check_random_state
from sklearn.kernel_approximation import Nystroem
from nystrom import nystrom_kernel
from scipy.spatial.distance import pdist

from time import time

In [2]:
# data = fetch_mldata('regression-datasets house_16H').data
# n
# X = data[:, [1]]
# y = data[:, 0]

# print(X, y)

In [3]:
n_train_samples = 1e4
n_test_samples = 1e3
d_dims = 1
random_state = 123

generator = check_random_state(random_state)
x_train = generator.randn(int(n_train_samples))
y_train = np.sin(x_train) * 0.1 * generator.randn(int(n_train_samples))

x_test = generator.randn(int(n_test_samples))
y_test = np.sin(x_test) * 0.1 * generator.randn(int(n_test_samples))

x_train = x_train[:, np.newaxis]
x_test = x_test[:, np.newaxis]
y_train = y_train[:, np.newaxis]
y_test = y_test[:, np.newaxis]

print('Size of x_train: {}'.format(x_train.shape))
print('Size of x_train: {}'.format(x_test.shape))

Size of x_train: (10000, 1)
Size of x_train: (1000, 1)


In [18]:
n_samples = 100  # number of samples
k_rank = 10      # rank for rsvd
n_components = 100 # number of components to keep
lam = 1e-3  # regularization parameter
kernel = 'rbf'
sigma = np.mean(pdist(x_train, metric='euclidean'))
gamma = 1 / (2 * sigma**2)

### Scikit-Learn (Naive)

In [19]:
t0 = time()

krr_model = KernelRidge(alpha=lam, kernel=kernel, gamma=gamma)
krr_model.fit(x_train, y_train)

t1 = time() - t0
print('Time taken for normal: {:4f} secs'.format(t1))

Time taken for normal: 10.094844 secs


In [20]:
y_pred_normal = krr_model.predict(x_test)

error_normal = mean_squared_error(y_pred_normal.squeeze(),
                                  y_test.squeeze())
print(error_normal)

0.00414979449472


### Scikit-Learn (Nystrom Approximation)

In [27]:
t0 = time()
nystrom = Nystroem(kernel=kernel, 
                   n_components=n_components,
                   gamma=gamma,
                   random_state=random_state)

L = nystrom.fit_transform(x_train)

t1 = time() - t0
print('Time taken for Nystrom Approximation: {:.4f} secs'.format(t1))

print('Size of L: {}'.format(L.shape))

Time taken for Nystrom Approximation: 0.0440 secs
Size of L: (10000, 100)


In [22]:
# KRR Training

t0 = time()

temp = L.T.dot(y_train)
temp = np.linalg.solve(lam * np.eye(L.shape[1]) + L.T.dot(L), temp)
weights_nystrom = y_train - L.dot(temp)
weights_nystrom /= lam

t1 = time() - t0
print('Time taken for KRR Training: {:.4f} secs'.format(t1))

Time taken for KRR Training: 0.0094 secs


In [23]:
# KRR Predictions
K_test = pairwise_kernels(x_train, Y=x_test, metric='rbf', gamma=gamma)
y_pred_nystrom = K_test.T.dot(weights_nystrom)
error_nystrom = mean_squared_error(y_pred_nystrom.squeeze(),
                                   y_test.squeeze())
print(error_nystrom)

print(np.abs(error_normal - error_nystrom))

0.00414984036403
4.58693137553e-08


### Nystrom w/ RSVD

In [24]:
from sklearn.utils.extmath import randomized_svd
# KRR Training

t0 = time()

k_rank = 50

rnd = check_random_state(random_state)
inds = rnd.permutation(n_samples)
basis_inds = inds[:n_components]
basis = x_train[basis_inds]

basis_kernel = pairwise_kernels(basis, metric=kernel,
                                gamma=gamma)

# RSVD
U, S, V = randomized_svd(basis_kernel, k_rank)

S = np.maximum(S, 1e-12)
normalization = np.dot(U / np.sqrt(S), V)
components = basis
component_indices = inds

embedding = pairwise_kernels(x_train, Y=components,
                             metric=kernel, gamma=gamma)

Lr = np.dot(embedding, normalization)

t1 = time() - t0
print('Time taken for Nystrom Approximation: {:.4f} secs'.format(t1))


Time taken for Nystrom Approximation: 0.0361 secs


In [25]:
t0 = time()

temp = Lr.T.dot(y_train)
temp = np.linalg.solve(lam * np.eye(Lr.shape[1]) + Lr.T.dot(Lr), temp)
weights_rnystrom = y_train - Lr.dot(temp)
weights_rnystrom /= lam

t1 = time() - t0
print('Time taken for KRR Training: {:.4f} secs'.format(t1))

Time taken for KRR Training: 0.0096 secs


In [26]:
# KRR Predictions
K_test = pairwise_kernels(x_train, Y=x_test, metric='rbf', gamma=gamma)
y_pred_rnystrom = K_test.T.dot(weights_rnystrom)
error_rnystrom = mean_squared_error(y_pred_rnystrom.squeeze(), y_test.squeeze())
print(error_nystrom)

print(np.abs(error_normal - error_rnystrom))
print(np.abs(error_nystrom - error_rnystrom))

0.00414984036403
2.49666620156e-08
2.09026517397e-08


### Training

In [8]:
n_samples = 100  # number of samples
n_components = 100 # number of components to keep
lam = 1e-3  # regularization parameter

t0 = time()
K = pairwise_kernels(x_train, metric='rbf')
t1 = time() - t0
print('Time taken for kernel mat: {:.4f} secs'.format(t1))

print('Shape of K: {}'.format(K.shape))

Time taken for kernel mat: 1.7629 secs
Shape of K: (10000, 10000)


#### Nystrom Approximation

In [5]:
# -------------------
# Randomized
# -------------------

n_col_indices = 1000 # number of columns to sample
n_components = 1000  # rank
random_state = 123  # reproducibility
svd = 'randomized'  # svd algorithm

U_approx, D_approx, W_approx, C = nystrom_kernel(
    K, n_col_indices, n_components=n_components, 
    random_state=random_state, 
    svd=svd)

print('Size of U_approx: {}'.format(U_approx.shape))
print('Size of D_approx: {}'.format(D_approx.shape))
print('Size of W_approx: {}'.format(W_approx.shape))
print('Size of C: {}'.format(C.shape))

Size of U_approx: (10000, 1000)
Size of D_approx: (1000, 1000)
Size of W_approx: (1000, 1000)
Size of C: (10000, 1000)


In [6]:
K_approx = U_approx.dot(D_approx).dot(U_approx.T)
K_approx2 = C.dot(np.linalg.pinv(W_approx)).dot(C.T)

err = np.linalg.norm(K - K_approx, 'fro')
err_2 = np.linalg.norm(K - K_approx2, 'fro')
print('Error ({}): {:.3e}'.format(svd, err))
print('Error ({}) CWC.T: {:.3e}'.format(svd, err_2))

Error (randomized): 9.779e-03
Error (randomized) CWC.T: 2.956e-01


Time taken for normal: 43.450791 secs


In [8]:
t0 = time()
W_pinverse = np.linalg.pinv(W_approx)

inverse = np.linalg.pinv(np.eye(n_col_indices) + 
                         W_pinverse.dot(C.T).dot(C))
W_inverse = np.linalg.inv(W_approx)

mat_prod = C.dot(inverse).dot(W_inverse).dot(C.T)
weights_nystrom = (1/lam) * (lam * np.eye(int(n_train_samples)) - mat_prod).dot(y_train)

t1 = time() - t0
print('Time taken for normal: {:4f} secs'.format(t1))

Time taken for normal: 7.785578 secs


In [48]:
print('Size of weights: {}'.format(weights.shape))
print('Size of weights (nystrom): {}'.format(weights_nystrom.shape))
print('Size of y_train: {}'.format(y_train.shape))

Size of weights: (10000, 1)
Size of weights (nystrom): (10000, 1)
Size of y_train: (10000, 1)


In [9]:
from sklearn.metrics import mean_squared_error


In [49]:

K_test = pairwise_kernels(x_train, Y=x_test, metric='rbf')
print(K.shape, weights.shape)
y_pred = K_test.T.dot(weights)

print(y_pred.shape, y_test.shape)
error_normal = mean_squared_error(y_pred.squeeze(), y_test.squeeze())
print(error_normal)

(10000, 10000) (10000, 1)
(1000, 1) (1000, 1)
0.00416038639092


In [11]:
K_test = pairwise_kernels(x_train, Y=x_test, metric='rbf')
y_pred_ny = K_test.T.dot(weights_nystrom)

print(y_pred_ny.shape, y_test.shape)
error_nystrom = mean_squared_error(y_pred_ny, y_test)
print(error_nystrom)

(1000, 1) (1000, 1)
21452478184.9
