In [23]:
import numpy as np
import warnings 
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
import scipy as scio
from scipy.spatial.distance import pdist
from scipy.linalg import cho_factor, cho_solve, cholesky
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
from sklearn.utils import check_array, check_random_state
from sklearn.linear_model.ridge import _solve_cholesky_kernel as kernel_solve
from time import time
from sklearn.decomposition import TruncatedSVD

%matplotlib inline
import matplotlib.pyplot as plt

# Install an 'ignore' filter with no message/category/module restriction, so
# every warning raised from this point on is suppressed.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# imported above -- confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

## Generate Data

In [24]:
# Build a synthetic regression problem, split it into train/test partitions
# and centre the targets on the training mean.
random_state = 123
n_samples = 10000
n_features = 100

# create data
x_data, y_data = make_regression(n_samples=n_samples,
                                 n_features=n_features,
                                 random_state=random_state)

# split data into training and testing; train_percent is passed as train_size
train_percent = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, train_size=train_percent,
    random_state=random_state
)

# centre both target vectors using the mean of the *training* targets only
y_mean = np.mean(y_train)

y_train -= y_mean
y_test -= y_mean

# From here on n_samples is the number of training rows. It has to be an
# integer: x_train.shape is a (rows, cols) tuple, and handing that tuple to
# np.random.permutation would shuffle its two entries instead of producing a
# permutation of the row indices.
n_samples = x_train.shape[0]
print('Shape of X: ', x_train.shape, x_test.shape)

Shape of X:  (2000, 100) (8000, 100)


## Nystrom Approximation of a Kernel Matrix

#### Kernel Matrix of Data

In [25]:
# Length scale of the RBF kernel: the mean pairwise Euclidean distance
# between the training points.
sigma = np.mean(
    pdist(x_train, metric='euclidean')
)

# gamma is the coefficient rbf_kernel expects, derived from the length scale
gamma = 1 / (2 * sigma**2)

# kernel matrix of the training data against itself
K = rbf_kernel(
    x_train,
    gamma=gamma,
)

print('Size of Kernel matrix: ', K.shape)

Size of Kernel matrix:  (2000, 2000)


#### Sampling

In [26]:
# Uniform sampling without replacement: a seeded shuffle of the row/column
# indices of K. K.shape[0] is used as the count so it is always an integer,
# and the RandomState is seeded from random_state so the sample is reproducible.
rng = np.random.RandomState(random_state)
indices = rng.permutation(K.shape[0])

# choose the number of column indices
n_column_indices = 1200

# the first n_column_indices entries of the shuffle are the sampled columns
column_indices = indices[:n_column_indices]

# choose the columns randomly from the matrix
C = K[:, column_indices]

print('Size of the sampled K matrix, C: ', C.shape)

# rows of C at the same sampled indices form the m-by-m intersection block
W = C[column_indices, :]

print('Size of m-by-m intersection matrix, W: ', W.shape)

Size of the sampled K matrix, C:  (2000, 1200)
Size of m-by-m intersection matrix, W:  (1200, 1200)


#### Truncated SVD

In [35]:
# Rank-k truncated SVD of the intersection matrix W with the ARPACK solver,
# timed with wall-clock time.
k_components = 1000

t0 = time()

normal_svd = TruncatedSVD(
    n_components=k_components,
    algorithm='arpack',
    random_state=123,
)

# NOTE(review): fit_transform returns W projected onto the fitted components,
# not the singular vectors themselves (those are in normal_svd.components_).
# Confirm this is what later cells expect from a variable named V.
V = normal_svd.fit_transform(W)

# singular values placed on the diagonal of a square matrix
D = np.diag(normal_svd.singular_values_)

t1 = time() - t0
print('Time taken for normal SVD: {:.4f} seconds'.format(t1))

print('Size of V:', V.shape)
print('Size of D:', D.shape)

Time taken for normal SVD: 10.7687 seconds
Size of V: (1200, 1000)
Size of D: (1000, 1000)


In [36]:
# Rank-k truncated SVD of W with the randomized solver, timed with wall-clock
# time for comparison against the ARPACK run.
t0 = time()

r_svd = TruncatedSVD(n_components=k_components,
                     algorithm='randomized',
                     random_state=123)

# NOTE(review): fit_transform returns W projected onto the fitted components,
# not the singular vectors themselves (those are in r_svd.components_).
# Confirm this is what later cells expect from rV.
rV = r_svd.fit_transform(W)

# Take the singular values from the randomized estimator fitted in this cell;
# reading them from normal_svd would pair rV with the ARPACK spectrum.
rD = np.diag(r_svd.singular_values_)

t1 = time() - t0
print('Time taken for randomized SVD: {:.4f} seconds'.format(t1))


print('Size of rV:', rV.shape)
print('Size of rD:', rD.shape)

Time taken for randomized SVD: 1.0368 seconds
Size of rV: (1200, 1000)
Size of rD: (1000, 1000)
