In [247]:
import numpy as np
import scipy.sparse
from sklearn.exceptions import NotFittedError
from sklearn.metrics import pairwise_kernels
from sklearn.decomposition import KernelPCA
import torch

import sys
sys.path.append("../")

from src.debiasing.utils import get_design_matrix

In [249]:
def centered_kernel(X, X_index, Y, Y_index, kernel="linear", filter_params=True, n_jobs=None, torch=False, **kwds):
    """Return the kernel matrix between X and Y, mean-centered within groups.

    The plain kernel matrix ``K(X, Y)`` is evaluated first. It is then
    multiplied from the left by the matrix that ``get_design_matrix`` builds
    from ``X_index`` and from the right by the one built from ``Y_index``.
    Group membership of every sample is given by the index arrays; a sample
    that belongs to no group carries the index ``nan`` and is meant to be left
    uncentered.

    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array whose rows are sorted by group index. Rows whose index
        is ``np.nan`` must come last.
    X_index : array [n_samples_a]
        Sorted group index of each row of ``X``; ``np.nan`` for rows that
        belong to no group.
    Y : array [n_samples_b, n_features]
        Second feature array, sorted by group index in the same way as ``X``.
        Rows whose index is ``np.nan`` must come last.
    Y_index : array [n_samples_b]
        Sorted group index of each row of ``Y``; ``np.nan`` for rows that
        belong to no group.
    kernel : string or callable
        If a string, it is passed as ``metric`` to
        ``sklearn.metrics.pairwise_kernels`` and must be one of
        ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',
        'laplacian', 'sigmoid', 'cosine'].
        If a callable, it is invoked as ``kernel(X, Y, torch=torch)`` with
        - X: array [n_samples_a, n_features]
        - Y: array [n_samples_b, n_features]
        - torch: bool.
        When ``torch`` is ``True`` the callable must return a torch.Tensor
        that is differentiable w.r.t. X and Y.
    filter_params : boolean
        Whether to filter invalid parameters or not. Only used when ``kernel``
        is a string.
    n_jobs : int or None, optional (default=None)
        Number of jobs for the pairwise computation. Only used when ``kernel``
        is a string. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context; ``-1`` means using all
        processors.
    torch : boolean
        Forwarded to a callable ``kernel`` only; it has no effect when
        ``kernel`` is a string. Note that inside this function the name
        ``torch`` refers to this flag, not to the torch module.
    **kwds : optional keyword parameters
        Extra parameters forwarded to ``pairwise_kernels`` when ``kernel`` is
        a string. They are not forwarded to a callable ``kernel``.

    Returns
    -------
    K : array or torch.Tensor
        The group-centered kernel matrix, where entry (i, j) relates the ith
        row of X to the jth row of Y. Expected shape is
        [n_samples_a, n_samples_b], assuming ``get_design_matrix`` returns
        square matrices -- TODO confirm against its implementation.

    Raises
    ------
    ValueError
        If ``kernel`` is neither a string nor a callable.

    """
    # Reject unsupported kernel specifications before doing any work.
    if not isinstance(kernel, str) and not callable(kernel):
        raise ValueError("An invalid value is given as the argument `kernel`. "
                         "`kernel` should be either string or callable.")

    if isinstance(kernel, str):
        gram = pairwise_kernels(X, Y, metric=kernel, filter_params=filter_params, n_jobs=n_jobs, **kwds)
    else:
        gram = kernel(X, Y, torch=torch)

    # Apply the per-group matrices on the row (X) side and the column (Y) side.
    left = get_design_matrix(X_index)
    right = get_design_matrix(Y_index)
    return left @ gram @ right


In [250]:
def _test_centered_kernel(seed=0):
    """Sanity-check ``centered_kernel`` on randomly generated grouped data.

    Builds a feature array ``X`` whose first 900 rows are spread over 100
    groups (every group has at least two members) followed by 100 rows with a
    ``nan`` index, and a second array ``Y`` whose rows all have a ``nan``
    index. It then asserts that

    * the centered kernel of ``X`` with itself is symmetric,
    * that matrix has no eigenvalue below ``-1e-8`` (positive semi-definite
      up to numerical tolerance),
    * exactly ``N - num_groups`` eigenvalues exceed ``1e-8``,
    * swapping the arguments transposes the cross kernel.

    Parameters
    ----------
    seed : int or None, optional (default=0)
        Seed for the local random generator, so that a failing run can be
        reproduced. Pass ``None`` for a fresh, non-deterministic draw.

    Raises
    ------
    AssertionError
        If any of the checked properties does not hold.
    """
    # A local generator keeps the check reproducible without touching the
    # global numpy random state.
    rng = np.random.RandomState(seed)

    D = 500
    N = 1000
    N_grouped = 900
    N_nan = N - N_grouped
    num_groups = 100

    # X contains groups of various size and non-centered samples.
    X = rng.randn(N, D)
    X_index = np.concatenate([
        np.repeat(np.arange(num_groups), 2),  # guarantees >= 2 members per group
        rng.randint(0, num_groups, N_grouped - 2 * num_groups),
        np.full([N_nan], np.nan),
    ])
    X_index.sort()  # numpy sorts nan to the end, as centered_kernel requires

    # Y only contains non-centered samples.
    Y = rng.randn(N, D)
    Y_index = np.full(N, np.nan)

    Kxx = centered_kernel(X, X_index, X, X_index, kernel="rbf")
    assert np.allclose(Kxx, Kxx.T), "centered kernel of X with itself is not symmetric"
    eigvals = np.linalg.eigvalsh(Kxx)
    # The kernel matrix must be positive semi-definite (up to round-off).
    assert np.all(eigvals > -1e-8), "centered kernel has a significantly negative eigenvalue"
    # The effective rank of the matrix should be (N - num_groups).
    assert np.sum(eigvals > 1e-8) == N - num_groups, "unexpected effective rank"

    Kxy = centered_kernel(X, X_index, Y, Y_index, kernel="rbf")
    Kyx = centered_kernel(Y, Y_index, X, X_index, kernel="rbf")
    # Swapping the two arguments must transpose the cross-kernel matrix.
    assert np.allclose(Kxy, Kyx.T), "K(X, Y) is not the transpose of K(Y, X)"

In [251]:
# Run the self-check; it raises AssertionError if any checked property fails.
_test_centered_kernel()

In [181]:
# Scratch check: prepend a 1-D index as the first column of a 2-D array.
np.column_stack((np.arange(10), np.arange(20).reshape(10, 2)))

array([[ 0,  0,  1],
       [ 1,  2,  3],
       [ 2,  4,  5],
       [ 3,  6,  7],
       [ 4,  8,  9],
       [ 5, 10, 11],
       [ 6, 12, 13],
       [ 7, 14, 15],
       [ 8, 16, 17],
       [ 9, 18, 19]])

In [5]:
# NOTE(review): centered_kernel has four required positional parameters
# (X, X_index, Y, Y_index), so this bare call raises TypeError on a fresh
# kernel. The output displayed for this cell presumably stems from an earlier
# kernel state -- confirm, then supply the arguments or remove the cell.
centered_kernel()

array([-5.12694827, -4.88818365, -4.81650607, ...,  5.02625115,
        5.03215745,  5.33735244])

In [None]:
import numpy as np
from sklearn.decomposition import KernelPCA
from src.kernel_pca import centered_kernel
N = 100
D = 10
# X: N random samples; consecutive pairs of rows share one group index.
X = np.random.randn(N, D)
X_index = np.repeat(np.arange(N // 2), 2)
# First column holds the group index, the remaining D columns the features.
X_combined = np.concatenate([X_index.reshape(N, 1), X], axis=1)


# Y: an independent draw laid out the same way as X.
Y = np.random.randn(N, D)
Y_index = np.repeat(np.arange(N // 2), 2)
# Built from Y / Y_index (previously used X / X_index by mistake, which made
# Y_combined a copy of X_combined and left Y unused).
Y_combined = np.concatenate([Y_index.reshape(N, 1), Y], axis=1)