In [1]:
import itertools
import numpy as np
from numba import njit

from utilities import chol_params_to_lower_triangular_matrix
from utilities import cov_matrix_to_sdcorr_params
from utilities import number_of_triangular_elements_to_dimension

from jax import jacfwd
from kernel_transformations_jax import covariance_from_internal as covariance_from_internal_jax
from kernel_transformations_jax import sdcorr_from_internal as sdcorr_from_internal_jax
from kernel_transformations_jax import probability_from_internal as probability_from_internal_jax

from numpy.testing import assert_array_almost_equal

$$
\tilde{\text{vec}}
\left (
\begin{matrix}
(0,0)  &        &        &        \\
(1, 0) & (1,1)  &        &        \\
(2, 0) & (2, 1) & (2, 2) &        \\
(3, 0) & (3, 1) & (3, 2) & (3, 3) \\
\end{matrix}
\right ) =:
\tilde{\text{vec}}(L) = 
\big ( (0,0), (1,0), (1,1), (2,0), (2, 1), (2, 2), (3, 0), (3, 1), (3, 2), (3, 3) \big )^\top := v
$$

The following two functions allow us to move between these two representations of a (lower-triangular) matrix in a bijective fashion.

In [2]:
# The lookup tables below cover the lower-triangular entries of matrices with
# up to ``MAX_VALUE - 1`` rows (row indices 0 .. MAX_VALUE - 2).
MAX_VALUE = 500

# Row index (SEQUENCE_I) and column index (SEQUENCE_J) of every entry of the
# lower triangle, enumerated row by row: (0,0), (1,0), (1,1), (2,0), ...
# Both tables must have the same length so that every flat index that is valid
# for one is also valid for the other; ``np.tril_indices`` guarantees this and
# yields exactly the row-major ordering used by the vectorization.
SEQUENCE_I, SEQUENCE_J = np.tril_indices(MAX_VALUE - 1)

@njit
def _vectorized_index_to_matrix_index(index):
    """Map a flat index of the vectorized lower triangle to ``(row, column)``.

    The position is looked up in the module-level tables ``SEQUENCE_I`` and
    ``SEQUENCE_J``; ``index`` must therefore lie within those tables.

    Returns:
        np.ndarray: Length-2 array ``[row, column]``.

    """
    row = SEQUENCE_I[index]
    col = SEQUENCE_J[index]
    return np.array([row, col])

@njit
def _matrix_index_to_vectorized_index(i, j):
    return int(i * (i + 1) / 2) +  j

# Round-trip sanity check: flat index -> (row, column) -> flat index.
# Only the first 100 flat indices are exercised here.
for k in range(100):
    assert _matrix_index_to_vectorized_index(*_vectorized_index_to_matrix_index(k)) == k

## Derivative of ``covariance_from_internal``

The graph we want to differentiate looks as follows

$$\tilde{\text{vec}}(L) \to L \to L L^\top =: \Sigma \to \tilde{\text{vec}}(\Sigma) \,,$$
where $L$ is the Cholesky factor of the covariance matrix $\Sigma$.
Let us define a function $f: \mathbb{R}^m \to \mathbb{R}^m, x \mapsto f(x)$, which takes an internal vector $x$ (of correct dimension) and transforms it to the covariance matrix as depicted above.
We want to find the Jacobian of $f$, i.e.
$$
J(f) = \left ( \frac{\partial \, f_i}{\partial \, x_j} \right )_{i, j = 1, \dots, m}
$$

We tackle this problem by finding an explicit expression for ${\partial f_i}/{\partial x_j}$ and then looping over $i,j = 1,\dots, m$.

Henceforth let $i, j$ be given and let $\Sigma = (\sigma_{i, j})$.
Note that for each $j$ we can find a unique index tuple $(a, b) = (a(j), b(j))$ such that $\tilde{\text{vec}}(L)_j = L_{a, b}$ and equivalently we find $(n, m) = (n(i), m(i))$ such that $\tilde{\text{vec}}(\Sigma)_i = \sigma_{n, m}$. Also note that for these indices it always holds that $a \geq b$ and $n \geq m$.

Now note again that with $L = \left [ \begin{matrix} \ell_1 \\ \vdots \\ \ell_d \end{matrix} \right ]$, where $\ell_k$ denotes the $k$-th row of $L$ and $d$ the number of rows of $L$, we have $\sigma_{k, l} = \ell_k \bullet \ell_l$.
Hence we get

$$
\frac{\partial \, f_i}{\partial \, x_j} = 
\frac{\partial}{\partial L_{a, b}} \left ( \ell_n \bullet \, \ell_m \right ) = 
\frac{\partial}{\partial L_{a, b}} \left ( \sum_{k=1}^{m} L_{n, k} L_{m, k} \right ) = 
\mathbb{1}(b \leq m) \left [ \mathbb{1}(a = n) L_{m, b} + \mathbb{1}(a = m) L_{n, b} \right ]
$$

In [3]:
def derivative_covariance_from_internal(internal_values):
    """Jacobian of the covariance transformation, assembled entry by entry.

    Row ``i`` of the result refers to the ``i``-th entry of the vectorized
    covariance matrix, column ``j`` to the ``j``-th internal parameter (the
    vectorized Cholesky factor).

    Args:
        internal_values: 1d sequence of internal parameters; it is handed to
            ``chol_params_to_lower_triangular_matrix`` to build the Cholesky
            factor.

    Returns:
        np.ndarray: Square array with ``len(internal_values)`` rows and columns.

    """
    n_params = len(internal_values)
    chol = chol_params_to_lower_triangular_matrix(internal_values)
    jacobian = np.zeros((n_params, n_params))

    for out_pos in range(n_params):
        # (n, m): matrix position of the covariance entry being differentiated.
        n, m = _vectorized_index_to_matrix_index(out_pos)

        for in_pos in range(n_params):
            # (a, b): matrix position of the Cholesky entry we differentiate by.
            a, b = _vectorized_index_to_matrix_index(in_pos)
            jacobian[out_pos, in_pos] = _derivative_covariance_from_internal_inner(
                n, m, a, b, chol
            )

    return jacobian

In [4]:
def _derivative_covariance_from_internal_inner(n, m, a, b, chol):
    deriv = 0
    
    if b <= m:
        if a == n:
            deriv += chol[m, b]
        if a == m:
            deriv += chol[n, b]
        
    return deriv

## Example / Testing

In [5]:
# Reference Jacobian obtained with JAX (jacfwd); used below to check the
# hand-written derivative. Note that the name J is rebound in later sections.
J = jacfwd(covariance_from_internal_jax)

In [6]:
def get_random_internal(dim, seed=0):
    """Draw a random vectorized lower-triangular matrix.

    Reseeds NumPy's global random state with ``seed``, draws a ``dim x dim``
    standard-normal matrix and returns the entries of its lower triangle
    (diagonal included) in row-major order.

    Args:
        dim (int): Number of rows/columns of the matrix.
        seed (int): Seed passed to ``np.random.seed``.

    Returns:
        np.ndarray: 1d array with ``dim * (dim + 1) / 2`` entries.

    """
    np.random.seed(seed)
    draws = np.random.randn(dim, dim)
    rows, cols = np.tril_indices(dim)
    return draws[rows, cols]

In [7]:
# Compare the closed-form Jacobian with the JAX reference for dim = 10, ..., 49.
# assert_array_almost_equal raises on the first mismatch, so this cell only
# completes if all dimensions agree.
for dim in range(10, 50):
    internal = get_random_internal(dim)

    jax_deriv = J(internal)

    my_deriv = derivative_covariance_from_internal(internal)

    assert_array_almost_equal(jax_deriv, my_deriv)



## Timeit

In [8]:
# Sample input for the timing cells below (dim=20).
internal = get_random_internal(20)

In [9]:
# Timing of the JAX reference Jacobian on the sample input.
%timeit J(internal)

13.5 ms ± 65.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# Timing of the hand-written, loop-based Jacobian on the same input.
%timeit derivative_covariance_from_internal(internal)

80.8 ms ± 630 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Derivative of ``sdcorr_from_internal``

The graph we want to differentiate looks as follows

$$\tilde{\text{vec}}(L) \to L \to L L^\top =: \Sigma \to \mathcal{P} \to \tilde{\text{vec}}(\mathcal{P}) \,,$$
where $L$ is the Cholesky factor of the covariance matrix $\Sigma$ and $\mathcal{P}$ denotes the correlation matrix.
Let us define a function $f: \mathbb{R}^m \to \mathbb{R}^m, x \mapsto f(x)$, which takes an internal vector $x$ (of correct dimension) and transforms it to the correlation matrix as depicted above.
We want to find the Jacobian of $f$, i.e.
$$
J(f) = \left ( \frac{\partial \, f_i}{\partial \, x_j} \right )_{i, j = 1, \dots, m}
$$

We tackle this problem by finding an explicit expression for ${\partial f_i}/{\partial x_j}$ and then looping over $i,j = 1,\dots, m$.

Henceforth let $i, j$ be given and let $\Sigma = (\sigma_{i, j})$ as well as $\mathcal{P} = (\rho_{i,j})$.
Note that for each $j$ we can find a unique index tuple $(a, b) = (a(j), b(j))$ such that $\tilde{\text{vec}}(L)_j = L_{a, b}$ and equivalently we find $(n, m) = (n(i), m(i))$ such that $\tilde{\text{vec}}(\mathcal{P})_i = \rho_{n, m}$. Also note that for these indices it always holds that $a \geq b$ and $n \geq m$.

Now note again that with $L = \left [ \begin{matrix} \ell_1 \\ \vdots \\ \ell_d \end{matrix} \right ]$, where $\ell_k$ denotes the $k$-th row of $L$ and $d$ the number of rows of $L$, we have $\sigma_{k, l} = \ell_k \bullet \ell_l$.

But by definition of a correlation matrix we thus have

$$
\rho_{n, m} = 
\frac{\sigma_{n, m}}{\sqrt{\sigma_{n, n}} \sqrt{\sigma_{m, m}}} =
\frac{\ell_n \, \bullet \ell_m}{||\ell_n||_2 \, ||\ell_m||_2} \,.
$$

Hence,

$$
\frac{\partial \, f_i}{\partial \, x_j} = 
\frac{\partial}{\partial L_{a(j), b(j)}} \rho_{n(i), m(i)} =
\frac{\partial}{\partial L_{a, b}} \left ( \frac{\ell_n \, \bullet \ell_m}{||\ell_n||_2 \, ||\ell_m||_2}
\right ) =: 
(\star)$$

To solve for $(\star)$ let us consider first

$$
(\star \star) :=
\frac{\partial}{\partial L_{a, b}} \left ( \ell_n \, \bullet \ell_m \right ) = 
\frac{\partial}{\partial L_{a, b}} \left ( \sum_{k=1}^{m} L_{n,k} L_{m, k} \right ) = 
\mathbb{1}(b \leq m, a = n) L_{m, b} + \mathbb{1}(b \leq m, a = m) L_{n, b}
$$

and 

$$
(\bullet) :=
\frac{\partial}{\partial L_{a, b}} ||\ell_n||_2 = \frac{\partial}{\partial L_{a, b}} \sqrt{\sum_{k=1}^n L_{n, k}^2} = \mathbb{1}(b \leq n, a = n) L_{n, b} ||\ell_n||_2^{-1}
$$


\begin{align}
(\bullet \, \bullet) :=
\frac{\partial}{\partial L_{a, b}} \left ( ||\ell_n||_2 \, ||\ell_m||_2 \right ) &= 
\frac{\partial}{\partial L_{a, b}} \left ( ||\ell_n||_2 \right ) ||\ell_m||_2 + 
\frac{\partial}{\partial L_{a, b}} \left ( ||\ell_m||_2 \right ) ||\ell_n||_2 \\
&= \mathbb{1}(b \leq n, a = n) L_{n, b} \frac{||\ell_m||_2}{||\ell_n||_2} + 
\mathbb{1}(b \leq m, a = m) L_{m, b}  \frac{||\ell_n||_2}{||\ell_m||_2}
\end{align}

Then, with $\alpha_{n, m} := ||\ell_n||_2 \, ||\ell_m||_2$, we get by applying the quotient rule

$$
(\star) = \frac{1}{\alpha_{n, m}^2} \left ( \alpha_{n, m} \times (\star \star) - (\ell_n \, \bullet \ell_m) \times (\bullet \, \bullet) \right )
$$

In [11]:
def derivative_sdcorr_from_internal(internal_values):
    """Jacobian of the correlation transformation, assembled entry by entry.

    Row ``i`` of the result refers to the ``i``-th entry of the vectorized
    correlation matrix (diagonal included), column ``j`` to the ``j``-th
    internal parameter (the vectorized Cholesky factor).

    NOTE(review): the comparison cell in this notebook reports a mismatch with
    the JAX reference for every tested dimension. Possibly the reference
    orders its output differently (e.g. standard deviations first, then
    correlations) -- confirm against ``sdcorr_from_internal``.

    Args:
        internal_values: 1d sequence of internal parameters; it is handed to
            ``chol_params_to_lower_triangular_matrix`` to build the Cholesky
            factor.

    Returns:
        np.ndarray: Square array with ``len(internal_values)`` rows and columns.

    """
    n_params = len(internal_values)
    chol = chol_params_to_lower_triangular_matrix(internal_values)
    jacobian = np.zeros((n_params, n_params))

    for out_pos in range(n_params):
        # (n, m): matrix position of the correlation entry being differentiated.
        n, m = _vectorized_index_to_matrix_index(out_pos)

        for in_pos in range(n_params):
            # (a, b): matrix position of the Cholesky entry we differentiate by.
            a, b = _vectorized_index_to_matrix_index(in_pos)
            jacobian[out_pos, in_pos] = _derivative_sdcorr_from_internal_inner(
                n, m, a, b, chol
            )

    return jacobian

In [12]:
def _derivative_sdcorr_from_internal_inner(n, m, a, b, chol):
    ln_norm = np.sqrt(np.sum(chol[n] ** 2))
    lm_norm = np.sqrt(np.sum(chol[m] ** 2))
    
    alpha = ln_norm * lm_norm
    
    # \ell_n \bullet \ell_m
    dotprod = np.dot(chol[n], chol[m])
    
    
    # (\star \star)
    left = 0
    if b <= m:
        if a == n:
            left += chol[m, b]
        if a == m:
            left += chol[n, b]
            
    # (\bullet \bullet)
    right = 0
    if b <= n and a == n:
        right += chol[n, b] * lm_norm / ln_norm
    if b <= m and a == m:
        right += chol[m, b] * ln_norm / lm_norm

    deriv = (alpha * left - dotprod * right) / (alpha ** 2)
    return deriv

## Example / Testing

In [13]:
# Rebind J to the JAX reference Jacobian of the sdcorr transformation.
J = jacfwd(sdcorr_from_internal_jax)

In [15]:
# Collect every dimension for which the closed-form Jacobian disagrees with
# the JAX reference instead of stopping at the first failure.
# NOTE(review): the recorded output lists all tested dimensions, i.e. the two
# never agree. Possibly the JAX function returns standard deviations followed
# by correlations rather than the vectorized correlation matrix -- confirm.
bad = []
for dim in range(10, 25):
    try:
        internal = get_random_internal(dim)
        jax_deriv = J(internal)
        my_deriv = derivative_sdcorr_from_internal(internal)
        assert_array_almost_equal(jax_deriv, my_deriv)
    except AssertionError:
        bad.append(dim)
        
print(bad)

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


## Timeit

In [16]:
# Sample input for the timing cells below (dim=20).
internal = get_random_internal(20)

In [17]:
# Timing of the JAX reference Jacobian on the sample input.
%timeit J(internal)

24.6 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Timing of the hand-written, loop-based Jacobian on the same input. The inner
# helper recomputes both row norms on every call.
%timeit derivative_sdcorr_from_internal(internal)

1.75 s ± 116 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Derivative of ``probability_from_internal``

Let $f: \mathbb{R}^m \to \mathbb{R}^m, x \mapsto \frac{1}{x^\top 1} x$, with $1$ denoting a vector of all ones. Define $\sigma := x^\top 1 = \sum_k x_k$. Then,
$$
J(f)(x) = \frac{1}{\sigma} I_m - \frac{1}{\sigma^2} x 1^\top \,,
$$
where $I_m$ denotes the $m \times m$ identity matrix.

In [19]:
def derivative_probability_from_internal(internal_values):
    """Jacobian of ``x -> x / sum(x)``.

    Entry ``(i, j)`` equals ``1(i == j) / sigma - x_i / sigma ** 2`` with
    ``sigma = sum(x)``.

    Args:
        internal_values (np.ndarray): 1d array of internal parameters.

    Returns:
        np.ndarray: Square array with ``len(internal_values)`` rows and columns.

    """
    n_params = len(internal_values)
    total = np.sum(internal_values)

    identity_part = np.eye(n_params) / total

    # x_i / sigma^2 is constant along row i, so broadcast it as a column.
    row_terms = internal_values / (total ** 2)

    return identity_part - row_terms[:, np.newaxis]

## Example / Testing

In [20]:
# Rebind J to the JAX reference Jacobian of the probability transformation.
J = jacfwd(probability_from_internal_jax)

In [21]:
# Collect every dimension for which the closed-form Jacobian disagrees with
# the JAX reference instead of stopping at the first failure.
# NOTE(review): the recorded output shows a handful of failing dimensions.
# The comparison uses an absolute tolerance while the entries scale with
# 1 / sum(x) and 1 / sum(x)**2; a sum close to zero and/or reduced precision
# on the JAX side may explain the failures -- confirm.
bad = []
for dim in range(10, 50):
    try:
        internal = get_random_internal(dim)
        jax_deriv = J(internal)
        my_deriv = derivative_probability_from_internal(internal)
        assert_array_almost_equal(jax_deriv, my_deriv)
    except AssertionError:
        bad.append(dim)
        
print(bad)

[16, 17, 21, 34, 39]


## Timeit

In [22]:
# Sample input for the timing cells below (dim=20).
internal = get_random_internal(20)

In [23]:
# Timing of the JAX reference Jacobian on the sample input.
%timeit J(internal)

3.78 ms ± 42 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
# Timing of the vectorized closed-form Jacobian on the same input.
%timeit derivative_probability_from_internal(internal)

741 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
