# Testing regularization approaches

Here's a notebook for playing with different penalties

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from matplotlib import pyplot as plt
from dement import DemEnt
import numpy as np
from scipy.optimize import minimize, check_grad
from scipy.special import erf

Initialize a model
--
We'll simulate a demographic history that suffers a crash then an exponential recovery

Define the time axis $\mathbf{t}$ (including the boundary at infinity) and the population size trajectory $\mathbf{y}$

In [None]:
# Time grid: 0, then 100 log-spaced points from 10^0 to 10^4, then the
# boundary at infinity that closes the final epoch.
t = np.concatenate(([0], np.logspace(0, 4, 100), [np.inf]))

# Alternative scenario (constant size), kept for experimentation:
# y_true = 10000 * np.ones(len(t) - 1)

# One value per epoch, evaluated at each epoch's left boundary t[:-1]:
# an exponentially decaying term, a baseline of 1, and a step of 5 that
# switches on for t > 1000, all scaled by 1000.
y_true = 1000 * (
    10 * np.exp(-t[:-1] / 100)
    + 1
    + 5 * (t[:-1] > 1000).astype(float)
)

The number of sampled haplotypes $n$:

In [None]:
# Number of sampled haplotypes; passed to DemEnt below.
n = 200

Initialize dement object, and print its docstring:

In [None]:
# Build the DemEnt object from the sample size n, the time grid t, and the
# true size trajectory y_true.
# NOTE(review): later cells read dement.sfs, dement.t, dement.ell and
# dement.plot; presumably the SFS is generated at construction -- confirm.
dement = DemEnt(n, t, y_true)
# Print the docstring reachable from the instance.
print(dement.__doc__)

Inversion
--

### Initialization with constant model

We initialize by fitting a constant population size.
According to WSD's scribbles, the MLE assuming $\eta(t) = \eta_0$ (constant) is $\hat \eta_0 = \frac{S}{2 H_{n-1}}$, where $S$ is the number of segregating sites (the sum of the observed SFS vector) and $H_{n-1} = \sum_{i=1}^{n-1} \frac{1}{i}$ is the $(n-1)$th harmonic number.
This was derived by using the well-known result (cited in Rosen et al.) that the expected SFS for a constant population is given by $\xi_i = \frac{2\eta_0}{i}$ (in units where $\eta$ is the population-scaled mutation rate).
Then the likelihood for $\eta_0$ is a Poisson random field parameterized by the $\xi_i$.

In [None]:
# Number of segregating sites: the sum of the observed SFS vector.
S = dement.sfs.sum()
# Harmonic number H_{n-1} = sum_{i=1}^{n-1} 1/i from the constant-size MLE.
# np.arange excludes its stop value, so the stop is n (the number of sampled
# haplotypes). len(dement.sfs) is deliberately not used as the stop: if the
# SFS has n - 1 entries it would drop the 1/(n-1) term.
# NOTE(review): SFS length is assumed to be n - 1 -- confirm against DemEnt.
H = (1 / np.arange(1, n)).sum()
# Constant-size MLE eta_0 = S / (2 H_{n-1}), replicated once per epoch.
y_constant = (S / 2 / H) * np.ones(len(t) - 1)

### Regularized loss as a penalized log-likelihood
We must deal with the asymptotically constant boundary condition.
Standard regularizers blow up on the infinite epoch.
Let's use Gaussian instead of Lebesgue measure on time to induce integrability: $\mathrm{d}\mu(t) = \exp\left(-\frac{1}{2}\left(\frac{t}{\tau}\right)^2\right)\mathrm{d}t,$
where $\tau$ is the characteristic time to asymptopia (in the code below, ten times the last finite boundary of our time grid).
For example, a modified $L2$ would be
$$
R\left[\eta(t)\right] = \int_0^\infty \eta(t)^2 \mathrm{d}\mu(t) = \int_0^\infty \eta(t)^2 \exp\left(-\frac{1}{2}\left(\frac{t}{\tau}\right)^2\right)\mathrm{d}t.
$$
So the discretized problem is expressed in terms of the error function $\DeclareMathOperator{\erf}{erf}\erf(\cdot)$.

In [None]:
def loss(y, y_prime, lambda_: float):
    """Return the penalized objective ``-dement.ell(y) + lambda_ * R``.

    ``R`` is the generalized KL divergence (a Bregman divergence) of ``y``
    from ``y_prime``, summed over epochs with each epoch weighted by its
    mass under the Gaussian-transformed time measure.

    Parameters
    ----------
    y : array with one value per epoch; must be positive because its log is
        taken.
    y_prime : reference ("prior") array of the same length as ``y``.
    lambda_ : regularization strength multiplying the penalty.

    Reads the module-level ``dement`` object (``dement.t`` and
    ``dement.ell``); ``dement.ell`` is presumably the log-likelihood.
    """
    # Gaussian measure scale: ten times the last finite grid point.
    scale = 10 * dement.t[-2]
    # Integral of exp(-(t / scale)^2 / 2) over each epoch, in closed form via
    # erf; erf(inf) = 1 keeps the final, infinite epoch finite.
    erf_at_boundaries = erf(dement.t / scale / np.sqrt(2))
    epoch_mass = np.diff(erf_at_boundaries) * scale * np.sqrt(np.pi / 2)
    # Pointwise generalized KL divergence between y and y_prime.
    divergence = y * np.log(y / y_prime) - y + y_prime
    penalty = np.sum(divergence * epoch_mass)
    # Alternative penalty (fusion L2), kept for experimentation:
    #     penalty = ((np.diff(y) * epoch_mass[:-1])**2).sum()
    return -dement.ell(y) + lambda_ * penalty

Minimize loss with L-BFGS-B

In [None]:
# Initial regularization strength
lambda_ = 1e1

# The constant-size fit serves as both the optimizer's starting point and
# the first prior in the divergence penalty.
y = y_constant
y_prime = y_constant

# Ten rounds of continuation: each round is warm-started from the previous
# solution, uses that solution as the new prior, and has a tenfold weaker
# penalty.
for _ in range(10):
    result = minimize(loss,
                      y,
                      args=(y_prime, lambda_),
                      # No analytic gradient is supplied (jac=gradF is not
                      # wired up), so SciPy approximates it numerically.
                      method='L-BFGS-B',
                      # maxfun=np.inf lifts the cap on function evaluations;
                      # a tighter ftol (e.g. 1e-10) can be added here.
                      options=dict(maxfun=np.inf),
                      # The lower bound keeps every y strictly positive, as
                      # required by the log inside loss.
                      bounds=[(1e-6, None)] * len(y))
    # Stop immediately, showing the full optimizer report, on failure.
    assert result.success, result
    y = result.x

    # Raw string: '\l' is not a valid escape sequence, so a plain literal
    # triggers an invalid-escape warning. The label text itself is unchanged.
    dement.plot(y, y_label=r'inferred ($\lambda = ${:.2g})'.format(lambda_))

    # update prior and reduce regularization strength
    y_prime = y
    lambda_ /= 10