TCC>TTC pulse timing
==

In [None]:
%matplotlib inline
from mushi import kSFS
from histories import η
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import msprime
import stdpopsim

In [None]:
# plt.style.use('dark_background')

EUR populations

In [None]:
# Ordered tuple rather than a set: iteration order of a set of strings can
# differ between interpreter sessions (string hash randomization), which would
# shuffle the per-population colors (f'C{idx}') and legend order across runs.
pops = ('CEU', 'FIN', 'GBR', 'IBS', 'TSI')

load 3-SFS for each population

In [None]:
# Read each population's tab-separated 3-SFS table (first column used as the
# row index) and wrap it in a kSFS object, keyed by population label.
ksfs_dict = {}
for pop in pops:
    ksfs_df = pd.read_csv(
        f'1KG/scons_output/{pop}/3-SFS.tsv',
        sep='\t',
        index_col=0,
    )
    # table values become the data matrix, column labels the mutation types
    ksfs_dict[pop] = kSFS(
        X=ksfs_df.values,
        mutation_types=ksfs_df.columns,
    )

time grid of epoch boundaries (measured in generations)

In [None]:
# 200 logarithmically spaced epoch boundaries spanning 1 to 200,000 generations
# (exponent 0 is log10(1))
change_points = np.logspace(0, np.log10(200000), num=200)

masked genome size

In [None]:
# The file's entire contents are parsed as one integer; int() raises
# ValueError if the file holds anything else.
with open('1KG/scons_output/masked_size.tsv', mode='r') as f:
    masked_genome_size = int(f.read())

mutation rate per site per generation

In [None]:
# assumed mutation rate per site per generation
u = 1.3e-8

mutation rate per masked genome per generation

In [None]:
# per-site rate scaled by the masked genome size
# (masked_genome_size is presumably a count of sites — confirm)
μ0 = u * masked_genome_size

mask high frequencies due to ancestral state misidentification

In [None]:
# number of SFS entries to exclude at the low- and high-frequency ends
clip_low = 0
clip_high = 10
# we need a different mask vector for each population because the number of
# haplotypes n (and hence the vector length, n - 1) varies
freq_mask = {}
for pop in pops:
    bins = np.arange(ksfs_dict[pop].n - 1)
    # True for entries kept: drop the first clip_low and the last clip_high
    freq_mask[pop] = (bins >= clip_low) & (bins < ksfs_dict[pop].n - clip_high - 1)

fit parameters

In [None]:
# Keyword-argument bundles that later cells unpack into kSFS.infer_history().

# regularization settings used when fitting the demographic history η
η_regularization = {
    'α_tv': 3e2,
    'α_spline': 1e4,
    'α_ridge': 1e-10,
}

# regularization settings used when fitting the mutation spectrum history μ
μ_regularization = {
    'hard': True,
    'β_rank': 0,
    'β_tv': 7e1,
    'β_ridge': 1e-10,
}

# optimizer stopping / line-search settings shared by every fit
convergence = {
    'tol': 1e-10,
    'max_iter': 10000,
    'max_line_iter': 300,
    'γ': 0.8,
}

generation time for time calibration

In [None]:
# generation time passed as t_gen to the history plot calls to calibrate
# their time axes (units presumably years per generation — confirm)
t_gen = 29

In [None]:
# Fit a demographic history for each population (mutation history inference
# switched off) and draw the total SFS on the left and the fitted η on the right.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
for pop in pops:
    print(pop)
    ksfs = ksfs_dict[pop]
    # reset both histories on the object before fitting
    # (presumably so that no earlier fit is reused — confirm against kSFS)
    ksfs.η = None
    ksfs.μ = None
    ksfs.infer_history(change_points, μ0, infer_μ=False,
                       loss='prf', **η_regularization,
                       **convergence, mask=freq_mask[pop])

    # left panel: total SFS
    plt.sca(axes[0])
    ksfs.plot_total(label=pop)

    # right panel: fitted demographic history
    plt.sca(axes[1])
    ksfs.η.plot(lw=3, label=pop, t_gen=t_gen)
    plt.xlim([1e3, 1e6])
    plt.legend()
plt.show()



loop over a few values of the `β_spline` parameter, which controls how much L2 smoothness is imposed on the first derivative, and fit each EUR population using each parameter value

In [None]:
# One figure row per β_spline value: the left column shows the 'TCC>TTC'
# component of each population's k-SFS, the right column the fitted 'TCC>TTC'
# mutation history.
β_spline_array = (1e-20, 1e3, 1e4)
n_rows = len(β_spline_array)
fig, axes = plt.subplots(n_rows, 2, sharex='col', figsize=(8, 3 * n_rows))
for idx, β_spline in enumerate(β_spline_array):
    print(f'β_spline = {β_spline}')
    # only the top row carries legend labels; every row but the last has its
    # x-axis label cleared
    is_top_row = idx == 0
    clear_xlabel = idx < axes.shape[0] - 1
    for idx2, pop in enumerate(pops):
        print(pop)
        ksfs = ksfs_dict[pop]
        label = pop if is_top_row else None

        # reset μ, then refit it with demographic inference switched off
        ksfs.μ = None
        ksfs.infer_history(change_points, μ0, β_spline=β_spline, infer_η=False,
                           loss='prf', **μ_regularization,
                           **convergence, mask=freq_mask[pop])

        # left column: k-SFS component, color pinned to the population index
        plt.sca(axes[idx, 0])
        ksfs.plot('TCC>TTC', clr=True, label=label,
                  lw=3, alpha=0.5, c=f'C{idx2}')
        plt.xscale('log')
        if is_top_row:
            plt.legend()
        if clear_xlabel:
            plt.xlabel(None)

        # right column: fitted mutation history
        plt.sca(axes[idx, 1])
        ksfs.μ.plot(('TCC>TTC',), t_gen=t_gen, clr=True, label=label,
                    lw=3, alpha=0.5)
        plt.xscale('log')
        if clear_xlabel:
            plt.xlabel(None)
        plt.xlim([1e3, 1e6])
plt.tight_layout()
# NOTE(review): absolute, machine-specific output path
plt.savefig('/Users/williamdewitt/Downloads/europulse.pdf', dpi=300)
plt.show()

Instead of fitting the demography, we can use the demography of Tennessen et al. from `stdpopsim`

In [None]:
# Build a demographic history object from the stdpopsim "OutOfAfrica_2T12"
# model instead of fitting one.
species = stdpopsim.get_species("HomSap")
model = species.get_demographic_model("OutOfAfrica_2T12")
ddb = model.get_demography_debugger()

# evaluate the coalescence rate at time 0 and at every epoch boundary
steps = np.concatenate(([0], change_points))
# first element of the returned pair is used as the rate trajectory;
# num_samples=[0, 2] presumably draws both lineages from the model's second
# (EUR) population — confirm against the model's population order
coalescence_rates = ddb.coalescence_rate_trajectory(
    steps=steps,
    num_samples=[0, 2],
    double_step_validation=False,
)[0]
# the history is parameterized by the inverse coalescence rate
η_Tennessen = η(change_points, 1 / coalescence_rates)

plt.figure(figsize=(4, 4))
η_Tennessen.plot(t_gen=t_gen, label='EUR (Tennessen et al.)')
plt.xlim([1e3, 1e6])
plt.legend()
plt.show()

In [None]:
# Refit every population while supplying η_Tennessen as the demographic
# history, then plot the total SFS, the 'TCC>TTC' k-SFS component, and the
# 'TCC>TTC' mutation history side by side.
fig, axes = plt.subplots(1, 3, sharex='col', figsize=(12, 4))
sfs_ax, component_ax, history_ax = axes
for idx, pop in enumerate(pops):
    ksfs = ksfs_dict[pop]
    color = f'C{idx}'
    ksfs.infer_history(change_points, μ0, η=η_Tennessen, β_spline=1e-20,
                       loss='prf', **η_regularization, **μ_regularization,
                       **convergence, mask=freq_mask[pop])

    plt.sca(sfs_ax)
    ksfs.plot_total(c=color, label=pop)

    plt.sca(component_ax)
    ksfs.plot('TCC>TTC', clr=True, label=pop, lw=3, alpha=0.5, c=color)

    plt.sca(history_ax)
    ksfs.μ.plot(('TCC>TTC',), t_gen=t_gen, clr=True, label=pop,
                lw=3, alpha=0.5)
sfs_ax.legend()
history_ax.set_xlim([1e3, 1e6])
plt.tight_layout()
# NOTE(review): absolute, machine-specific output path
plt.savefig('/Users/williamdewitt/Downloads/europulse.Tennessen.pdf', dpi=300)
plt.show()

The Tennessen et al. demography fits the total SFS quite poorly, and the timing of the TCC pulse seems to be incorrectly scaled as a result. The number of segregating variants observed does not match what's expected under this demography:

In [None]:
# Compare, per population, the observed number of segregating variants with
# the number predicted from that population's own fit, restricted to the
# unmasked frequency entries.
plt.figure(figsize=(3, 3))
for pop in pops:
    ksfs = ksfs_dict[pop]
    keep = freq_mask[pop]
    # observed: sum of the data matrix over unmasked rows and all mutation types
    observed_S = ksfs.X[keep, :].sum()
    # predicted: L @ μ.Z summed the same way, using this population's own μ
    # (previously every population was paired with the CEU mutation history)
    predicted_S = (ksfs.L @ ksfs.μ.Z)[keep, :].sum()
    plt.plot(observed_S, predicted_S, 'o', label=pop)
# identity line: points on it have predicted S equal to observed S
plt.plot([.8e7, 1.05e7], [.8e7, 1.05e7], '--k')
plt.xlabel('observed S')
plt.ylabel('predicted S')
plt.legend()
plt.show()

We conclude that an older site-wise mutation rate estimate was cryptically modifying the diffusion timescale in Harris and Pritchard (2017) via the assumed demography. Tennessen et al. likely used a phylogenetically-calibrated mutation rate `~2.35e-8` (although no rate is reported), rather than an estimate based on trio sequencing.