TCC>TTC pulse timing
==

In [None]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from mushi import kSFS

In [None]:
# plt.style.use('dark_background')

EUR populations

In [None]:
# Population codes; each one is used as a sub-directory name under
# 1KG/scons_output/ when the 3-SFS tables are loaded.
pops = {
    'CEU',
    'FIN',
    'GBR',
    'IBS',
    'TSI',
}

load 3-SFS for each population

In [None]:
# Map each population code to a kSFS object built from that population's table.
ksfs_dict = {}
for pop in pops:
    # Tab-separated table; the first column is taken as the row index.
    ksfs_df = pd.read_csv(
        f'1KG/scons_output/{pop}/3-SFS.tsv',
        sep='\t',
        index_col=0,
    )
    # The table body is the data matrix; its column labels name the mutation types.
    ksfs_dict[pop] = kSFS(
        X=ksfs_df.values,
        mutation_types=ksfs_df.columns,
    )

time grid of epoch boundaries (measured in generations)

In [None]:
# 200 logarithmically spaced epoch boundaries running from 1 (= 10**0)
# to 200,000 generations.
change_points = np.logspace(0, np.log10(2e5), num=200)

masked genome size

In [None]:
# The whole file content is parsed as a single integer
# (int() tolerates surrounding whitespace such as a trailing newline).
with open('1KG/scons_output/masked_size.tsv') as f:
    masked_size_text = f.read()
masked_genome_size = int(masked_size_text)

mutation rate per site per generation

In [None]:
# Per-site, per-generation mutation rate; scaled by the masked genome size
# further down to obtain the genome-wide rate μ0.
u = 1.3e-8

mutation rate per masked genome per generation

In [None]:
# Genome-wide mutation rate per generation: the per-site rate times the
# masked genome size read from masked_size.tsv. Passed to infer_history below.
μ0 = u * masked_genome_size

mask high frequencies due to ancestral state misidentification

In [None]:
# Number of entries to exclude at the low and high ends of each mask.
clip_low = 0
clip_high = 10
# Each population needs its own mask vector because the number of
# haplotypes n, and therefore the mask length n - 1, varies by population.
freq_mask = {}
for pop in pops:
    n_entries = ksfs_dict[pop].n - 1
    # Entry i is kept (True) when it lies in [clip_low, n_entries - clip_high).
    keep = [clip_low <= i < n_entries - clip_high for i in range(n_entries)]
    freq_mask[pop] = np.array(keep)

fit parameters

In [None]:
# Keyword-argument bundles that are unpacked into every infer_history call
# in the fitting loop below.
η_regularization = {
    'α_tv': 3e2,
    'α_spline': 1e4,
    'α_ridge': 1e-10,
}
μ_regularization = {
    'hard': True,
    'β_rank': 0,
    'β_tv': 3e1,
    'β_ridge': 1e-10,
}
convergence = {
    'tol': 1e-10,
    'max_iter': 10000,
    'max_line_iter': 300,
    'γ': 0.8,
}

generation time for time calibration

In [None]:
# Generation time handed to μ.plot(t_gen=...) to calibrate the time axis.
# NOTE(review): presumably years per generation — confirm against mushi's docs.
t_gen = 29

loop over a few values of the `β_spline` parameter, which controls how much L2 smoothness is imposed on the first derivative, and fit each EUR population using each parameter value

In [None]:
# Fit every population once per β_spline value and draw one row of panels per
# value: the left panel receives each population's kSFS.plot('TCC>TTC') output,
# the right panel the μ.plot(('TCC>TTC',)) output from the same fit.
β_spline_array = (1e-20, 1e3, 1e4)
fig, axes = plt.subplots(len(β_spline_array), 2,
                         sharex='col', figsize=(8, 3 * len(β_spline_array)))
# pops is a set, and the iteration order of a set of strings can change from
# one interpreter session to the next; sorting pins every population to the
# same colour (C0, C1, ...) and legend position on every run.
pop_order = sorted(pops)
for idx, β_spline in enumerate(β_spline_array):
    print(f'β_spline = {β_spline}')
    for idx2, pop in enumerate(pop_order):
        print(pop)
        # Refit this population with the current β_spline; the plots below are
        # drawn from the same object immediately afterwards.
        ksfs_dict[pop].infer_history(change_points, μ0, β_spline=β_spline,
                                     loss='prf', **η_regularization, **μ_regularization,
                                     **convergence, mask=freq_mask[pop])
        # Left column.
        plt.sca(axes[idx, 0])
        ksfs_dict[pop].plot('TCC>TTC', clr=True,
                            # label (and hence legend entries) only in the top row
                            label=(pop if idx == 0 else None),
                            lw=3, alpha=0.5, c=f'C{idx2}')
        plt.xscale('log')
        if idx == 0:
            plt.legend()
        # Columns share their x axis, so only the bottom row keeps an x label.
        if idx < axes.shape[0] - 1:
            plt.xlabel(None)
        # Right column.
        plt.sca(axes[idx, 1])
        ksfs_dict[pop].μ.plot(('TCC>TTC',), t_gen=t_gen, clr=True,
                              label=(pop if idx == 0 else None),
                              lw=3, alpha=0.5)
        plt.xscale('log')
        if idx < axes.shape[0] - 1:
            plt.xlabel(None)
        plt.xlim([1e3, 1e6])
plt.tight_layout()
# matplotlib passes the file name to open() unchanged and does not expand '~',
# so the home directory has to be resolved explicitly before saving.
plt.savefig(os.path.expanduser('~/Downloads/europulse.pdf'), dpi=300)
plt.show()