1KG
==
Run mushi on the 3-SFS computed from 1000 Genomes Project data

In [None]:
%matplotlib inline 
#%matplotlib notebook
import histories
import mushi
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.special import expit
import time
import msprime
%cd stdpopsim
from stdpopsim import homo_sapiens
%cd ../

### Load 1KG 3-SFS

In [None]:
# Read the tab-separated 3-SFS table; the first column of the file becomes the
# row index, and each remaining column is treated as one mutation type.
sfs_path = '1KG/scons_output/3-SFS.tsv'
ksfs_CEU = pd.read_csv(sfs_path, sep='\t', index_col=0)

# Column labels are reused later as the mutation type names.
mutation_types = ksfs_CEU.columns

# Bare last expression so the notebook renders the table.
ksfs_CEU

Rank plot of the number of SNPs of each triplet mutation type

In [None]:
# Total SNP count per mutation type (column sums of the 3-SFS table), ranked
# from most to least abundant.
snp_counts = ksfs_CEU.sum().sort_values(ascending=False)

plt.figure(figsize=(15, 3))
plt.plot(snp_counts.to_frame(), '.')
plt.xticks(rotation='vertical', family='monospace')
plt.ylabel('number of SNPs')
plt.yscale('symlog')
plt.tight_layout()
# Save relative to the working directory: the previous absolute path under one
# user's home directory made this cell fail on any other machine.
plt.savefig('3-SFS_rank_plot.png')
plt.show()

In [None]:
# plt.style.use('dark_background')

### Demographic history $\eta(t)$ from the European population in Browning et al.
Browning, S. R. et al. Ancestry-specific recent effective population size in the Americas. [PLOS Genetics 14, e1007385 (2018)](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1007385).

In [None]:
# Time grid: 800 points, logarithmically spaced from 10**0 = 1 up to 3e4.
# NOTE(review): units are presumably generations before present — confirm.
t = np.logspace(start=0, stop=np.log10(3e4), num=800)

In [None]:
# butter/filtfilt are only needed by the optional smoothing step further down,
# which is currently commented out.
from scipy.signal import butter, filtfilt

# NOTE(review): the section heading cites Browning et al., but the model
# instantiated here is stdpopsim's TennessenTwoPopOutOfAfrica — confirm which
# demography is intended.
model = homo_sapiens.TennessenTwoPopOutOfAfrica()
dd = msprime.DemographyDebugger(
    Ne=model.default_population_size,
    population_configurations=model.population_configurations,
    demographic_events=model.demographic_events,
    migration_matrix=model.migration_matrix,
)

# Evaluate the size trajectory at time 0 followed by every grid point in t.
eval_times = np.concatenate(([0], t))
sizes = dd.population_size_trajectory(eval_times)
# Column 1 selects the second population of the model (presumably the European
# one — TODO confirm); the factor 2 presumably converts diploid to haploid
# population size — TODO confirm.
y = 2 * sizes[:, 1]

# # smoothing
# b, a = butter(2, 1/100)
# y = filtfilt(b, a, y)

# Demographic history object built from the time grid t and the sizes y
# (y has one more entry than t because of the prepended time 0).
η = histories.η(t, y)

plt.figure(figsize=(4, 4))
η.plot()
plt.show()

### Mushi $k$-SFS object conditioned on this demographic history

In [None]:
# k-SFS object tying the demographic history η to the observed 3-SFS count
# matrix and its mutation type labels.
ksfs = mushi.kSFS(
    η,
    X=ksfs_CEU.values,
    mutation_types=mutation_types,
)

### TMRCA CDF

In [None]:
# Plot the values returned by ksfs.tmrca_cdf() against the change points of η.
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(η.change_points, ksfs.tmrca_cdf())
ax.set_xlabel('$t$')
ax.set_ylabel('TMRCA CDF')
ax.set_ylim([0, 1])  # fix the y-range to [0, 1]
ax.set_xscale('symlog')
fig.tight_layout()
plt.show()

### Mutation type enrichment traces, as in Harris and Pritchard

In [None]:
# Draw the k-SFS traces with heavy transparency (alpha=0.1) so overlapping
# lines remain distinguishable; normed=True requests the normalized view
# provided by ksfs.plot.
plt.figure(figsize=(6, 3))
ksfs.plot(
    alpha=0.1,
    lw=2,
    c='C0',
    normed=True,
)
plt.show()

### Mutation type enrichment again, but as a heatmap with correlation clustering

In [None]:
# Cluster heatmap of the k-SFS. The clustering itself happens inside
# ksfs.clustermap, which is not visible in this notebook.
ksfs.clustermap(figsize=(25, 10))
plt.show()

### Invert the $k$-SFS conditioned on $\eta(t)$ to get $\boldsymbol\mu(t)$
Accelerated proximal gradient descent

In [None]:
# Infer the mutation spectrum history from the k-SFS. The call returns the
# inferred history together with a second value, f_trajectory.
μ_inferred, f_trajectory = ksfs.infer_μ(
    # loss function parameters
    fit='prf',
    exclude_singletons=False,
    bins=None,
    # time derivative regularization parameters
    λ_tv=1e3,
    α_tv=0,
    # spectral regularization parameters
    λ_r=0,
    α_r=1 - 1e-4,
    hard=True,
    # convergence parameters
    max_iter=10,
    tol=1e-6,
    γ=0.8,
)

Convergence

In [None]:
# Plot f_trajectory (labelled as cost) against iteration number.
fig, ax = plt.subplots(figsize=(4, 2))
ax.plot(f_trajectory)
ax.set_xlabel('iterations')
ax.set_ylabel('cost')
ax.set_xscale('symlog')
fig.tight_layout()
plt.show()

The inferred histories for each mutation type (raw mutation rate in units of mutations per genome per generation)

In [None]:
# Same trace plot as for the raw k-SFS, but with the inferred history passed
# in through the μ keyword.
plt.figure(figsize=(6, 3))
ksfs.plot(
    μ=μ_inferred,
    alpha=0.1,
    lw=2,
    normed=True,
)
plt.show()

Heatmap of the inferred mutation spectrum history, plotted as relative mutation intensity as in Harris and Pritchard

In [None]:
# Cluster heatmap of the inferred mutation spectrum history.
μ_inferred.clustermap(figsize=(25, 10))
plt.show()

Plot the $\chi^2$ goodness of fit for each $k$-SFS matrix element, and compute a $\chi^2$ goodness-of-fit test for the $k$-SFS matrix as a whole

In [None]:
# Cluster heatmap of the k-SFS with the inferred history passed in.
# NOTE(review): presumably this makes clustermap show per-element chi-squared
# fit values instead of raw enrichment — confirm against kSFS.clustermap.
ksfs.clustermap(μ_inferred, figsize=(25, 10), cmap='Reds')
plt.show()

Plot the SFS fit for a single mutation type (TCC>TTC)

In [None]:
# SFS fit for the single mutation type 'TCC>TTC' under the inferred history,
# with the prf_quantiles option of ksfs.plot1 switched on.
plt.figure(figsize=(3, 3))
ksfs.plot1(
    'TCC>TTC',
    μ=μ_inferred,
    prf_quantiles=True,
)
plt.tight_layout()
plt.show()

### Singular value spectrum of $Z$

In [None]:
# Singular values of the Z attribute of the inferred history.
# np.linalg.svd(..., compute_uv=False) returns min(Z.shape) values, so the bar
# positions are derived from the result itself rather than from Z.shape[1];
# this keeps positions and heights the same length even when Z has fewer rows
# than columns.
singular_values = np.linalg.svd(μ_inferred.Z, compute_uv=False)

plt.figure(figsize=(3, 3))
plt.bar(range(len(singular_values)), singular_values)
plt.xlabel('index')
plt.ylabel('singular value')
plt.yscale('log')
plt.tight_layout()
plt.show()