1KG
==
Run mushi on the 3-SFS computed from 1000 Genomes Project data

In [None]:
# %matplotlib inline 
%matplotlib notebook
import time

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.special import expit
from sklearn.decomposition import PCA

import histories
import mushi

In [None]:
# plt.style.use('dark_background')

### Load 1KG 3-SFS

In [None]:
# Read the tab-separated 3-SFS table; the first column becomes the row index
# and the remaining column labels are used as the mutation types.
ksfs_df = pd.read_csv('1KG/scons_output_EUR/3-SFS.tsv', sep='\t', index_col=0)
mutation_types = ksfs_df.columns

# n is one more than the number of rows in the table
# (presumably the number of sampled haplotypes -- TODO confirm).
n = len(ksfs_df.index) + 1

# Wrap the raw count matrix in a mushi k-SFS object used by every cell below.
ksfs = mushi.kSFS(X=ksfs_df.values, mutation_types=mutation_types)

Rank plot of the number of SNPs of each triplet mutation type

In [None]:
# Rank plot: column totals of the 3-SFS table (one per mutation type),
# sorted from largest to smallest and drawn as points.
plt.figure(figsize=(15, 3))
plt.plot(
    ksfs_df.sum().sort_values(ascending=False).to_frame(),
    '.',
)
# Mutation-type labels are long, so draw them vertically in a fixed-width font.
plt.xticks(rotation='vertical', family='monospace')
plt.yscale('symlog')
plt.ylabel('number of SNPs')
plt.tight_layout()
# plt.savefig('/Users/williamdewitt/Downloads/foo.png')
plt.show()

### Genome size and mutation rate estimation

In [None]:
# Read the masked genome size (a single integer) from disk. The `with` block
# guarantees the file handle is closed as soon as the value has been read.
with open('1KG/scons_output_EUR/masked_size.tsv') as masked_size_file:
    masked_size = int(masked_size_file.read())

# Assumed mutation rate (presumably per site per generation -- TODO confirm).
mutation_rate = 1.1e-8

# Total mutation rate: the rate above scaled by the masked genome size.
μ_0 = mutation_rate * masked_size

### Invert the total SFS to fit $\eta(t)$

In [None]:
# Boolean vector with n - 1 entries: True at index 0 and at every index
# greater than n - 20, False for indices 1 through n - 20.
mask = np.array([i < 1 or i > n - 20 for i in range(n - 1)])
# mask = None

In [None]:
# 50 logarithmically spaced points from 10^0 to 10^5, used as the change
# points of the inferred histories.
change_points = np.logspace(start=0, stop=5, num=50)

# Running iteration count, advanced by `callback` on every call.
it = 0


def callback(x):
    """Progress hook: increment the global counter and redraw the status line.

    ``x`` is accepted so the function matches the expected callback signature;
    its value is not used.
    """
    global it
    it = it + 1
    # The trailing carriage return makes successive messages overwrite each other.
    print(f'iteration {it}', end='  \r', flush=True)

# Fit η(t) from the total SFS on the change-point grid. `callback` prints a
# progress line; `options` is passed through as given
# (presumably to the underlying optimizer -- TODO confirm).
result = ksfs.infer_η(
    change_points=change_points,
    fit='prf',
    μ_0=μ_0,
    λ=1e-6,
    mask=mask,
    callback=callback,
    # method='Nelder-Mead',
    options={
        'disp': True,
        # 'tol': 1e-3,
        'maxiter': 1000,
        # 'maxfev': np.inf,
    },
)
# Report the status message attached to the returned result.
print(result.message)

# Two-panel figure: total SFS on the left, inferred η on the right.
plt.figure(figsize=(6, 3))

plt.subplot(1, 2, 1)
ksfs.plot_total()
# plt.legend()

plt.subplot(1, 2, 2)
ksfs.η.plot()

plt.show()

### TMRCA CDF

In [None]:
# TMRCA CDF evaluated by the k-SFS object, drawn against the change-point grid.
plt.figure(figsize=(3, 3))
plt.plot(change_points, ksfs.tmrca_cdf())
# Symmetric-log time axis; the vertical range is pinned to [0, 1].
plt.xscale('symlog')
plt.ylim(0, 1)
plt.xlabel('$t$')
plt.ylabel('TMRCA CDF')
plt.tight_layout()
plt.show()

### Mutation type enrichment as a heatmap with correlation clustering

In [None]:
# Draw the clustered heatmap provided by the k-SFS object at a wide figure size.
ksfs.clustermap(figsize=(25, 10))
# Optional export, left disabled:
# plt.savefig('/Users/williamdewitt/Downloads/1KG_heatmap.pdf', transparent=True)
plt.show()

### Invert the $k$-SFS conditioned on $\eta(t)$ to get $\boldsymbol\mu(t)$
Accelerated proximal gradient descent

In [None]:
# Infer μ(t) from the k-SFS. The returned value is kept as `f_trajectory`
# and plotted as the cost per iteration in the convergence cell.
f_trajectory = ksfs.infer_μ(
    # loss function parameters
    fit='prf',
    mask=mask,
    # time derivative regularization parameters
    λ_tv=1e4,
    α_tv=0,
    # spectral regularization parameters
    λ_r=1e2,
    α_r=1 - 1 / 2e4,
    hard=True,
    # convergence parameters
    max_iter=10000,
    tol=1e-10,
    γ=0.8,
)

Convergence

In [None]:
# Cost trajectory returned by the μ inference, on a symmetric-log iteration axis.
plt.figure(figsize=(4, 2))
plt.plot(f_trajectory)
plt.xscale('symlog')
plt.xlabel('iterations')
plt.ylabel('cost')
plt.tight_layout()
plt.show()

### Singular value spectrum of $Z$

In [None]:
# Singular values of Z (values only, no singular vectors), plotted against
# their 1-based rank on a logarithmic vertical axis.
plt.figure(figsize=(3, 3))
plt.plot(
    range(1, 1 + min(ksfs.μ.Z.shape)),
    np.linalg.svd(ksfs.μ.Z, compute_uv=False),
    '.',
)
plt.yscale('log')
plt.tight_layout()
plt.show()

The inferred histories for each mutation type (raw mutation rate in units of mutations per genome per generation)

In [None]:
# Mutation types that are highlighted (coloured and labelled) in the plots below.
pulse_types = ('TCC>TTC', 'CCC>CTC', 'ACC>ATC', 'TCT>TTT')

In [None]:
plt.figure(figsize=(6, 2.5))

# Left panel: normalised k-SFS for all mutation types as a faint black
# background, with the highlighted types drawn on top in distinct colours.
plt.subplot(1, 2, 1)
ksfs.plot(alpha=0.02, c='k', lw=2, normed=True)
for color_idx, mut_type in enumerate(pulse_types):
    ksfs.plot(mut_type, lw=2, c=f'C{color_idx}', normed=True, label=mut_type)
plt.legend(loc='upper right', prop={'size': 8}, framealpha=0, edgecolor='k')
# Start the x-axis at 2 and leave the upper limit as it is.
plt.xlim(left=2)

# Right panel: inferred mutation rate histories (not normalised), again with
# the highlighted types drawn over a faint background of all types.
plt.subplot(1, 2, 2)
ksfs.μ.plot(alpha=0.1, lw=1, c='k', normed=False)
for mut_type in pulse_types:
    ksfs.μ.plot(types=[mut_type], lw=2, normed=False, label=mut_type)
# The legend is created and removed straight away, so this panel shows none.
plt.legend().remove()#loc=2, prop={'size': 7.5})

# plt.savefig('/Users/williamdewitt/Downloads/1KG.pdf', transparent=True)
plt.show()

In [None]:
# Row sums of Z plotted on the time grid formed by prepending 0 to the
# change points, with a symmetric-log time axis.
plt.figure(figsize=(6, 2.5))
plt.plot(
    np.insert(change_points, 0, 0),
    ksfs.μ.Z.sum(axis=1),
    linewidth=1,
    color='k',
)
plt.xscale('symlog')
plt.show()

Heatmap of the inferred mutation spectrum history, plotted as relative mutation intensity as in Harris and Pritchard

In [None]:
# μ.clustermap(figsize=(25, 10))
# # plt.savefig('/Users/williamdewitt/Downloads/cluster.png', transparent=False)
# plt.show()

Plot the $\chi^2$ goodness of fit for each $k$-SFS matrix element, and compute a $\chi^2$ goodness-of-fit test for the $k$-SFS matrix as a whole

In [None]:
# ksfs.clustermap(μ, figsize=(25, 10), cmap='Reds')
# plt.show()

Plot SFS fit for the first 10 mutation types individually

In [None]:
# plt.figure(figsize=(3, 3))
# ksfs.plot1('TCC>TTC', μ=μ, prf_quantiles=True)
# plt.tight_layout()
# plt.show()

In [None]:
# PCA of the inferred histories. Z is transposed so each sample is one mutation
# type (the scores are indexed by mutation type below) and each feature is one
# point of the time grid (the components are plotted against that grid).
pca = PCA().fit(ksfs.μ.Z.T)

# Project once and reuse; the projection does not change between scatter calls.
Z_transformed = pca.transform(ksfs.μ.Z.T)
# Time grid for the components: 0 followed by the change points.
time_grid = np.concatenate(([0], change_points))
# Number of leading principal vectors drawn in the left panel.
n_vectors_shown = 3

plt.figure(figsize=(6, 2.5))

# Left panel: leading principal vectors as functions of time.
plt.subplot(121)
for k in range(n_vectors_shown):
    plt.plot(time_grid, pca.components_[k], label=f'principal vector {k + 1}')
plt.xlabel('$t$')
# NOTE(review): the grid starts at 0, which a log axis cannot display; other
# cells use 'symlog' -- confirm that 'log' is intended here.
plt.xscale('log')
plt.legend(loc='lower left', prop={'size': 7.5}, framealpha=.5)

# Right panel: scores on PC 2 and PC 3 for every mutation type (black), with
# the highlighted types redrawn in colour and labelled.
plt.subplot(122)
plt.scatter(*Z_transformed.T[1:3, :], c='k', alpha=0.2, s=20)
for mut_type in pulse_types:
    plt.scatter(*Z_transformed.T[1:3, ksfs.mutation_types.get_loc(mut_type)],
                label=mut_type, s=20)
plt.xlabel('PC 2')
plt.ylabel('PC 3')
plt.legend(loc='lower right', prop={'size': 8}, framealpha=.5)
plt.tight_layout()
# Export left disabled, like the other savefig calls in this notebook: the
# hard-coded absolute path only exists on the original author's machine.
# plt.savefig('/Users/williamdewitt/Downloads/1KG_PC.pdf', transparent=True)
plt.show()

In [None]:
# Display the variance explained by each fitted principal component.
pca.explained_variance_

In [None]:
# seaborn (`sns`) is imported in the notebook's top import cell.

# Column colour bar: red for the highlighted mutation types, grey for the rest.
col_colors = ['red' if mutation_type in pulse_types else 'grey'
              for mutation_type in ksfs.mutation_types]

# PCA scores with the first component dropped: rows are components 2 onward
# (at most nine of them), columns are mutation types.
Z_pca = pca.transform(ksfs.μ.Z.T).T[1:10, :]
# Row labels start at 2 so they match the principal component numbers.
df = pd.DataFrame(data=Z_pca, index=range(2, Z_pca.shape[0] + 2),
                  columns=ksfs.μ.mutation_types)

# Clustered heatmap of the scores with the colour scale centred at zero.
g = sns.clustermap(df, center=0, col_colors=col_colors, method='ward',
                   cbar_kws={'label': 'PC weight'})