1KG
==
Run mushi on the 3-SFS computed from 1000 Genomes Project data

In [None]:
%matplotlib inline 
# %matplotlib notebook
import histories
import mushi
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.special import expit
import time
from sklearn.decomposition import PCA

In [None]:
# plt.style.use('dark_background')

### Load 1KG 3-SFS

In [None]:
# Read the tab-separated 3-SFS table; its first column becomes the row index and
# its remaining column labels are used as the mutation types.
ksfs_df = pd.read_csv(
    '1KG/scons_output_EUR/3-SFS.tsv',
    sep='\t',
    index_col=0,
)
mutation_types = ksfs_df.columns
# n is one more than the number of table rows
# (presumably the number of sampled haplotypes — TODO confirm).
n = len(ksfs_df.index) + 1
ksfs = mushi.kSFS(X=ksfs_df.values, mutation_types=mutation_types)

### Rank plot of the number of SNPs of each triplet mutation type

In [None]:
# Column totals of the 3-SFS table (one per mutation type), largest first.
snp_counts = ksfs_df.sum().sort_values(ascending=False)

plt.figure(figsize=(15, 3))
plt.plot(snp_counts.to_frame(), '.')
plt.ylabel('number of SNPs')
plt.yscale('symlog')
# Mutation-type tick labels are long strings: rotate them and use a fixed-width font.
plt.xticks(rotation='vertical', family='monospace')
plt.tight_layout()
plt.show()

### Genome size and mutation rate estimation

In [None]:
# The masked-size file holds a single integer. Read it inside a context manager
# so the file handle is closed as soon as the value has been parsed.
with open('1KG/scons_output_EUR/masked_size.tsv') as masked_size_file:
    masked_size = int(masked_size_file.read())

# Fixed rate constant (presumably per site per generation, given how it is
# scaled by the masked size below — TODO confirm the source of this value).
mutation_rate = 1.1e-8
# Total rate: the rate constant multiplied by the masked size.
μ_0 = mutation_rate * masked_size
print(f'mutation rate in units of mutations per masked genome per generation: {μ_0}')

### Mutation type enrichment as a heatmap with correlation clustering

In [None]:
# Draw the k-SFS as a clustered heatmap using the kSFS object's own clustermap method.
# NOTE(review): the normalization and clustering choices are made inside mushi and
# are not visible from this notebook.
ksfs.clustermap(figsize=(25, 10))
# plt.savefig('/Users/williamdewitt/Downloads/1KG_heatmap.pdf', transparent=True)
plt.show()

### Invert the $k$-SFS to infer $\eta(t)$ and $\boldsymbol\mu(t)$
Initialize to constant

In [None]:
# 200 logarithmically spaced change points from 10**0 to 10**5.3.
change_points = np.logspace(0, 5.3, 200)
# Boolean mask with one entry per index 0 .. n - 2: True exactly for indices
# greater than n - 20 (the last 18 entries), False otherwise. Built as a
# vectorized comparison instead of a Python-level conditional comprehension.
# NOTE(review): whether True means "exclude" or "include" is decided inside mushi.
mask = np.arange(n - 1) > n - 20

# Initialize the histories to constants before running the optimizer.
ksfs.infer_constant(change_points=change_points, μ_0=μ_0, mask=mask)

# 2 x 2 summary figure of the current state of the kSFS object.
plt.figure(figsize=(6, 6))
plt.subplot(221)
ksfs.plot_total()
plt.subplot(222)
ksfs.η.plot()
plt.subplot(223)
ksfs.plot(normed=True, alpha=0.5)
plt.subplot(224)
ksfs.μ.plot(normed=True, alpha=0.5)
plt.show()

### TMRCA CDF

In [None]:
# Plot the TMRCA CDF returned by the kSFS object against the change-point grid.
fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(change_points, ksfs.tmrca_cdf())
ax.set_xlabel('$t$')
ax.set_ylabel('TMRCA CDF')
# A CDF lives on [0, 1]; pin the axis so the full range is always shown.
ax.set_ylim([0, 1])
ax.set_xscale('symlog')
fig.tight_layout()
plt.show()

Accelerated proximal block coordinate descent

In [None]:
# Cost returned by each sweep, in order.
f_trajectory = []

sweeps = 10
for sweep in range(1, 1 + sweeps):
    # sweep is an integer counter, so print it as one: the float format ':.2g'
    # would render 100 as '1e+02' if the number of sweeps were raised.
    print(f'block coordinate descent sweep {sweep}')
    f = ksfs.coord_desc(# loss function parameters
                        fit='prf',
                        mask=mask,
                        # η(t) regularization parameters
                        α_tv=0,#1e3,
                        α_spline=1e3,
                        # μ(t) regularization parameters
                        β_tv=0,
                        β_spline=2e5,
                        # spectral regularization parameters
                        β_rank=5e3,
                        β_ridge=1e-6,
                        hard=True,
                        # convergence parameters
                        max_iter=10000,
                        tol=1e-10,
                        γ=0.8)
    print(f'cost: {f}')
    # f_old is only assigned at the end of a sweep, so the relative change
    # is reported from the second sweep onwards.
    if sweep > 1:
        relative_change = np.abs((f - f_old) / f_old)
        print(f'relative change: {relative_change:.2g}')
    print()
    f_old = f
    f_trajectory.append(f)

    # 2 x 2 summary figure of the fit after this sweep.
    plt.figure(figsize=(6, 6))
    plt.subplot(221)
    ksfs.plot_total()
    plt.subplot(222)
    ksfs.η.plot()
    plt.subplot(223)
    ksfs.plot(normed=True, alpha=0.5)
    plt.subplot(224)
    ksfs.μ.plot(normed=True, alpha=0.5)
    plt.show()


# Cost recorded after each sweep, plotted against its position in the list.
fig, ax = plt.subplots(figsize=(4, 2))
ax.plot(f_trajectory)
ax.set_xlabel('iterations')
ax.set_ylabel('cost')
# ax.set_xscale('symlog')
fig.tight_layout()
plt.show()


# Mutation types drawn in colour on top of the grey background curves.
pulse_types = ('TCC>TTC', 'ACC>ATC', 'TCT>TTT', 'CCC>CTC')
n_pulse = len(pulse_types)

plt.figure(figsize=(6, 2.5))

# Left panel: all types faintly in black, then each highlighted type on top.
plt.subplot(121)
ksfs.plot(alpha=0.01, c='k', lw=2, normed=True, zorder=0)
for rank, mut_type in enumerate(pulse_types):
    # Later types sit higher in the z-order but are drawn more transparently.
    ksfs.plot(mut_type,
              alpha=0.9 * (n_pulse - rank) / n_pulse,
              lw=2,
              c=f'C{rank}',
              normed=True,
              label=mut_type,
              zorder=rank + 1)
plt.legend(loc=0, prop={'size': 8}, framealpha=0.5, edgecolor='k')
plt.xlim([2, None])

# Right panel: same layout for the μ histories (normed=False here).
plt.subplot(122)
ksfs.μ.plot(alpha=0.1, lw=1, c='k', normed=False)
for mut_type in pulse_types:
    ksfs.μ.plot(types=[mut_type], lw=2, normed=False, label=mut_type)
# The legend is created and immediately removed, so this panel shows none.
plt.legend().remove()

plt.show()

### Singular value spectrum of $Z$

In [None]:
# Singular values only (no U / V factors), plotted against their 1-based position.
singular_values = np.linalg.svd(ksfs.μ.Z, compute_uv=False)
positions = range(1, 1 + min(ksfs.μ.Z.shape))

plt.figure(figsize=(3, 3))
plt.plot(positions, singular_values, '.')
plt.yscale('log')
plt.tight_layout()
plt.show()

Heatmap of the inferred mutation spectrum history, plotted as relative mutation intensity as in Harris and Pritchard

In [None]:
# ksfs.μ.clustermap(figsize=(25, 10))
# plt.savefig('/Users/williamdewitt/Downloads/cluster.png', transparent=False)
# plt.show()

Plot the $\chi^2$ goodness of fit for each $k$-SFS matrix element, and compute a $\chi^2$ goodness-of-fit test for the $k$-SFS matrix as a whole

In [None]:
# ksfs.clustermap(μ, figsize=(25, 10), cmap='Reds')
# plt.show()

Plot SFS fit for the first 10 mutation types individually

In [None]:
# plt.figure(figsize=(3, 3))
# ksfs.plot1('TCC>TTC', μ=μ, prf_quantiles=True)
# plt.tight_layout()
# plt.show()

In [None]:
# Fit a PCA using the transpose of Z as the data matrix.
# NOTE(review): presumably the rows of Z.T are mutation types and its columns
# are time points — confirm against mushi.
pca = PCA().fit(ksfs.μ.Z.T)
# Scores of every sample on every component, transposed so that rows index
# components. Computed once here instead of on every loop iteration.
pc_scores = pca.transform(ksfs.μ.Z.T).T
# Horizontal axis for the component curves: 0 prepended to the change points.
t_grid = np.concatenate(([0], change_points))

plt.figure(figsize=(6, 2.5))

# Left panel: the first three principal axes plotted along t_grid.
plt.subplot(121)
for k in range(3):
    plt.plot(t_grid, pca.components_[k], label=f'latent signature {k + 1}')
plt.xlabel('$t$')
plt.xscale('symlog')
plt.legend(loc='lower left', prop={'size': 7.5}, framealpha=.5)

# Right panel: scores on the components at index 1 and 2 (labelled PC 2 and PC 3).
plt.subplot(122)
plt.scatter(*pc_scores[1:3, :], c='k', alpha=0.2, s=20)
for mut_type in pulse_types:
    x, y = pc_scores[1:3, ksfs.mutation_types.get_loc(mut_type)]
    plt.scatter(x, y, label=mut_type, s=20, c='C2')
    plt.annotate(mut_type, (x, y), size='small', c='C2')

# Samples whose score on the component at index 1 exceeds 0.2 (hand-picked cut-off).
PC2_outliers = np.nonzero(pc_scores[1, :] > 0.2)[0]
# Hand-tuned (dx, dy) offsets of the annotation text from its point.
label_offsets = {
    'TCG>TTG': (-0.15, 0.1),
    'GCG>GTG': (-0.18, -0.03),
    'CCG>CTG': (-0.1, 0.08),
    'ACG>ATG': (-0.2, -0.03),
}
# Fallback for outliers without a hand-tuned entry, so that an unexpected
# outlier neither raises NameError nor silently reuses the previous offsets.
default_offset = (-0.1, 0.05)
for idx in PC2_outliers:
    outlier_type = ksfs.mutation_types[idx]
    x, y = pc_scores[1:3, idx]
    # Label each point with its own mutation type rather than the loop
    # variable left over from the pulse_types loop above.
    plt.scatter(x, y, label=outlier_type, s=20, c='C1')
    dx, dy = label_offsets.get(outlier_type, default_offset)
    plt.annotate(outlier_type, (x, y), xytext=(x + dx, y + dy), size='small', c='C1',
                 arrowprops=dict(arrowstyle='->'))
plt.xlabel('PC 2')
plt.ylabel('PC 3')
# plt.legend(loc='upper right', prop={'size': 8}, framealpha=.5)
plt.tight_layout()
plt.show()

In [None]:
# Display the variance explained by each fitted principal component (shown as the cell output).
pca.explained_variance_

In [None]:
import seaborn as sns

# One colour per mutation type: red for the highlighted pulse types, grey otherwise.
# Only referenced by the disabled col_colors argument below.
col_colors = ['grey' if mut_type not in pulse_types else 'red'
              for mut_type in ksfs.mutation_types]

# Zero-based index of the principal component to display.
comp = 1
# Scores on that component as a single-row 2-D array (one column per mutation type).
Z_pca = np.atleast_2d(pca.transform(ksfs.μ.Z.T)[:, comp])
df = pd.DataFrame(data=Z_pca, columns=ksfs.μ.mutation_types, index=[comp])
# Row and column clustering are both switched off, so this is a plain heatmap
# of the transposed frame with the colour scale centred at 0.
g = sns.clustermap(
    df.T,
    center=0,
    # col_colors=col_colors,
    method='ward',
    row_cluster=False,
    col_cluster=False,
    figsize=(5, 10),
    cbar_kws={'label': 'PC weight'},
)

In [None]:
# Bar plot of the scores on component `comp`, sorted in ascending order.
plt.figure(figsize=(13, 3))
# Sort and plot by `comp` (the column label of df.T) instead of repeating the
# literal 1, so this cell follows a change of `comp` in the cell that builds df.
df2 = df.T.sort_values(comp)
sns.barplot(x=df2.index, y=comp, data=df2)
plt.xticks(rotation='vertical')
# comp is zero-based, so the human-readable component number is comp + 1.
plt.ylabel(f'PC {comp + 1} weight')
plt.tight_layout()
# Save next to the notebook rather than into one user's home directory, and
# name the file after the component that is actually plotted.
plt.savefig(f'PC{comp + 1}_rank.pdf', transparent=True)
plt.show()

In [None]:
# Call ksfs.plot for the mutation type 'TCC>TTC' with normed=True.
# NOTE(review): other cells pass the mutation type to ksfs.plot positionally
# (ksfs.plot(mut_type, ...)); confirm that `type` is an accepted keyword name.
ksfs.plot(type='TCC>TTC', normed=True)