# Compute synonymous mutation rates for clades and overall
Get variables from `snakemake`:

In [1]:
import glob
counts_csvs = glob.glob("results/mutation_counts/counts_by_clade/*.csv")

Import Python modules:

In [15]:
import os

import altair as alt

import numpy

import pandas as pd

import sklearn.decomposition

Get mutation counts:

In [3]:
mutation_counts = pd.concat(
    [
        pd.read_csv(f).assign(clade=os.path.splitext(os.path.basename(f))[0])
        for f in counts_csvs
    ],
    ignore_index=True,
).assign(mut_type=lambda x: x["nt_mutation"].map(lambda m: f"{m[0]}to{m[-1]}"))

mutation_counts

Unnamed: 0,protein,aa_mutation,nt_mutation,codon_change,synonymous,count,reversion_to_ref,reversion_to_founder,clade,mut_type
0,M,M,G26577C,GAA>CAA,False,1769,False,False,21M,GtoC
1,ORF1a;ORF1ab,ORF1a;ORF1ab,A11537G,ATT>GTT;ATT>GTT,False,849,False,False,21M,AtoG
2,ORF1a;ORF1ab,ORF1a;ORF1ab,T10198C,GAT>GAC;GAT>GAC,True,820,False,False,21M,TtoC
3,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,760,False,False,21M,TtoC
4,S,S,T22674C,TTC>TCC,False,745,True,True,21M,TtoC
...,...,...,...,...,...,...,...,...,...,...
456913,ORF1a;ORF1ab,ORF1a;ORF1ab,G351A,CGT>CAT;CGT>CAT,False,1,False,False,19A,GtoA
456914,ORF1a;ORF1ab,ORF1a;ORF1ab,G3518T,GTT>TTT;GTT>TTT,False,1,True,True,19A,GtoT
456915,ORF1a;ORF1ab,ORF1a;ORF1ab,G3482A,GGA>AGA;GGA>AGA,False,1,False,False,19A,GtoA
456916,ORF1a;ORF1ab,ORF1a;ORF1ab,G3476A,GTT>ATT;GTT>ATT,False,1,False,False,19A,GtoA


Tally mutation type counts among synonymous mutations, separating reversions to reference from other mutations:

In [4]:
mut_type_counts = (
    mutation_counts
    .query("synonymous")
    .groupby(["clade", "mut_type", "reversion_to_ref"], as_index=False)
    .aggregate({"count": "sum"})
)

mut_types = mut_type_counts["mut_type"].unique().tolist()

Plot total mutation counts for each clade:

In [5]:
mut_counts = (
    mut_type_counts
    .groupby(["clade", "reversion_to_ref"], as_index=False)
    .aggregate({"count": "sum"})
)

mut_counts_chart = (
    alt.Chart(mut_counts)
    .encode(
        y="clade",
        x="sum(count)",
        color="reversion_to_ref",
        tooltip=["clade", "count", "reversion_to_ref"],
    )
    .mark_bar()
)

mut_counts_chart

Get PCA of mutation spectrum separately for each clade and stratifying mutations by whether they are reversions to reference:

In [22]:
mut_type_freqs = (
    mut_type_counts
    .assign(
        total_count=lambda x: x.groupby(["clade", "reversion_to_ref"])["count"].transform("sum"),
        freq=lambda x: x["count"] / x["total_count"],
    )
    .pivot_table(
        index=["clade", "reversion_to_ref", "total_count"],
        values="freq",
        columns="mut_type",
        fill_value=0,
    )
)

pca = sklearn.decomposition.PCA(n_components=2)
pca_coords = pca.fit_transform(mut_type_freqs.values)
assert len(pca_coords) == len(mut_type_freqs)

mut_type_freqs_pca = (
    mut_type_freqs
    .reset_index()
    .assign(
        principal_component_1=pca_coords[:, 0],
        principal_component_2=pca_coords[:, 1],
        log10_total_count=lambda x: numpy.log(x["total_count"]) / numpy.log(10),
    )
)

Plot mutation spectrum PCA for all clades including both reversions to reference and non reversions to reference:

In [30]:
mut_type_freqs_chart = (
    alt.Chart(mut_type_freqs_pca)
    .encode(
        x="principal_component_1",
        y="principal_component_2",
        shape="reversion_to_ref",
        color="clade",
        tooltip=["clade", "reversion_to_ref", "total_count"],
    )
    .mark_point(filled=True, size=50)
)

mut_type_freqs_chart

Now do PCA on just non-reversion mutations:

In [32]:
mut_type_freqs_no_revert = mut_type_freqs.query("reversion_to_ref == False")

pca = sklearn.decomposition.PCA(n_components=2)
pca_coords_no_revert = pca.fit_transform(mut_type_freqs_no_revert.values)
assert len(pca_coords_no_revert) == len(mut_type_freqs_no_revert)

mut_type_freqs_no_revert_pca = (
    mut_type_freqs_no_revert
    .reset_index()
    .assign(
        principal_component_1=pca_coords_no_revert[:, 0],
        principal_component_2=pca_coords_no_revert[:, 1],
        log10_total_count=lambda x: numpy.log(x["total_count"]) / numpy.log(10),
    )
)

Plot the PCA on just non-reversion mutations:

In [36]:
total_count_selection = alt.selection_single(
    fields=["log10_total_count"],
    init={"log10_total_count": 4},
    bind=alt.binding_range(
        name="minimum log10 total counts",
        min=int(mut_type_freqs_no_revert_pca["log10_total_count"].min()),
        max=mut_type_freqs_no_revert_pca["log10_total_count"].max(),
    )
)

mut_type_freqs_no_revert_chart = (
    alt.Chart(mut_type_freqs_no_revert_pca)
    .encode(
        x="principal_component_1",
        y="principal_component_2",
        color="clade",
        tooltip=["clade", "total_count"],
    )
    .mark_point(filled=True, size=50)
    .add_selection(total_count_selection)
    .transform_filter(
        total_count_selection.log10_total_count <= alt.datum.log10_total_count
    )
)

mut_type_freqs_no_revert_chart