# Synonymous mutation spectrum
Get input variables from [papermill](https://papermill.readthedocs.io/) parameterization (note next cell is tagged as `parameters`):

In [44]:
# Default path to the CSV of mutation counts read below; as this is the
# papermill `parameters` cell, papermill can override this value.
input_csv = "results/mutation_counts/aggregated.csv"

Import Python modules:

In [45]:
import os

import altair as alt

import numpy

import pandas as pd

import sklearn.decomposition

Read the mutation counts and assign mutation types:

In [46]:
def nt_mutation_to_mut_type(nt_mutation):
    """Convert a nucleotide mutation string such as 'C3037T' to a label such as 'CtoT'.

    Only the first and last characters are used; whatever lies between them
    (the site number in the data shown below) is dropped.
    """
    return f"{nt_mutation[0]}to{nt_mutation[-1]}"


# Read the counts exactly as stored, then derive the mutation type of every row
raw_mutation_counts = pd.read_csv(input_csv)
mutation_counts = raw_mutation_counts.assign(
    mut_type=raw_mutation_counts["nt_mutation"].map(nt_mutation_to_mut_type)
)

mutation_counts

Unnamed: 0,protein,aa_mutation,nt_mutation,codon_change,synonymous,count,reversion_to_ref,reversion_to_founder,clade,subset,mut_type
0,ORF1ab,ORF1ab,C16466T,CCA>CTA,False,62,False,False,19A,all,CtoT
1,M,M,T26767C,ATC>ACC,False,54,False,False,19A,all,TtoC
2,ORF1a;ORF1ab,ORF1a;ORF1ab,C3037T,TTC>TTT;TTC>TTT,True,50,False,False,19A,all,CtoT
3,ORF1ab,ORF1ab,C19220T,GCT>GTT,False,47,False,False,19A,all,CtoT
4,ORF1ab,ORF1ab,A17236G,ATA>GTA,False,45,False,False,19A,all,AtoG
...,...,...,...,...,...,...,...,...,...,...,...
974785,ORF1a;ORF1ab,ORF1a;ORF1ab,G2900A,GTC>ATC;GTC>ATC,False,1,False,False,22C,England,GtoA
974786,ORF1a;ORF1ab,ORF1a;ORF1ab,G2867A,GTA>ATA;GTA>ATA,False,1,False,False,22C,England,GtoA
974787,ORF1a;ORF1ab,ORF1a;ORF1ab,G2782T,GTG>GTT;GTG>GTT,True,1,False,False,22C,England,GtoT
974788,ORF1a;ORF1ab,ORF1a;ORF1ab,G2525A,GAG>AAG;GAG>AAG,False,1,False,False,22C,England,GtoA


Tally mutation type counts among **only synonymous** mutations for each clade and subset, separating reversions to reference from other mutations:

In [47]:
# Columns that together identify one row of the tally
tally_keys = ["clade", "subset", "mut_type", "reversion_to_ref"]

# Keep only the synonymous mutations, then add up their counts within each group
synonymous_counts = mutation_counts.query("synonymous")
mut_type_counts = synonymous_counts.groupby(tally_keys, as_index=False)["count"].sum()

# Distinct mutation types, in order of first appearance in the tally
mut_types = mut_type_counts["mut_type"].drop_duplicates().tolist()

Plot total mutation counts for each clade and subset on a log scale:

In [48]:
# Total synonymous mutation counts for each clade and subset, summed over
# mutation types and over reversion status.
clade_counts = (
    mut_type_counts
    .groupby(["clade", "subset"], as_index=False)
    .aggregate({"count": "sum"})
)

# One bar per clade, faceted into one column of panels per subset.
clade_counts_chart = (
    alt.Chart(clade_counts)
    .encode(
        x="clade",
        # The accompanying text describes a log-scale plot, so request a log
        # scale explicitly (the default quantitative scale is linear).
        y=alt.Y(
            "count",
            title="total mutations",
            scale=alt.Scale(type="log"),
        ),
        tooltip=["clade", "count"],
        column=alt.Column("subset", title=None),
    )
    .mark_bar()
    .properties(width=alt.Step(12), height=175)
)

clade_counts_chart

Plot fraction of mutation counts from reversions to reference.
Below you can see that these fractions are high. This probably indicates that some issue with calling reversions to reference is still affecting the data, so such reversions should perhaps be ignored:

In [49]:
# Counts for each clade and subset, split by whether the mutation reverts to reference
reversion_counts = mut_type_counts.groupby(
    ["clade", "subset", "reversion_to_ref"], as_index=False
)["count"].sum()

# Total count of each (clade, subset) pair, aligned row-by-row with `reversion_counts`
clade_subset_totals = reversion_counts.groupby(["clade", "subset"])["count"].transform("sum")

# Fraction of each (clade, subset) total contributed by reversions / non-reversions
reversion_fracs = reversion_counts.assign(
    frac=reversion_counts["count"] / clade_subset_totals
)

# Bars colored by reversion status, with one column of panels per subset
reversion_fracs_chart = (
    alt.Chart(reversion_fracs)
    .mark_bar()
    .encode(
        x="clade",
        y=alt.Y("frac", title="fraction of mutations"),
        color="reversion_to_ref",
        tooltip=["clade", "count", "frac"],
        column=alt.Column("subset", title=None),
    )
    .properties(width=alt.Step(12), height=150)
)

reversion_fracs_chart

In [50]:
# Total counts for each clade, broken down by both reversion flags.
# NOTE(review): this uses every row of `mutation_counts`, so it is not limited to
# synonymous mutations and it adds up all `subset` values (including 'all'); if the
# subsets overlap, the same mutations are counted more than once -- confirm intended.
reversion_flag_keys = ["clade", "reversion_to_ref", "reversion_to_founder"]
mutation_counts.groupby(reversion_flag_keys)[["count"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
clade,reversion_to_ref,reversion_to_founder,Unnamed: 3_level_1
19A,False,False,16961
19A,True,True,470
20A,False,False,156036
20A,False,True,47
20A,True,False,371
...,...,...,...
22B,True,True,2183
22C,False,False,191693
22C,False,True,1644
22C,True,False,6466


In [51]:
# Inspect the most frequent synonymous mutations at nucleotide site 3037 among
# the 'all' subset, for every clade.
# The site is matched exactly, using the characters between the leading and
# trailing nucleotide letters of `nt_mutation`: a substring search for '3037'
# would also match other sites whose number contains those digits (e.g. 13037
# or 23037).
(
    mutation_counts
    .query("synonymous")
    .query("subset == 'all'")
    # filter before sorting so only the few matching rows need to be sorted
    .loc[lambda x: x["nt_mutation"].str[1:-1] == "3037"]
    .sort_values("count", ascending=False)
    .head(20)
)



Unnamed: 0,protein,aa_mutation,nt_mutation,codon_change,synonymous,count,reversion_to_ref,reversion_to_founder,clade,subset,mut_type
444189,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,3459,True,False,21J,all,TtoC
732326,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,760,True,False,21M,all,TtoC
382770,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,544,True,False,21I,all,TtoC
606162,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,214,True,False,21K,all,TtoC
222414,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,93,True,False,20I,all,TtoC
10589,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,87,True,False,20A,all,TtoC
448391,ORF1a;ORF1ab,ORF1a;ORF1ab,C3037T,TTC>TTT;TTC>TTT,True,76,False,True,21J,all,CtoT
314168,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,60,True,False,20J,all,TtoC
2,ORF1a;ORF1ab,ORF1a;ORF1ab,C3037T,TTC>TTT;TTC>TTT,True,50,False,False,19A,all,CtoT
106590,ORF1a;ORF1ab,ORF1a;ORF1ab,T3037C,TTT>TTC;TTT>TTC,True,41,True,False,20C,all,TtoC


In [57]:
# Synonymous mutations in the 'all' subset that are not reversions to reference,
# ordered from the highest count to the lowest
synonymous_non_reversions_by_count = (
    mutation_counts
    .query("synonymous")
    .query("subset == 'all'")
    .query("not reversion_to_ref")
    .sort_values("count", ascending=False)
)

# Display the 20 highest-count C-to-T mutations from that ordered table
synonymous_non_reversions_by_count.query("mut_type == 'CtoT'").head(20)

Unnamed: 0,protein,aa_mutation,nt_mutation,codon_change,synonymous,count,reversion_to_ref,reversion_to_founder,clade,subset,mut_type
444199,M,M,C26681T,TTC>TTT,True,1679,False,False,21J,all,CtoT
444201,ORF10,ORF10,C29614T,TGC>TGT,True,1586,False,False,21J,all,CtoT
444202,ORF1ab,ORF1ab,C19983T,GTC>GTT,True,1493,False,False,21J,all,CtoT
444203,ORF3a,ORF3a,C25521T,TTC>TTT,True,1479,False,False,21J,all,CtoT
444204,N,N,C29095T,TTC>TTT,True,1330,False,False,21J,all,CtoT
444212,ORF1ab,ORF1ab,C15240T,AAC>AAT,True,1159,False,False,21J,all,CtoT
444213,ORF1a;ORF1ab,ORF1a;ORF1ab,C1912T,TCC>TCT;TCC>TCT,True,1156,False,False,21J,all,CtoT
444216,ORF1a;ORF1ab,ORF1a;ORF1ab,C9430T,ATC>ATT;ATC>ATT,True,1124,False,False,21J,all,CtoT
444217,N,N,C28657T,GAC>GAT,True,1075,False,False,21J,all,CtoT
444221,ORF1a;ORF1ab,ORF1a;ORF1ab,C683T,CTA>TTA;CTA>TTA,True,1000,False,False,21J,all,CtoT


Compute a PCA of the mutation spectrum, with one point for each clade and with mutations stratified by whether they are reversions to reference:

In [53]:
# Frequency of each mutation type within every (clade, reversion_to_ref) group,
# arranged as one row per group and one column per mutation type.
#
# `mut_type_counts` holds one row per clade, subset, mutation type and reversion
# status, while the pivot index does not include `subset`, so several rows feed
# each cell of the table. They are combined with `aggfunc="sum"`: the
# `pivot_table` default is the mean, which would average the per-subset values
# and leave rows that no longer sum to one.
# NOTE(review): counts are pooled over every `subset` value (including 'all'),
# which also determines `total_count`; confirm that pooling is intended.
mut_type_freqs = (
    mut_type_counts
    .assign(
        total_count=lambda x: x.groupby(["clade", "reversion_to_ref"])["count"].transform("sum"),
        freq=lambda x: x["count"] / x["total_count"],
    )
    .pivot_table(
        index=["clade", "reversion_to_ref", "total_count"],
        values="freq",
        columns="mut_type",
        aggfunc="sum",
        fill_value=0,
    )
)
# Every row is a complete spectrum, so its frequencies must add up to one
assert numpy.allclose(mut_type_freqs.sum(axis=1), 1)

# Project the spectra onto their first two principal components
pca = sklearn.decomposition.PCA(n_components=2)
pca_coords = pca.fit_transform(mut_type_freqs.values)
assert len(pca_coords) == len(mut_type_freqs)

# Attach the PCA coordinates (and log10 of the group's total count) to the
# table, with the index levels turned back into ordinary columns.
mut_type_freqs_pca = (
    mut_type_freqs
    .reset_index()
    .assign(
        principal_component_1=pca_coords[:, 0],
        principal_component_2=pca_coords[:, 1],
        log10_total_count=lambda x: numpy.log10(x["total_count"]),
    )
)

Plot the mutation spectrum PCA for all clades, including both reversions to reference and non-reversions:

In [54]:
# Scatter plot of the two principal components: color encodes the clade and
# marker shape encodes whether the point is for reversions to reference.
mut_type_freqs_chart = (
    alt.Chart(mut_type_freqs_pca)
    .mark_point(filled=True, size=50)
    .encode(
        x="principal_component_1",
        y="principal_component_2",
        color=alt.Color("clade", scale=alt.Scale(scheme="viridis")),
        shape="reversion_to_ref",
        tooltip=["clade", "reversion_to_ref", "total_count"],
    )
)

mut_type_freqs_chart

Now do PCA on just non-reversion mutations:

In [55]:
# Keep only the rows for mutations that are not reversions to reference
mut_type_freqs_no_revert = mut_type_freqs.query("reversion_to_ref == False")

# Fit a new two-component PCA using just those rows
pca = sklearn.decomposition.PCA(n_components=2)
pca_coords_no_revert = pca.fit_transform(mut_type_freqs_no_revert.values)
assert len(pca_coords_no_revert) == len(mut_type_freqs_no_revert)

# Turn the index levels back into columns, then add the PCA coordinates and the
# base-10 logarithm of each row's total count
no_revert_table = mut_type_freqs_no_revert.reset_index()
mut_type_freqs_no_revert_pca = no_revert_table.assign(
    principal_component_1=pca_coords_no_revert[:, 0],
    principal_component_2=pca_coords_no_revert[:, 1],
    log10_total_count=numpy.log(no_revert_table["total_count"]) / numpy.log(10),
)

Plot the PCA on just non-reversion mutations:

In [56]:
# Slider spanning the observed log10 total counts; the lower limit is truncated
# to an integer by `int`.
log10_total_counts = mut_type_freqs_no_revert_pca["log10_total_count"]
total_count_slider = alt.binding_range(
    name="minimum log10 total counts",
    min=int(log10_total_counts.min()),
    max=log10_total_counts.max(),
)

# Selection bound to the slider: it holds the minimum log10 total count a point
# needs in order to be shown, and starts at 4.
total_count_selection = alt.selection_single(
    fields=["log10_total_count"],
    init={"log10_total_count": 4},
    bind=total_count_slider,
)

# PCA scatter plot colored by clade, showing only the points whose log10 total
# count is at least the slider value.
mut_type_freqs_no_revert_chart = (
    alt.Chart(mut_type_freqs_no_revert_pca)
    .mark_point(filled=True, size=50)
    .encode(
        x="principal_component_1",
        y="principal_component_2",
        color=alt.Color("clade", scale=alt.Scale(scheme="viridis")),
        tooltip=["clade", "total_count"],
    )
    .add_selection(total_count_selection)
    .transform_filter(
        total_count_selection.log10_total_count <= alt.datum.log10_total_count
    )
)

mut_type_freqs_no_revert_chart