# Compute expected counts of mutations from synonymous mutation rates and counts.

Import Python modules:

In [1]:
import altair as alt

import pandas as pd

Get input variables from [papermill](https://papermill.readthedocs.io/) parameterization (note next cell is tagged as `parameters`).
So when this notebook is run via `papermill`, those values will be replaced with whatever values the pipeline supplies:

In [2]:
# Default file paths; these are the values that papermill replaces when the
# notebook is run as part of the pipeline.
# Input: CSV read below into `clade_founder_nts`.
clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"
# Input: CSV read below into `rates_by_clade`.
rates_by_clade_csv = "../results/synonymous_mut_rates/rates_by_clade.csv"
# Output: CSV to which the expected mutation counts are written.
expected_counts_csv = "../results/expected_mut_counts/expected_mut_counts.csv"

Read input data:

In [3]:
# Per-clade mutation rates and counts.
rates_by_clade = pd.read_csv(
    rates_by_clade_csv,
)

# Founder nucleotide of each clade at each site. `low_memory=False` makes
# pandas parse the file in one pass rather than in chunks, which avoids
# mixed-dtype inference within a column.
clade_founder_nts = pd.read_csv(
    clade_founder_nts_csv,
    low_memory=False,
)

For each clade, we first compute $T$ that satisfies 
$$N_s = T \sum\limits_{nt_1} s_{nt_1} \sum\limits_{nt_2 \ne nt_1} r_{nt_1\rightarrow nt_2}$$
where $N_s$ is the total number of mutations at 4-fold degenerate synonymous sites observed for the clade, $s_{nt}$ is the number of 4-fold degenerate synonymous sites in the clade founder that are nucleotide $nt$, and $r_{nt_1\rightarrow nt_2}$ is the non-normalized rate of mutations from nucleotide $nt_1$ to $nt_2$ at 4-fold degenerate synonymous sites.

The expected number of mutations at each site (under neutrality) from the parental identity of $nt_1$ to some other identity of $nt_2$ is then simply $T \times r_{nt_1\rightarrow nt_2}$, which we will call the normalized rate for that clade:

In [4]:
# First compute the normalization factor T for each clade, i.e. the value that
# satisfies  N_s = T * sum_{nt1} s_{nt1} * sum_{nt2 != nt1} r_{nt1 -> nt2}.
# The `total_count` column of `rates_by_clade` plays the role of N_s.
normalization_factor = (
    # s_nt: number of distinct 4-fold degenerate sites per clade and founder nt
    clade_founder_nts.query("four_fold_degenerate")
    .groupby(["clade", "nt"], as_index=False)
    .aggregate(sites_4fold=pd.NamedAgg("site", "nunique"))
    .rename(columns={"nt": "parent_nt"})
    # Attach the rate of every mutation type away from each parent nucleotide.
    # The join keys are spelled out so that the join cannot silently change if
    # the two frames ever share additional column names.
    .merge(rates_by_clade, on=["clade", "parent_nt"], validate="one_to_many")
    # inner sum: total rate of mutating away from each parent nucleotide
    .groupby(["clade", "parent_nt", "total_count", "sites_4fold"], as_index=False)
    .aggregate(summand_2=pd.NamedAgg("rate", "sum"))
    # weight the inner sum by the number of sites with that parent nucleotide
    .assign(s_nt_1_summand_2=lambda x: x["sites_4fold"] * x["summand_2"])
    # outer sum: add up the weighted terms over parent nucleotides
    .groupby(["clade", "total_count"], as_index=False)
    .aggregate(summand_1=pd.NamedAgg("s_nt_1_summand_2", "sum"))
    # T = N_s / (double sum)
    .assign(normalization_factor=lambda x: x["total_count"] / x["summand_1"])
    .drop(columns="summand_1")
)

display(normalization_factor)

# Expected count for every (clade, site, mutation type): attach each clade's
# normalization factor T to every founder site, pair each site with the rates
# of the mutations away from its founder nucleotide, and take T * rate.
expected_muts = (
    normalization_factor.merge(
        clade_founder_nts.rename(columns={"nt": "parent_nt"}),
        on="clade",
        validate="one_to_many",
    )
    .merge(
        rates_by_clade.loc[:, ["clade", "parent_nt", "mut_type", "rate"]],
        on=["clade", "parent_nt"],
    )
    .assign(expected_count=lambda df: df["normalization_factor"] * df["rate"])
    .rename(columns={"parent_nt": "clade_founder_nt"})
    # keep only the columns that are written out, in this order
    .loc[
        :,
        [
            "clade",
            "site",
            "clade_founder_nt",
            "gene",
            "codon",
            "codon_position",
            "codon_site",
            "mut_type",
            "four_fold_degenerate",
            "expected_count",
        ],
    ]
)

display(expected_muts)

# Save the expected counts without the index, with floats written to 5
# significant digits.
print(f"Writing expected counts to {expected_counts_csv}")
expected_muts.to_csv(
    expected_counts_csv,
    index=False,
    float_format="%.5g",
)

Unnamed: 0,clade,total_count,normalization_factor
0,20A,17089,4.02377
1,20B,14024,3.302114
2,20C,9210,2.16911
3,20E,10444,2.459717
4,20G,13850,3.264225
5,20I,60789,14.326908
6,21C,6306,1.485528
7,21I,24055,5.672056
8,21J,281410,66.385796
9,21K,113017,26.648926


Unnamed: 0,clade,site,clade_founder_nt,gene,codon,codon_position,codon_site,mut_type,four_fold_degenerate,expected_count
0,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,1;1,AtoC,False,0.230892
1,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,1;1,AtoG,False,1.307323
2,20A,266,A,ORF1a;ORF1ab,ATG;ATG,1;1,1;1,AtoT,False,0.415454
3,20A,270,A,ORF1a;ORF1ab,GAG;GAG,2;2,2;2,AtoC,False,0.230892
4,20A,270,A,ORF1a;ORF1ab,GAG;GAG,2;2,2;2,AtoG,False,1.307323
...,...,...,...,...,...,...,...,...,...,...
1139965,22B,29668,C,ORF10,CTC,3,37,CtoG,True,0.279999
1139966,22B,29668,C,ORF10,CTC,3,37,CtoT,True,33.661305
1139967,22B,29670,C,ORF10,ACA,2,38,CtoA,False,2.172191
1139968,22B,29670,C,ORF10,ACA,2,38,CtoG,False,0.279999


Writing expected counts to ../results/expected_mut_counts/expected_mut_counts.csv


As a sanity check, compare the total expected and actual counts of each mutation type at 4-fold degenerate sites to make sure they are highly correlated:

In [5]:
# Total expected counts at 4-fold degenerate sites for each clade and mutation
# type, paired with the actual counts recorded in `rates_by_clade`.
expected_vs_actual_4fold_total = (
    expected_muts.query("four_fold_degenerate")
    .groupby(["clade", "mut_type"], as_index=False)
    .aggregate({"expected_count": "sum"})
    .merge(
        rates_by_clade[["clade", "mut_type", "count"]].rename(
            columns={"count": "actual_count"}
        ),
        # explicit keys so the join does not depend on which names are shared
        on=["clade", "mut_type"],
        validate="one_to_one",
    )
)

display(expected_vs_actual_4fold_total)

# Quantify the agreement instead of relying on the plot alone.
expected_vs_actual_corr = expected_vs_actual_4fold_total["expected_count"].corr(
    expected_vs_actual_4fold_total["actual_count"]
)
print(
    "Pearson correlation of expected vs actual 4-fold counts: "
    f"{expected_vs_actual_corr:.4f}"
)

# Log-log scatter of expected versus actual counts, one point per clade and
# mutation type; the tooltip shows every column of the table.
(
    alt.Chart(expected_vs_actual_4fold_total)
    .encode(
        x=alt.X("expected_count", scale=alt.Scale(type="log")),
        y=alt.Y("actual_count", scale=alt.Scale(type="log")),
        tooltip=expected_vs_actual_4fold_total.columns.tolist(),
    )
    .mark_point()
    .configure_axis(grid=False)
)

Unnamed: 0,clade,mut_type,expected_count,actual_count
0,20A,AtoC,283.997099,284
1,20A,AtoG,1608.006994,1608
2,20A,AtoT,511.008686,511
3,20A,CtoA,334.999345,335
4,20A,CtoG,129.999885,130
...,...,...,...,...
151,22B,GtoC,310.989034,311
152,22B,GtoT,4332.863788,4333
153,22B,TtoA,2017.008820,2017
154,22B,TtoC,8348.925738,8349
