# Mutation spectrum and rates at four-fold synonymous sites
Analyze the mutation rates and spectrum only at four-fold synonymous sites, as defined in relation to the founder sequence for that clade.

Get input variables from [papermill](https://papermill.readthedocs.io/) parameterization (note next cell is tagged as `parameters`).
When this notebook is run via `papermill`, these values are replaced with whatever the pipeline specifies:

In [1]:
# Input: CSV of mutation counts, read below to build `mutation_counts`.
mutation_counts_csv = "../results/mutation_counts/aggregated.csv"
# Input: CSV of clade founder nucleotides, read below for site composition.
clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"
# Output: per-clade rates are written here; the saved HTML charts go in the
# same directory.
rates_by_clade_csv = "../results/synonymous_mut_rates/rates_by_clade.csv"
# Output: pairwise distances between clade rate vectors are written here.
clade_rate_dist_csv = "../results/synonymous_mut_rates/clade_rate_distances.csv"

Import Python modules:

In [2]:
import itertools
import math
import os
import re

import altair as alt
import numpy
import pandas as pd
import scipy
import scipy.stats
import sklearn.decomposition
import sklearn.preprocessing
import yaml

Read the parameters from the `config.yaml` file:

In [3]:
# Use `config.yaml` from the working directory when it exists there,
# otherwise fall back to the copy one directory up.
config_path = "config.yaml" if os.path.isfile("config.yaml") else "../config.yaml"
with open(config_path) as f:
    config = yaml.safe_load(f)

# Settings this notebook needs from the pipeline configuration.
synonymous_spectra_min_counts = config["synonymous_spectra_min_counts"]
subset_order = [*config["sample_subsets"]]
clade_synonyms = config["clade_synonyms"]

Read the mutation counts and assign mutation types:

In [4]:
def _mut_type(nt_mutation):
    """Build the "<first char>to<last char>" label for one `nt_mutation` string."""
    return f"{nt_mutation[0]}to{nt_mutation[-1]}"


mutation_counts = pd.read_csv(mutation_counts_csv)
mutation_counts = mutation_counts.assign(
    mut_type=mutation_counts["nt_mutation"].map(_mut_type)
)

For each clade plot the top mutations as a fraction of all mutations in that clade, just using the "all" subset.

You can mouseover points to highlight mutations (which will highlight all mutations at that site on all facets), and click the legend to show/hide excluded or non-excluded mutations.

This plot is useful for identifying apparent outlier sites with aberrantly high mutation counts that can then be specified for exclusion (note those specifications are done in the pipeline `config.yaml` file, and also all reversions from clade founder to reference may be excluded):

In [5]:
top_n = 100  # plot this many per clade

# Frequency of each mutation as a fraction of ALL of that clade's mutation
# counts in the "all" subset.  The fraction is computed BEFORE truncating to
# the `top_n` mutations so that the denominator is the clade total rather than
# just the total of the plotted points.
mutation_freqs = (
    mutation_counts
    .query("subset == 'all'")
    .assign(
        freq=lambda x: x["count"] / x.groupby("clade")["count"].transform("sum"),
    )
    .sort_values(["clade", "count"], ascending=False)
    .groupby("clade")
    .head(n=top_n)
    .assign(
        # rank 1 is the most frequent mutation in the clade; ties keep row order
        rank=lambda x: x.groupby("clade")["freq"].rank(ascending=False, method="first"),
        # string labels so the legend shows "yes" / "no"
        exclude=lambda x: x["exclude"].map({True: "yes", False: "no"}),
    )
)

# clicking the legend shows / hides excluded or non-excluded mutations
select_exclude = alt.selection_multi(
    fields=["exclude"], bind="legend", init=[{"exclude": "yes"}, {"exclude": "no"}],
)

# mousing over a point highlights every mutation at that site in all facets
select_site = alt.selection_single(
    fields=["nt_site"], on="mouseover", empty="none",
)

mutation_freqs_chart = (
    alt.Chart(mutation_freqs)
    .encode(
        x="rank",
        y="freq",
        strokeWidth=alt.condition(select_site, alt.value(2), alt.value(0)),
        color=alt.Color("exclude", scale=alt.Scale(domain=["yes", "no"])),
        shape=alt.Shape("synonymous"),
        size=alt.condition(select_site, alt.value(50), alt.value(25)),
        tooltip=["nt_site", "nt_mutation", "count", "freq"],
    )
    .mark_point(filled=True, stroke="black")
    .properties(width=200, height=100)
    .facet("clade", columns=4)
    .add_selection(select_exclude, select_site)
    .transform_filter(select_exclude)
)

mutation_freqs_chart

Tally mutation type counts among **only four-fold synonymous** mutations for each clade and subset, also removing any mutations specified for exclusion:

In [6]:
# Keep only non-excluded mutations flagged as synonymous and four-fold
# degenerate, then total the counts of each mutation type per clade and subset.
_kept_mutations = mutation_counts
for _condition in ("synonymous", "four_fold_degenerate", "not exclude"):
    _kept_mutations = _kept_mutations.query(_condition)

mut_type_counts = _kept_mutations.groupby(
    ["clade", "subset", "mut_type"], as_index=False
)["count"].sum()

Now also repeat this tally of mutation type counts, but excluding any mutation that is among the top `exclude_top_n` (set in the next cell) most frequently observed mutations for any clade, and not doing any subsetting (just taking subset "all"):

In [7]:
exclude_top_n = 5  # exclude mutations in this top rank for any clade

# Same filters as the main tally, restricted to the "all" subset.
_ranked = mutation_counts
for _condition in (
    "synonymous",
    "four_fold_degenerate",
    "not exclude",
    "subset == 'all'",
):
    _ranked = _ranked.query(_condition)

# Rank mutations within each clade by count (rank 1 = most counts; tied
# mutations share the best rank) ...
_ranked = _ranked.assign(
    clade_rank=_ranked.groupby("clade")["count"].rank(ascending=False, method="min")
)
# ... and record the best rank each mutation reaches in any clade.
_ranked = _ranked.assign(
    highest_rank=_ranked.groupby("nt_mutation")["clade_rank"].transform("min")
)

# Drop every mutation that is in the top `exclude_top_n` of at least one
# clade, then total the counts per clade and mutation type.
mut_type_counts_exclude_top = (
    _ranked[_ranked["highest_rank"] > exclude_top_n]
    .groupby(["clade", "mut_type"], as_index=False)["count"]
    .sum()
)

Plot total mutation counts for each clade and subset on a log scale.
Also draw a line at our minimum cutoff: we only keep subsets above this cutoff:

In [8]:
# Total mutation counts for every clade / subset combination.
clade_counts = mut_type_counts.groupby(
    ["clade", "subset"], as_index=False
)["count"].sum()

clade_counts_chart = (
    alt.Chart(clade_counts)
    .mark_circle(size=50, opacity=0.7)
    .encode(
        x="clade",
        y=alt.Y("count", title="total mutations", scale=alt.Scale(type="log")),
        color="subset",
        tooltip=["clade", "subset", "count"],
    )
    .properties(width=alt.Step(18), height=175)
)

# dashed horizontal rule at the minimum-count cutoff
cutoff_data = pd.DataFrame({"y": [synonymous_spectra_min_counts]})
cutoff = alt.Chart(cutoff_data).mark_rule(strokeDash=[2, 2]).encode(y="y")

alt.layer(clade_counts_chart, cutoff).configure_axis(grid=False)

For genome partitioning, we subdivide the genome into halves based on the first and last site with an observed mutation:

In [9]:
n_partitions = 2

# Span from the first site with an observed mutation to one past the last,
# cut into `n_partitions` equal-width pieces.
_observed_sites = mutation_counts["nt_site"]
min_site, max_site = _observed_sites.min(), _observed_sites.max() + 1
partition_bounds = numpy.linspace(min_site, max_site, num=n_partitions + 1)

def assign_partition(r):
    """Return the label of the genome partition that contains site `r`.

    Partition ``i`` (numbered from 1) is the half-open interval
    ``[partition_bounds[i - 1], partition_bounds[i])``.  A site lying in none
    of the partitions gives ``None``.
    """
    lower_upper = zip(partition_bounds[:-1], partition_bounds[1:])
    for number, (lower, upper) in enumerate(lower_upper, start=1):
        if lower <= r < upper:
            return f"partition {number}"
    return None

# Label every mutation with the genome partition that holds its site.
mutation_counts = mutation_counts.assign(
    partition=mutation_counts["nt_site"].map(assign_partition)
)

If we choose to normalize nucleotide composition across partitions, we determine the parental nucleotide count in each sequence for each partition relative to the unpartitioned genome, and calculate the enrichment of each base in each partition:

In [10]:
_founder_nts = pd.read_csv(clade_founder_nts_csv)

# Count the four-fold degenerate sites of each nucleotide, per clade, both
# within each partition and over the whole genome (partition "all").
composition_by_partition = (
    pd.concat(
        [
            _founder_nts.assign(partition=_founder_nts["site"].map(assign_partition)),
            _founder_nts.assign(partition="all"),
        ],
    )
    .query("four_fold_degenerate")
    .groupby(["clade", "nt", "partition"], as_index=False)
    .agg(count=("site", "nunique"))
)
# Convert the counts to the fraction of each nucleotide within a clade/partition.
composition_by_partition["frac"] = (
    composition_by_partition["count"]
    / composition_by_partition.groupby(["clade", "partition"])["count"].transform("sum")
)
composition_by_partition = composition_by_partition.drop(columns="count")

# Whole-genome fractions, to be matched against each partition's fractions.
_whole_genome_frac = (
    composition_by_partition
    .query("partition == 'all'")
    [["clade", "nt", "frac"]]
    .rename(columns={"frac": "frac_all"})
)

partition_enrichment = composition_by_partition.query("partition != 'all'").merge(
    _whole_genome_frac, validate="many_to_one"
)
# enrichment > 1 means the nucleotide is over-represented in that partition
partition_enrichment["enrichment"] = (
    partition_enrichment["frac"] / partition_enrichment["frac_all"]
)
partition_enrichment = partition_enrichment.rename(columns={"nt": "clade_founder_nt"})

partition_enrichment

  pd.read_csv(clade_founder_nts_csv).assign(
  pd.read_csv(clade_founder_nts_csv).assign(partition="all"),


Unnamed: 0,clade,clade_founder_nt,partition,frac,frac_all,enrichment
0,19A,A,partition 1,0.285045,0.289684,0.983983
1,19A,A,partition 2,0.294368,0.289684,1.016169
2,19A,C,partition 1,0.127051,0.137070,0.926905
3,19A,C,partition 2,0.147184,0.137070,1.073786
4,19A,G,partition 1,0.058603,0.065002,0.901551
...,...,...,...,...,...,...
139,22C,C,partition 2,0.146179,0.136074,1.074266
140,22C,G,partition 1,0.059266,0.065438,0.905682
141,22C,G,partition 2,0.071666,0.065438,1.095168
142,22C,T,partition 1,0.529163,0.508386,1.040867


Get PCA of mutation spectrum, using only filtered synonymous mutation counts for non-excluded mutations for clades/subsets/partitions with adequate counts.

We do the PCA on four different ways of partitioning the data:

 1. Just looking at the "all" subset for each clade across entire genome.
 2. Looking at the "all" subset for each clade across entire genome, but excluding any mutation that is among the top ranked most counts for any clade.
 3. Looking at all subsets for each clade across entire genome.
 4. Looking at the "all" subset along partitions of the genome. 

We standardize the vectors of mutation fractions before doing the PCA.

There is also an option, currently not used, to apply a centered log-ratio (CLR) transform to the fraction of all mutations that are each type before performing the PCA, using the formula [here](https://www.rdocumentation.org/packages/compositions/versions/2.0-4/topics/clr) (see [this paper](https://www.jstor.org/stable/2335943)).

In the plots below, you can mouseover the points for details and click on clades in legends (shift click for multiple clades) to highlight just points for the selected clade(s).
You can also use the scroll bar to only show points with at least the indicated number of total synonymous mutation counts (after filtering):

In [11]:
def clr(frac_vec):
    """Return the centered log-ratio (CLR) transform of `frac_vec`.

    The natural log of every entry has the mean of all the logs subtracted
    from it.  Formula here:
    https://www.rdocumentation.org/packages/compositions/versions/2.0-4/topics/clr

    Every entry must be strictly positive, otherwise the assertion fails.
    """
    assert all(frac_vec > 0), "does not currently handle zeros in frac_vec"
    log_vec = numpy.log(frac_vec)
    return log_vec - 1 / len(frac_vec) * log_vec.sum()


def add_clade_synonym(clade):
    """Return "<clade> (<synonym>)" when `clade_synonyms` has an entry for `clade`.

    Clades without an entry are returned unchanged.
    """
    if clade not in clade_synonyms:
        return clade
    return f"{clade} ({clade_synonyms[clade]})"

    
# Run one PCA of the mutation spectra per configuration below.  Tuple fields:
#   title                 chart title; also turned into the saved HTML file name
#   subsets               sample subsets whose counts are used
#   partition             also split the counts by genome partition
#   exclude_top           also include spectra computed without the top mutations
#   clr_transform         CLR-transform the fractions before the PCA
#   normalize_partitions  divide counts by the partition's nucleotide enrichment
#   exclude_all           drop the aggregate points: the "all" subset when several
#                         subsets are given, the spectra that keep the top
#                         mutations, and the unpartitioned "all" partition
# NOTE: `mut_type_counts` is rebound on every iteration, so after the loop it
# holds the table built for the LAST configuration; cells further down read it.
for title, subsets, partition, exclude_top, clr_transform, normalize_partitions, exclude_all in [
    ("all samples, whole genome", ["all"], False, False, False, False, False),
    ("all samples, whole genome, +/- top mutations", ["all"], False, True, False, False, True),
    ("by region, whole genome", subset_order, False, False, False, False, True),
    ("all samples, partitioned genome", ["all"], True, False, False, False, True),
]:

    if len(subsets) > 1 and exclude_all:
        subsets = [s for s in subsets if s != "all"]

    filtered_mutation_counts = (
        mutation_counts
        .query("synonymous")
        .query("four_fold_degenerate")
        .query("not exclude")
        .query("subset in @subsets")
    )

    if partition:
        # whole-genome rows (relabeled "all") stacked on the per-partition rows
        filtered_mutation_counts = pd.concat(
            [
                filtered_mutation_counts.assign(partition="all"),
                filtered_mutation_counts,
            ]
        )
        if normalize_partitions:
            # the merge uses the columns shared with `partition_enrichment`;
            # rows without a match (e.g. partition "all") get enrichment 1
            filtered_mutation_counts = (
                filtered_mutation_counts
                .merge(
                    partition_enrichment[
                        ["clade", "partition", "clade_founder_nt", "enrichment"]
                    ],
                    how="left",
                    validate="many_to_one",
                )
                .assign(
                    enrichment=lambda x: x["enrichment"].fillna(1),
                    count=lambda x: x["count"] / x["enrichment"],
                )
            )
    else:
        filtered_mutation_counts = filtered_mutation_counts.assign(partition="all")

    mut_type_counts = (
        filtered_mutation_counts
        .groupby(["clade", "subset", "partition", "mut_type"], as_index=False)
        .aggregate({"count": "sum"})
    )

    if exclude_top:
        # `mut_type_counts_exclude_top` only exists for subset "all", whole genome
        assert all(mut_type_counts["partition"] == "all")
        assert all(mut_type_counts["subset"] == "all")
        mut_type_counts = pd.concat(
            [
                mut_type_counts.assign(excluded="no"),
                mut_type_counts_exclude_top.assign(
                    partition="all", subset="all", excluded="yes",
                ),
            ]
        )
    else:
        mut_type_counts = mut_type_counts.assign(excluded="no")

    # fraction of each mutation type, keeping only spectra with enough counts
    mut_type_freqs = (
        mut_type_counts
        .assign(
            total_count=lambda x: (
                x.groupby(["clade", "subset", "partition", "excluded"])
                ["count"].transform("sum")
            ),
            freq=lambda x: x["count"] / x["total_count"],
        )
        .query("total_count >= @synonymous_spectra_min_counts")
    )

    # one row per spectrum, one column per mutation type
    mut_type_freqs = (
        mut_type_freqs
        .pivot_table(
            index=["clade", "subset", "partition", "excluded", "total_count"],
            values="freq",
            columns="mut_type",
            fill_value=0,
        )
    )

    if exclude_top and exclude_all:
        mut_type_freqs = mut_type_freqs.query("excluded == 'yes'")
    if partition and exclude_all:
        mut_type_freqs = mut_type_freqs.query("partition != 'all'")

    if clr_transform:
        mut_type_freqs = mut_type_freqs.apply(clr, axis=1)

    # standardize each mutation-type column before the PCA
    scaled_freqs = sklearn.preprocessing.StandardScaler().fit_transform(mut_type_freqs.values)
    pca = sklearn.decomposition.PCA(n_components=2)
    pca_coords = pca.fit_transform(scaled_freqs)
    assert len(pca_coords) == len(mut_type_freqs)

    mut_type_freqs_pca_coords = (
        mut_type_freqs
        .reset_index()
        .assign(
            principal_component_1=pca_coords[:, 0],
            principal_component_2=pca_coords[:, 1],
            log10_total_count=lambda x: numpy.log10(x["total_count"]),
            clade=lambda x: x["clade"].map(add_clade_synonym),
        )
    )

    # percent variance explained by each component
    pca_var = 100 * pca.explained_variance_ratio_

    # slider selection; defined but currently not attached to the chart (see below)
    total_count_selection = alt.selection_single(
        fields=["log10_total_count"],
        init={"log10_total_count": numpy.log10(synonymous_spectra_min_counts)},
        bind=alt.binding_range(
            name="minimum log10 total counts",
            min=int(mut_type_freqs_pca_coords["log10_total_count"].min()),
            max=mut_type_freqs_pca_coords["log10_total_count"].max(),
        )
    )

    clade_selection = alt.selection_multi(fields=["clade"], bind="legend")

    tooltip = ["clade", "total_count"]

    plot_size = 300  # scaled by component variance explained

    pca_chart = (
        alt.Chart(mut_type_freqs_pca_coords)
        .encode(
            y=alt.Y(
                "principal_component_1",
                title=f"PC1 ({pca_var[0]:.0f}% variance)",
                scale=alt.Scale(nice=False, padding=10),
                axis=alt.Axis(labels=False, ticks=False),
            ),
            x=alt.X(
                "principal_component_2",
                title=f"PC2 ({pca_var[1]:.0f}% variance)",
                scale=alt.Scale(nice=False, padding=10),
                axis=alt.Axis(labels=False, ticks=False),
            ),
            color=alt.Color(
                "clade",
                scale=alt.Scale(
                    scheme="viridis",
                    domain=sorted(mut_type_freqs_pca_coords["clade"].unique()),
                ),
            ),
            strokeWidth=alt.condition(clade_selection, alt.value(1.5), alt.value(0)),
            opacity=alt.condition(clade_selection, alt.value(0.9), alt.value(0.45)),
            size=alt.condition(clade_selection, alt.value(65), alt.value(45)),
        )
        .mark_point(filled=True, stroke="black")
        .add_selection(clade_selection)
        # do not include the total count selection for now
        #.add_selection(total_count_selection)
        #.transform_filter(
        #    total_count_selection.log10_total_count <= alt.datum.log10_total_count
        #)
        .configure_axis(grid=False)
        .configure_legend(columns=1)
        .properties(
            height=plot_size, width=plot_size * pca_var[1] / pca_var[0],
            title=title,
        )
    )

    # the point shape encodes whichever extra dimension this configuration has
    if len(subsets) > 1:
        subset_selection = alt.selection_multi(fields=["subset"], bind="legend")
        pca_chart = (
            pca_chart
            .encode(shape=alt.Shape("subset", sort=subsets, scale=alt.Scale(domain=subsets)))
            .add_selection(subset_selection)
            .transform_filter(subset_selection)
        )
        tooltip.append("subset")

    if partition:
        partition_selection = alt.selection_multi(fields=["partition"], bind="legend")
        pca_chart = (
            pca_chart
            .encode(
                shape=alt.Shape(
                    "partition",
                    scale=alt.Scale(domain=mut_type_freqs_pca_coords["partition"].unique()),
                ),
            )
            .add_selection(partition_selection)
            .transform_filter(partition_selection)
        )
        tooltip.append("partition")

    if exclude_top:
        exclude_selection = alt.selection_multi(fields=["excluded"], bind="legend")
        shape_title = f"exclude top {exclude_top_n} mutations/clade"
        pca_chart = (
            pca_chart
            .encode(
                shape=alt.Shape(
                    "excluded",
                    title=shape_title,
                    scale=alt.Scale(domain=mut_type_freqs_pca_coords["excluded"].unique()),
                ),
            )
            .add_selection(exclude_selection)
            .transform_filter(exclude_selection)
        )
        tooltip.append(alt.Tooltip("excluded", title=shape_title))

    pca_chart = pca_chart.encode(tooltip=tooltip)

    # file name: the title with each run of non-word characters replaced by "_"
    # (raw string so that `\W` reaches the regex engine as written)
    chart_name = os.path.join(
        os.path.dirname(rates_by_clade_csv),
        re.sub(r"\W+", "_", title) + "_chart.html",
    )

    display(pca_chart)
    print(f"Saving to {chart_name}\n\n")
    pca_chart.save(chart_name)

Saving to ../results/synonymous_mut_rates/all_samples_whole_genome_chart.html




Saving to ../results/synonymous_mut_rates/all_samples_whole_genome_top_mutations_chart.html




Saving to ../results/synonymous_mut_rates/by_region_whole_genome_chart.html




Saving to ../results/synonymous_mut_rates/all_samples_partitioned_genome_chart.html




Compute statistical significance of differences between clades.
We just do this on "all" sequences for a clade, not partitioning the genomes:

In [12]:
# `mut_type_counts` was last rebound by the PCA loop above, whose final
# configuration is the partitioned genome: it therefore holds rows for
# partition "all" as well as for every individual partition.  Keep only the
# whole-genome ("all"), non-top-excluded rows; otherwise every mutation is
# counted twice in `total_count` and the pivot below would combine the
# whole-genome and per-partition counts.
all_mut_type_counts = (
    mut_type_counts
    .query("subset == 'all'")
    .query("partition == 'all'")
    .query("excluded == 'no'")
    .drop(columns=["subset", "partition", "excluded"])
    .assign(
        total_count=lambda x: x.groupby("clade")["count"].transform("sum"),
        clade=lambda x: x["clade"].map(add_clade_synonym),
    )
    .query("total_count >= @synonymous_spectra_min_counts")
    .drop(columns="total_count")
)

# Contingency table: one row per mutation type, one column per clade.
# Counts are summed (there is one row per cell after the filters above).
wide_all_mut_type_counts = all_mut_type_counts.pivot_table(
    index="mut_type",
    columns="clade",
    values="count",
    aggfunc="sum",
    fill_value=0,
)

Now run chi2 test.
Also, Bonferroni correct the P-values (this is conservative, but is fine as these P-values are so tiny):

In [13]:
min_p = 1e-20  # plot P-values less than this as this


def _chi2_record(clade_a, clade_b):
    """Chi-squared test on the mutation-type counts of one pair of clades."""
    statistic, p_value, _dof, _expected = scipy.stats.chi2_contingency(
        wide_all_mut_type_counts[[clade_a, clade_b]]
    )
    return (clade_a, clade_b, p_value, statistic)


# one test per unordered pair of clades
records = [
    _chi2_record(clade_a, clade_b)
    for clade_a, clade_b in itertools.combinations(wide_all_mut_type_counts.columns, 2)
]

chi2_stats = pd.DataFrame(records, columns=["clade_1", "clade_2", "p", "chi2"])
# floor the P-values first; the Bonferroni correction then multiplies the
# floored values by the number of comparisons and caps the result at 1
chi2_stats["p"] = chi2_stats["p"].clip(lower=min_p)
chi2_stats["bonferroni_p"] = (chi2_stats["p"] * len(chi2_stats)).clip(upper=1)

Plot the Bonferroni corrected P-values.
Note since counts are very large, many comparisons will be highly significant:

In [14]:
# Heat map of the Bonferroni corrected P-value for every pair of clades.
_p_fill = alt.Fill(
    "bonferroni_p",
    title="Bonferroni corrected P-value",
    scale=alt.Scale(type="log", scheme="yelloworangered", reverse=True),
    legend=alt.Legend(orient="top"),
)
_p_tooltip = ["clade_1", "clade_2"] + [
    alt.Tooltip(field, format=".2g") for field in ("p", "bonferroni_p", "chi2")
]

p_chart = (
    alt.Chart(chi2_stats)
    .mark_rect(stroke="black")
    .encode(
        x=alt.X("clade_1", title=None),
        y=alt.Y("clade_2", title=None),
        fill=_p_fill,
        tooltip=_p_tooltip,
    )
    .properties(width=alt.Step(14), height=alt.Step(14))
)

# the chart is saved next to the rates CSV
chart_title = os.path.join(os.path.dirname(rates_by_clade_csv), "p_value_chart.html")
print(f"Saving to {chart_title}")
p_chart.save(chart_title)

p_chart

Saving to ../results/synonymous_mut_rates/p_value_chart.html


Now get the mutation counts and fractions for each clade, both with and without excluding top mutations.
Then also compute a normalized **rate** for each mutation type, which is the fraction of all mutations that are of that type divided by the overall fraction of 4-fold synonymous sites that are the parental nucleotide in the mutation.

Note that a **caveat** is that for these rates we do not adjust the composition to account for any excluded 4-fold synonymous sites.
This is probably not currently a big problem, but could become a concern if a lot of sites are excluded:

In [15]:
# Number and fraction of four-fold degenerate sites with each founder
# nucleotide, per clade.
parent_composition = (
    pd.read_csv(clade_founder_nts_csv)
    .query("four_fold_degenerate")
    .groupby(["clade", "nt"], as_index=False)
    .agg(parent_nt_count=("site", "nunique"))
)
parent_composition["parent_nt_frac"] = (
    parent_composition["parent_nt_count"]
    / parent_composition.groupby("clade")["parent_nt_count"].transform("sum")
)

# Whole-genome counts for the "all" subset, with and without the top mutations.
_counts_with_top = (
    mut_type_counts
    .query("subset == 'all'")
    .query("partition == 'all'")
    [["clade", "mut_type", "count"]]
    .assign(exclude_top_mutations="no")
)
_counts_without_top = mut_type_counts_exclude_top.assign(exclude_top_mutations="yes")
_stacked_counts = pd.concat([_counts_with_top, _counts_without_top])

# Fraction of each mutation type within its clade / exclusion setting; the
# parent nucleotide is the first character of the mutation type.
_stacked_counts["total_count"] = _stacked_counts.groupby(
    ["clade", "exclude_top_mutations"]
)["count"].transform("sum")
_stacked_counts["fraction"] = _stacked_counts["count"] / _stacked_counts["total_count"]
_stacked_counts["parent_nt"] = _stacked_counts["mut_type"].str[0]

# Keep adequately sampled spectra and attach the parent nucleotide fraction.
mut_type_count_frac_rate = _stacked_counts[
    _stacked_counts["total_count"] >= synonymous_spectra_min_counts
].merge(
    parent_composition[["clade", "nt", "parent_nt_frac"]].rename(columns={"nt": "parent_nt"}),
    how="left",
    validate="many_to_one",
)
# rate = fraction of mutations of this type / fraction of sites with its parent nt
mut_type_count_frac_rate["rate"] = (
    mut_type_count_frac_rate["fraction"] / mut_type_count_frac_rate["parent_nt_frac"]
)

  pd.read_csv(clade_founder_nts_csv)


Plot the rates. The chart below is interactive, and you can click to select mutation types, clades, etc:

In [16]:
# Long-format table with one row per clade, mutation type and plotted
# quantity (fraction or rate), for spectra that keep the top mutations.
melted_df = (
    mut_type_count_frac_rate
    .query("exclude_top_mutations == 'no'")
    .assign(mut_type=lambda x: x["mut_type"].str.replace("to", " -> "))
    .melt(
        id_vars=["clade", "mut_type", "count"],
        value_vars=["fraction", "rate"],
    )
    .assign(
        variable=lambda x: x["variable"].map(
            {
                "fraction": "fraction of all mutations",
                "rate": "relative rate of mutation",
            }
        ),
        clade=lambda x: x["clade"].map(add_clade_synonym),
    )
)

# Selections are created here so that this cell does not depend on a
# variable left behind by the PCA loop in an earlier cell.
clade_selection = alt.selection_multi(fields=["clade"], bind="legend")
mut_type_selection = alt.selection_multi(fields=["mut_type"])

# strip of clickable rectangles used to pick the mutation types to show
mut_type_selection_bar = (
    alt.Chart(melted_df[["mut_type"]].drop_duplicates())
    .encode(
        x=alt.X("mut_type", title="click / shift-click to select mutation types"),
        color=alt.condition(mut_type_selection, alt.value("darkgray"), alt.value("white")),
    )
    .mark_rect(stroke="black")
    .add_selection(mut_type_selection)
    .properties(width=alt.Step(15))
)

frac_rate_chart_base = (
    alt.Chart(melted_df)
    .encode(
        x=alt.X("mut_type", title="mutation type"),
        y=alt.Y("value", title=None),
        color=alt.Color("clade", scale=alt.Scale(scheme="viridis")),
        strokeWidth=alt.condition(clade_selection, alt.value(0.5), alt.value(0)),
        opacity=alt.condition(clade_selection, alt.value(0.9), alt.value(0.2)),
        size=alt.condition(clade_selection, alt.value(50), alt.value(40)),
        column=alt.Column(
            "variable",
            title=None,
            header=alt.Header(labelOrient="left", labelFontSize=11, labelFontStyle="bold"),
        ),
        # float columns are shown with three significant digits
        tooltip=[
            alt.Tooltip(c, format=".3g") if melted_df[c].dtype == float else c
            for c in melted_df.columns.tolist()
        ],
    )
    .mark_point(filled=True, stroke="black")
    .add_selection(clade_selection, mut_type_selection)
    .transform_filter(mut_type_selection)
    .resolve_scale(y="independent")
    .properties(width=alt.Step(15), height=200)
)

frac_rate_chart = (
    (frac_rate_chart_base & mut_type_selection_bar)
    .configure_axis(grid=False)
    .configure_legend(columns=2)
    .configure(padding=20)
)

chart_title = os.path.join(os.path.dirname(rates_by_clade_csv), "frac_rate_chart.html")
print(f"Saving to {chart_title}")
frac_rate_chart.save(chart_title)

frac_rate_chart

Saving to ../results/synonymous_mut_rates/frac_rate_chart.html


Get rates to write to file.
Get the rates by clade **without** excluding the top mutations:

In [17]:
# Rows computed with the top mutations kept; the flag column is then redundant.
_keeps_top_mutations = mut_type_count_frac_rate["exclude_top_mutations"] == "no"
rates_by_clade = mut_type_count_frac_rate.loc[_keeps_top_mutations].drop(
    columns="exclude_top_mutations"
)

print(f"Writing rates to {rates_by_clade_csv}")
rates_by_clade.to_csv(rates_by_clade_csv, float_format="%.5g", index=False)

rates_by_clade

Writing rates to ../results/synonymous_mut_rates/rates_by_clade.csv


Unnamed: 0,clade,mut_type,count,total_count,fraction,parent_nt,parent_nt_frac,rate
0,20A,AtoC,286,17202,0.016626,A,0.289616,0.057407
1,20A,AtoG,1617,17202,0.094001,A,0.289616,0.324570
2,20A,AtoT,515,17202,0.029938,A,0.289616,0.103373
3,20A,CtoA,339,17202,0.019707,C,0.137038,0.143807
4,20A,CtoG,130,17202,0.007557,C,0.137038,0.055147
...,...,...,...,...,...,...,...,...
163,22C,GtoC,108,18958,0.005697,G,0.065438,0.087056
164,22C,GtoT,1616,18958,0.085241,G,0.065438,1.302619
165,22C,TtoA,713,18958,0.037609,T,0.508386,0.073978
166,22C,TtoC,3528,18958,0.186096,T,0.508386,0.366051


Also get rates by clade w/o G to T rates:

In [18]:
# Same rate calculation as for `rates_by_clade`, but with G to T mutations
# removed BEFORE the per-clade totals are taken, so fractions and rates are
# relative to all non-GtoT mutations.  Only the spectra that keep the top
# mutations are needed here, so the exclude-top table is not involved.
rates_by_clade_noGT = (
    mut_type_counts
    .query("subset == 'all'")
    .query("partition == 'all'")
    [["clade", "mut_type", "count"]]
    .query("mut_type != 'GtoT'")
    .assign(
        total_count=lambda x: x.groupby("clade")["count"].transform("sum"),
        fraction=lambda x: x["count"] / x["total_count"],
        # the parent nucleotide is the first character of the mutation type
        parent_nt=lambda x: x["mut_type"].str[0],
    )
    .query("total_count >= @synonymous_spectra_min_counts")
    .merge(
        parent_composition[["clade", "nt", "parent_nt_frac"]].rename(columns={"nt": "parent_nt"}),
        how="left",
        validate="many_to_one",
    )
    .assign(rate=lambda x: x["fraction"] / x["parent_nt_frac"])
)

rates_by_clade_noGT

Unnamed: 0,clade,mut_type,count,total_count,fraction,parent_nt,parent_nt_frac,rate
0,20A,AtoC,286,14532,0.019681,A,0.289616,0.067954
1,20A,AtoG,1617,14532,0.111272,A,0.289616,0.384204
2,20A,AtoT,515,14532,0.035439,A,0.289616,0.122366
3,20A,CtoA,339,14532,0.023328,C,0.137038,0.170229
4,20A,CtoG,130,14532,0.008946,C,0.137038,0.065280
...,...,...,...,...,...,...,...,...
149,22C,GtoA,1040,17342,0.059970,G,0.065438,0.916437
150,22C,GtoC,108,17342,0.006228,G,0.065438,0.095168
151,22C,TtoA,713,17342,0.041114,T,0.508386,0.080872
152,22C,TtoC,3528,17342,0.203437,T,0.508386,0.400162


Compute distances between clade rates:
 - *rates*: Euclidean norm of rate vectors
 - *no_GtoT_rates*: Euclidean norm of rate vectors excluding G to T mutations
 - *clr_rates*: Euclidean norm of CLR-transformed rate vectors

In [19]:
# Display the clade -> synonym mapping that was read from the config file.
clade_synonyms

{'20A': 'B.1',
 '20B': 'B.1.1',
 '20C': 'B.1.367',
 '20E': 'B.1.177',
 '20G': 'B.1.2',
 '20I': 'Alpha',
 '21C': 'Epsilon',
 '21I': 'Delta',
 '21J': 'Delta',
 '21K': 'Omicron BA.1',
 '21L': 'Omicron BA.2',
 '22A': 'Omicron BA.4',
 '22B': 'Omicron BA.5',
 '22C': 'Omicron BA.2.12.1'}

In [20]:
def _rates_by_mut_type(rates_df):
    """Pivot long-format rates into a clade x mutation-type matrix.

    Columns are the sorted mutation types.  A mutation type with no row for a
    clade (no observed counts) is given rate 0, so every clade's vector has
    the same entries in the same order.
    """
    return rates_df.pivot(index="clade", columns="mut_type", values="rate").fillna(0)


def _euclidean(vec1, vec2):
    """Euclidean distance between two equal-length numeric arrays."""
    return math.sqrt(((vec1 - vec2) ** 2).sum())


# Build the per-clade rate vectors once rather than re-querying for each pair.
_rate_matrix = _rates_by_mut_type(rates_by_clade)
_rate_matrix_noGT = _rates_by_mut_type(rates_by_clade_noGT)

clade_rate_dist = []
for clade1, clade2 in itertools.combinations(rates_by_clade["clade"].unique(), 2):
    rates1 = _rate_matrix.loc[clade1].values
    rates2 = _rate_matrix.loc[clade2].values

    clade_rate_dist.append(("rates", clade1, clade2, _euclidean(rates1, rates2)))

    # A clade can pass the count threshold with G to T mutations but fail it
    # without them; such pairs get a missing distance instead of a bogus one.
    if clade1 in _rate_matrix_noGT.index and clade2 in _rate_matrix_noGT.index:
        d_no_GtoT = _euclidean(
            _rate_matrix_noGT.loc[clade1].values,
            _rate_matrix_noGT.loc[clade2].values,
        )
    else:
        d_no_GtoT = float("nan")
    clade_rate_dist.append(("no_GtoT_rates", clade1, clade2, d_no_GtoT))

    # `clr` requires strictly positive rates and asserts otherwise
    d_clr = _euclidean(clr(rates1), clr(rates2))
    clade_rate_dist.append(("clr_rates", clade1, clade2, d_clr))

clade_rate_dist = pd.DataFrame(
    clade_rate_dist, columns=["rate_type", "clade_1", "clade_2", "mut_rate_distance"],
)

def is_omicron(row):
    """Classify a pair of clades by whether their synonyms mention Omicron.

    `row` must provide the clade names under "clade_1" and "clade_2".
    Returns "Omicron" when both synonyms in `clade_synonyms` contain
    "Omicron", "mixed" when exactly one does, and "not Omicron" otherwise.
    A clade that has no entry in `clade_synonyms` is treated as not Omicron
    rather than raising ``KeyError``.
    """
    omicron_flags = [
        "Omicron" in clade_synonyms.get(row[column], "")
        for column in ("clade_1", "clade_2")
    ]
    if all(omicron_flags):
        return "Omicron"
    if any(omicron_flags):
        return "mixed"
    return "not Omicron"
    
# Tag every pair of clades with its Omicron status, then save the table.
clade_rate_dist = clade_rate_dist.assign(
    is_Omicron=clade_rate_dist.apply(is_omicron, axis=1)
)

print(f"Writing clade rate distances to {clade_rate_dist_csv}")
clade_rate_dist.to_csv(clade_rate_dist_csv, float_format="%.5g", index=False)

clade_rate_dist

Writing clade rate distances to ../results/synonymous_mut_rates/clade_rate_distances.csv


Unnamed: 0,rate_type,clade_1,clade_2,mut_rate_distance,is_Omicron
0,rates,20A,20B,0.122766,not Omicron
1,no_GtoT_rates,20A,20B,0.135596,not Omicron
2,clr_rates,20A,20B,0.457610,not Omicron
3,rates,20A,20C,0.105401,not Omicron
4,no_GtoT_rates,20A,20C,0.071930,not Omicron
...,...,...,...,...,...
268,no_GtoT_rates,22A,22C,0.143362,Omicron
269,clr_rates,22A,22C,0.266588,Omicron
270,rates,22B,22C,0.204203,Omicron
271,no_GtoT_rates,22B,22C,0.203933,Omicron


Correlations among the various distances:

In [21]:
# Correlation between the three distance measures across clade pairs.
# `values` is given explicitly (as a list, which keeps the two-level column
# labels) so that the string column `is_Omicron` is never passed to the
# default mean aggregation.
(
    clade_rate_dist
    .pivot_table(
        index=["clade_1", "clade_2"],
        columns="rate_type",
        values=["mut_rate_distance"],
    )
    .corr()
    .round(3)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mut_rate_distance,mut_rate_distance,mut_rate_distance
Unnamed: 0_level_1,rate_type,clr_rates,no_GtoT_rates,rates
Unnamed: 0_level_2,rate_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
mut_rate_distance,clr_rates,1.0,0.171,0.334
mut_rate_distance,no_GtoT_rates,0.171,1.0,0.296
mut_rate_distance,rates,0.334,0.296,1.0


Number of 4-fold degenerate sites and mutation counts at them:

In [22]:
# Number of four-fold degenerate sites in each clade founder.
_four_fold_site_counts = (
    pd.read_csv(clade_founder_nts_csv)
    .query("four_fold_degenerate")
    .groupby("clade", as_index=False)
    .agg(n_four_fold_sites=("site", "nunique"))
)

# Attach the whole-genome, "all"-subset counts that keep the top mutations.
_whole_genome_counts = _four_fold_site_counts.merge(mut_type_counts)
for _condition in ("subset == 'all'", "partition == 'all'", "excluded == 'no'"):
    _whole_genome_counts = _whole_genome_counts.query(_condition)

_whole_genome_counts = _whole_genome_counts.assign(
    total_mutations=_whole_genome_counts.groupby("clade")["count"].transform("sum")
)

# One row per adequately sampled clade, one column per mutation type.
summary_stats = (
    _whole_genome_counts[
        _whole_genome_counts["total_mutations"] >= synonymous_spectra_min_counts
    ]
    .pivot_table(
        index=["clade", "n_four_fold_sites", "total_mutations"],
        values="count",
        columns="mut_type",
    )
    .reset_index()
)

summary_stats

  pd.read_csv(clade_founder_nts_csv)


mut_type,clade,n_four_fold_sites,total_mutations,AtoC,AtoG,AtoT,CtoA,CtoG,CtoT,GtoA,GtoC,GtoT,TtoA,TtoC,TtoG
0,20A,4247,17202,286,1617,515,339,130,7113,837,137,2670,565,2598,395
1,20B,4247,14121,155,1288,401,281,84,5987,627,98,2250,451,2148,351
2,20C,4246,9344,123,945,264,218,27,3863,474,67,1399,338,1400,226
3,20E,4246,10454,137,1105,299,214,119,4591,452,48,1313,363,1578,235
4,20G,4243,14019,170,1281,459,297,36,6229,709,71,1904,490,2036,337
5,20I,4243,60858,708,5952,1742,1345,956,26348,2757,316,8274,2105,8862,1493
6,21C,4245,6308,81,568,206,144,26,2766,303,29,834,226,951,174
7,21I,4241,24117,349,2807,781,655,180,9139,1280,175,3662,659,3823,607
8,21J,4239,282051,4346,31879,9092,7962,3066,106378,14454,1989,41591,7627,47290,6377
9,21K,4241,113721,1573,12349,3826,3113,758,46856,6415,641,8009,6144,21422,2615
