# Count mutations from `matUtils` translated mutations
Get variables from `snakemake`:

In [None]:
# Input file paths taken from `snakemake.input`.
input_tsv = snakemake.input.tsv
ref_fasta = snakemake.input.ref_fasta
clade_founder_fasta = snakemake.input.clade_founder_fasta
ref_to_founder_muts_csv = snakemake.input.ref_to_founder_muts

# Output file path taken from `snakemake.output`.
output_csv = snakemake.output.csv

# Filtering thresholds and exclusion settings taken from `snakemake.params`.
max_nt_mutations = snakemake.params.max_nt_mutations
max_reversions_to_ref = snakemake.params.max_reversions_to_ref
max_reversions_to_founder = snakemake.params.max_reversions_to_clade_founder
sites_to_exclude = snakemake.params.sites_to_exclude
exclude_ref_to_founder_muts = snakemake.params.exclude_ref_to_founder_muts

Import Python modules:

In [None]:
import Bio.SeqIO

import numpy

import pandas as pd

Get the reference and clade founder sequences:

In [None]:
# Each FASTA file is read as a single record (reference first, then clade
# founder) and its sequence is kept as a plain string.
ref, founder = (
    str(Bio.SeqIO.read(fasta_path, "fasta").seq)
    for fasta_path in (ref_fasta, clade_founder_fasta)
)

Get the sites and mutations to exclude:

In [None]:
# Normalize the configured sites into a set; a falsy value (e.g. None or an
# empty list) becomes the empty set.
sites_to_exclude = set(sites_to_exclude or ())
print(f"There are {len(sites_to_exclude)} sites to exclude")

# The "mutation" column of the CSV is only read when the flag is set;
# otherwise no mutations are excluded.
muts_to_exclude = (
    set(pd.read_csv(ref_to_founder_muts_csv)["mutation"])
    if exclude_ref_to_founder_muts
    else set()
)
print(f"There are {len(muts_to_exclude)} mutations to exclude")

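Process mutations: drop rows whose `nt_mutations` field contains a comma, filter nodes on the maximum allowed numbers of nucleotide mutations and reversions, and collapse to one row per node and nucleotide mutation. Then count how often each mutation occurs and flag the ones to exclude: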

In [None]:
def _count_matches_to_seq(nt_muts, seq):
    """Count distinct mutations in `nt_muts` whose last character matches `seq`.

    Each mutation is a string of the form <char><1-based site><char>; the
    final character is compared with `seq` at that site.
    """
    return sum(mut[-1] == seq[int(mut[1:-1]) - 1] for mut in set(nt_muts))


# Read the table and drop rows whose `nt_mutations` entry contains a comma.
node_muts = pd.read_csv(input_tsv, sep="\t").query(
    "not nt_mutations.str.contains(',')"
)

# Turn the ";"-delimited columns into per-row lists.
node_muts = node_muts.assign(
    **{
        list_col: node_muts[list_col].str.split(";")
        for list_col in ("nt_mutations", "codon_changes", "aa_mutations")
    }
)

# Per-row counts over the *distinct* nucleotide mutations: how many there are,
# and how many end in the reference / clade founder nucleotide at their site.
nt_mut_lists = node_muts["nt_mutations"]
node_muts = node_muts.assign(
    n_nt_mutations=nt_mut_lists.map(lambda muts: len(set(muts))),
    n_reversions_to_ref=nt_mut_lists.map(
        lambda muts: _count_matches_to_seq(muts, ref)
    ),
    n_reversions_to_founder=nt_mut_lists.map(
        lambda muts: _count_matches_to_seq(muts, founder)
    ),
)

# Keep only rows within the configured limits.
node_muts = (
    node_muts
    .query("n_reversions_to_ref <= @max_reversions_to_ref")
    .query("n_reversions_to_founder <= @max_reversions_to_founder")
    .query("n_nt_mutations <= @max_nt_mutations")
)

# One row per list element; the three list columns are exploded in lockstep.
per_mut = node_muts.explode(["aa_mutations", "nt_mutations", "codon_changes"])

# Each `aa_mutations` element is "<protein>:<aa mutation>".
aa_fields = per_mut["aa_mutations"].str.split(":")
aa_change = aa_fields.str[1]
per_mut = per_mut.assign(
    protein=aa_fields.str[0],
    aa_mutation=aa_change,
    # Synonymous when the first and last characters of the mutation agree.
    synonymous=aa_change.map(lambda aa: aa[0] == aa[-1]),
).rename(columns={"nt_mutations": "nt_mutation", "codon_changes": "codon_change"})

# Collapse to one row per node and nucleotide mutation, joining the
# per-protein annotations with ";" and requiring all of them to be synonymous.
mutations = per_mut.groupby(["node_id", "nt_mutation"], as_index=False).aggregate(
    **{
        joined_col: pd.NamedAgg(joined_col, lambda vals: ";".join(vals))
        for joined_col in ("protein", "aa_mutation", "codon_change")
    },
    synonymous=pd.NamedAgg("synonymous", "all"),
)

# Number of rows (counted via `node_id`) for each distinct mutation, most
# frequent first.
counts_by_mutation = (
    mutations
    .groupby(
        ["protein", "aa_mutation", "nt_mutation", "codon_change", "synonymous"],
        as_index=False,
    )
    .aggregate(count=pd.NamedAgg("node_id", "count"))
    .sort_values("count", ascending=False)
)

# The site is the integer between the first and last characters of the mutation.
nt_site = counts_by_mutation["nt_mutation"].str[1:-1].astype(int)

# Masks for the two exclusion criteria, computed once and reused below.
at_excluded_site = nt_site.isin(sites_to_exclude)
is_excluded_mut = counts_by_mutation["nt_mutation"].isin(muts_to_exclude)

mutation_counts = counts_by_mutation.assign(
    nt_site=nt_site,
    # Sites are 1-based, the sequence strings are 0-based.
    reference_nt=nt_site.map(lambda site: ref[site - 1]),
    clade_founder_nt=nt_site.map(lambda site: founder[site - 1]),
    exclude=at_excluded_site | is_excluded_mut,
    # An excluded site takes precedence over an excluded mutation; rows
    # matching neither get a missing value.
    exclude_reason=numpy.where(
        at_excluded_site,
        "site to exclude",
        numpy.where(is_excluded_mut, "reference founder difference", pd.NA),
    ),
)

mutation_counts.to_csv(output_csv, index=False)