# Merge expected and actual counts

First, read the data:

In [None]:
import itertools

import Bio.Seq

import pandas as pd


# Expected counts: rename to the nucleotide-level column names used in the rest
# of the notebook, then label each mutation as
# <clade_founder_nt><nt_site><last character of mut_type>.
expected = pd.read_csv(snakemake.input.expected).rename(
    columns={"site": "nt_site", "codon": "clade_founder_codon"}
)
expected = expected.assign(
    nt_mutation=(
        expected["clade_founder_nt"]
        + expected["nt_site"].astype(str)
        + expected["mut_type"].str[-1]
    )
)

# Actual counts: keep only the listed columns, renaming `count` to
# `actual_count`.
actual_columns = [
    "clade",
    "subset",
    "nt_site",
    "nt_mutation",
    "codon_change",
    "count",
    "count_terminal",
    "count_non_terminal",
    "mean_log_size",
]
actual = pd.read_csv(snakemake.input.actual)[actual_columns].rename(
    columns={"count": "actual_count"}
)

# Mutations to exclude: align column names with `expected` and `actual`.
muts_to_exclude = pd.read_csv(snakemake.input.muts_to_exclude).rename(
    columns={"site": "nt_site", "mutation": "nt_mutation"}
)

Merge the expected and actual counts, setting the counts of any expected mutations that are not observed to zero:

In [None]:
# Left-merge the expected mutations with each subset's observed counts so that
# every expected mutation has a row in every subset. Counts that are missing
# after the merge (nothing observed) become integer zeros.
per_subset_frames = []
for subset_name, subset_df in actual.groupby("subset"):
    subset_merged = expected.merge(subset_df, how="left", validate="one_to_many")
    per_subset_frames.append(
        subset_merged.assign(
            subset=subset_name,
            actual_count=subset_merged["actual_count"].fillna(0).astype(int),
            count_terminal=subset_merged["count_terminal"].fillna(0).astype(int),
            count_non_terminal=(
                subset_merged["count_non_terminal"].fillna(0).astype(int)
            ),
        )
    )

merged = pd.concat(per_subset_frames, ignore_index=True)

Now exclude any mutations that do not arise from the clade founder codon identity. This happens when there are multiple substitutions per site, so that a clade founder identity is different from the reference:

In [None]:
# Starting codon(s) of each observed change: the first three characters of
# every ";"-separated entry of `codon_change`; missing values stay missing.
observed_start_codon = merged["codon_change"].map(
    lambda change: (
        pd.NA
        if pd.isnull(change)
        else ";".join(entry[:3] for entry in change.split(";"))
    )
)

# Keep rows whose starting codon equals the clade founder codon, and rows with
# no recorded codon change at all.
merged = merged.assign(
    start_codon=observed_start_codon,
    from_founder_codon=(
        (observed_start_codon == merged["clade_founder_codon"])
        | observed_start_codon.isnull()
    ),
).query("from_founder_codon")

# add back any zero count mutations lost to this process
expected_by_subset = pd.concat(
    [expected.assign(subset=subset_name) for subset_name in merged["subset"].unique()]
)
merged = merged.merge(expected_by_subset, how="outer")
merged = merged.assign(
    actual_count=merged["actual_count"].fillna(0).astype(int),
    count_terminal=merged["count_terminal"].fillna(0).astype(int),
    count_non_terminal=merged["count_non_terminal"].fillna(0).astype(int),
    mean_log_size=merged["mean_log_size"].fillna(0),
)

Now flag the mutations to exclude:

In [None]:
# Flag the mutations listed in `muts_to_exclude`. The left merge leaves missing
# values in `exclude` for every mutation that is not listed; those are filled
# with False, as are missing values in `masked_in_usher`.
#
# The explicit `.astype(bool)` pins both flags to a real boolean dtype. Without
# it the result dtype of `fillna(False)` on the object column produced by the
# merge depends on pandas' silent downcasting, which is deprecated.
# NOTE(review): assumes `masked_in_usher` holds only True/False/missing.
merged = merged.merge(
    muts_to_exclude.assign(exclude=True),
    on=["clade", "nt_site", "nt_mutation"],
    how="left",
    validate="many_to_one",
).assign(
    exclude=lambda x: x["exclude"].fillna(False).astype(bool),
    masked_in_usher=lambda x: x["masked_in_usher"].fillna(False).astype(bool),
)

merged

Get the amino-acid mutation translations:

In [None]:
# Map each of the 64 codons over the alphabet ACGT to the string that
# Biopython's `Seq.translate()` returns for it.
translation_table = {
    codon: str(Bio.Seq.Seq(codon).translate())
    for codon in map("".join, itertools.product("ACGT", repeat=3))
}


def mutant_codon(row):
    """Return the founder codon of `row` with its nucleotide mutation applied.

    `row` must provide `clade_founder_codon`, the 1-based `codon_position` of
    the mutated nucleotide within that codon, and `nt_mutation`, whose first
    character is the original nucleotide and whose last character is the new
    nucleotide.
    """
    founder = row["clade_founder_codon"]
    index = row["codon_position"] - 1  # 1-based position -> 0-based index
    mutation = row["nt_mutation"]
    # the nucleotide being mutated must match the founder codon at that position
    assert mutation[0] == founder[index]
    return founder[:index] + mutation[-1] + founder[index + 1 :]


# Build one row per (clade, nt_mutation) describing its amino-acid effect.
#
# `clade_founder_codon`, `codon_position` and `codon_site` may hold several
# ";"-separated entries per mutation. They are split and exploded to one row
# per entry, translated, and then re-joined with ";" per (clade, nt_mutation).
add_aa_muts = (
    merged[
        ["clade", "nt_mutation", "clade_founder_codon", "codon_position", "codon_site"]
    ]
    .drop_duplicates()
    .assign(
        clade_founder_codon=lambda x: x["clade_founder_codon"].str.split(";"),
        # Cast to str before splitting: if no entry contains ";", pandas may
        # have parsed these columns as integers, and `.str` would then raise.
        codon_position=lambda x: x["codon_position"].astype(str).str.split(";"),
        codon_site=lambda x: x["codon_site"].astype(str).str.split(";"),
    )
    .explode(["clade_founder_codon", "codon_position", "codon_site"])
    .assign(
        codon_position=lambda x: x["codon_position"].astype(int),
        mutant_codon=lambda x: x.apply(mutant_codon, axis=1),
        clade_founder_aa=lambda x: x["clade_founder_codon"].map(translation_table),
        mutant_aa=lambda x: x["mutant_codon"].map(translation_table),
        aa_mutation=lambda x: (
            x["clade_founder_aa"] + x["codon_site"].astype(str) + x["mutant_aa"]
        ),
    )
    [["clade", "nt_mutation", "mutant_codon", "clade_founder_aa", "mutant_aa", "aa_mutation"]]
    .groupby(["clade", "nt_mutation"], as_index=False)
    .aggregate(lambda s: ";".join(s))
    # compares the ";"-joined strings, so a mutation is synonymous only when
    # the joined founder and mutant amino-acid strings are identical
    .assign(synonymous=lambda x: x["clade_founder_aa"] == x["mutant_aa"])
)

Now add in amino acid mutations and clean up data frame:

In [None]:
# clean up data frame to just have columns of interest
output_columns = [
    "clade",
    "subset",
    "nt_site",
    "nt_mutation",
    "exclude",
    "masked_in_usher",
    "expected_count",
    "actual_count",
    "clade_founder_nt",
    "gene",
    "clade_founder_codon",
    "clade_founder_aa",
    "mutant_codon",
    "mutant_aa",
    "aa_mutation",
    "synonymous",
    "codon_position",
    "codon_site",
    "four_fold_degenerate",
    "count_terminal",
    "count_non_terminal",
    "mean_log_size",
]

# attach the amino-acid annotations, then select, order and re-index the rows
with_aa_muts = merged.merge(add_aa_muts, validate="many_to_one", how="outer")
merged = (
    with_aa_muts[output_columns]
    .sort_values(["clade", "subset", "nt_site", "nt_mutation"])
    .reset_index(drop=True)
)

# every retained column must be fully populated before the table is written
assert merged.notnull().all().all()

merged.to_csv(snakemake.output.csv, index=False, float_format="%.5g")

Just a quick look at correlation between actual and expected counts:

In [None]:
# Pairwise correlations among the count columns, computed separately within
# each (exclude, synonymous) group.
group_columns = ["exclude", "synonymous"]
count_columns = [
    "actual_count",
    "expected_count",
    "count_terminal",
    "count_non_terminal",
]
merged[group_columns + count_columns].groupby(group_columns).corr()