# Calculate amino-acid fitness effects of mutations from expected versus actual counts

Get values from `snakemake`:

In [None]:
# Path of the CSV with expected and actual counts; read with `pd.read_csv` below.
expected_vs_actual_counts_csv = snakemake.input.csv

# Pseudocount added to both the actual and the expected counts before
# taking the log ratio that defines the fitness effect.
fitness_pseudocount = snakemake.params.fitness_pseudocount

# Mapping of nsp name -> (start, end) sites in ORF1ab numbering, both inclusive.
orf1ab_to_nsps = snakemake.params.orf1ab_to_nsps

Import Python modules:

In [None]:
import numpy

import pandas as pd

Read data, then:
 - Ignore any codons where any constituent nucleotides are masked in `UShER`
 - Ignore excluded mutations
 - Exclude mutations in overlapping reading frames
 - Explode data frame to make a separate line for each gene.
 - Drop ORF1a, the reason being that after we exclude overlapping reading frame sites there aren't any ORF1a sites not also in ORF1ab.
 - Aggregate all expected and actual counts for the same amino acid change for each clade / subset / amino-acid mutation

In [None]:
def _read_usable_counts(csv_path):
    """Read the counts CSV and drop rows that should not be used.

    A row is removed when any row for the same gene / clade / codon site is
    flagged `masked_in_usher`, when `exclude` is set, or when the
    `;`-separated `codon_position` entries are not all the same value (these
    are treated as sites in overlapping reading frames).
    """
    counts = pd.read_csv(csv_path, low_memory=False)

    # Propagate the masked flag to every row of the same codon.
    codon_masked = (
        counts
        .groupby(["gene", "clade", "codon_site"])["masked_in_usher"]
        .transform("any")
    )
    counts = (
        counts
        .assign(masked_in_usher=codon_masked)
        .query("not masked_in_usher")
        .query("not exclude")
    )

    # More than one distinct codon position means the nucleotide sits at
    # different positions in the codons of the genes it belongs to.
    in_multiple_frames = (
        counts["codon_position"]
        .str.split(";")
        .map(lambda positions: len(set(positions)) > 1)
    )
    return (
        counts
        .assign(is_overlapping=in_multiple_frames)
        .query("not is_overlapping")
    )


def _sum_counts_per_aa_mutation(counts, split_cols):
    """Give each gene its own row, drop ORF1a, and sum counts per amino-acid mutation.

    `split_cols` are the `;`-separated columns that are exploded together.
    Counts are summed within each clade / subset / `split_cols` group, and
    `mean_log_size` is recomputed as the `actual_count`-weighted mean
    (set to 0 where the summed `actual_count` is 0).
    """
    per_gene = (
        counts
        .assign(**{name: counts[name].str.split(";") for name in split_cols})
        .explode(split_cols)
        .loc[lambda frame: frame["gene"] != "ORF1a"]
    )
    per_gene = per_gene.assign(
        weighted_mean_log_size=per_gene["actual_count"] * per_gene["mean_log_size"],
    )

    summed_cols = [
        "expected_count",
        "actual_count",
        "count_terminal",
        "count_non_terminal",
        "weighted_mean_log_size",
    ]
    totals = (
        per_gene
        .groupby(["clade", "subset", *split_cols], as_index=False)
        .aggregate(**{name: (name, "sum") for name in summed_cols})
        .rename(columns={"codon_site": "aa_site"})
    )
    totals["aa_site"] = totals["aa_site"].astype(int)
    totals["mean_log_size"] = numpy.where(
        totals["actual_count"] > 0,
        totals["weighted_mean_log_size"] / totals["actual_count"],
        0,
    )
    return totals.drop(columns="weighted_mean_log_size")


explode_cols = ["gene", "clade_founder_aa", "mutant_aa", "codon_site", "aa_mutation"]

expected_vs_actual = _sum_counts_per_aa_mutation(
    _read_usable_counts(expected_vs_actual_counts_csv),
    explode_cols,
)

# Sanity check: each mutation string must start with the clade founder amino acid.
_first_letter_of_mutation = expected_vs_actual["aa_mutation"].str[0]
assert (expected_vs_actual["clade_founder_aa"] == _first_letter_of_mutation).all()

expected_vs_actual

In addition to the entries for full ORF1ab, we also want to have mutations numbered by the nsp naming.

First, make a data frame that converts the numbering:

In [None]:
def _nsp_site_table(nsp, start, end):
    """Return the ORF1ab-site to nsp-site numbering for a single nsp.

    `start` and `end` are the first and last ORF1ab sites of the nsp (both
    inclusive); nsp numbering starts at 1 at `start`.
    """
    site_pairs = [
        (orf1ab_site, nsp_site)
        for nsp_site, orf1ab_site in enumerate(range(start, end + 1), start=1)
    ]
    numbering = pd.DataFrame(site_pairs, columns=["ORF1ab_site", "nsp_site"])
    numbering["nsp"] = nsp
    return numbering.drop_duplicates()


# Stack the per-nsp tables into one lookup table with a fresh index.
orf1ab_to_nsps_df = pd.concat(
    [
        _nsp_site_table(nsp, start, end)
        for nsp, (start, end) in orf1ab_to_nsps.items()
    ],
    ignore_index=True,
)

orf1ab_to_nsps_df

Now we add the nsp-numbered entries to our data frame, which so far has only the ORF1ab numbering.
Note that this means mutations in ORF1ab that fall within an nsp show up **twice** in the data frame under different names, so we add a column to indicate which genes are a subset of ORF1ab:

In [None]:
# Sanity check: the mutation string must be founder amino acid + site + mutant
# amino acid, because the nsp mutation names below are rebuilt the same way.
_rebuilt_aa_mutation = (
    expected_vs_actual["clade_founder_aa"]
    + expected_vs_actual["aa_site"].astype(str)
    + expected_vs_actual["mutant_aa"]
)
assert (expected_vs_actual["aa_mutation"] == _rebuilt_aa_mutation).all()

# Re-express the ORF1ab rows in nsp numbering: look up the nsp and nsp site of
# each ORF1ab site, then let those replace the gene and site columns.
_nsp_counts = (
    expected_vs_actual
    .loc[lambda frame: frame["gene"] == "ORF1ab"]
    .merge(
        orf1ab_to_nsps_df,
        left_on="aa_site",
        right_on="ORF1ab_site",
        validate="many_to_one",
    )
    .drop(columns=["gene", "aa_mutation", "aa_site", "ORF1ab_site"])
    .rename(columns={"nsp": "gene", "nsp_site": "aa_site"})
)
_nsp_counts["aa_mutation"] = (
    _nsp_counts["clade_founder_aa"]
    + _nsp_counts["aa_site"].astype(str)
    + _nsp_counts["mutant_aa"]
)
_nsp_counts["subset_of_ORF1ab"] = True

# The original rows come first, followed by the nsp-numbered copies.
aa_counts_df = pd.concat(
    [expected_vs_actual.assign(subset_of_ORF1ab=False), _nsp_counts],
    ignore_index=True,
)

aa_counts_df

Now for each subset, compute fitness as the log of actual / expected counts (after adding a pseudocount to each) and write to a file:

In [None]:
def _pool_over_clades(counts):
    """Sum the counts over clades for each remaining combination of columns.

    Every column other than `clade`, the count columns, and the mean-log-size
    columns is used as a grouping key, in the order it appears in `counts`.
    `mean_log_size` is recomputed as the `actual_count`-weighted mean (set to
    0 where the pooled `actual_count` is 0).
    """
    count_cols = [
        "expected_count",
        "actual_count",
        "count_terminal",
        "count_non_terminal",
    ]
    non_key_cols = {"clade", *count_cols, "mean_log_size", "weighted_mean_log_size"}
    key_cols = [name for name in counts.columns if name not in non_key_cols]

    pooled = (
        counts
        .assign(weighted_mean_log_size=counts["mean_log_size"] * counts["actual_count"])
        .drop(columns="mean_log_size")
        .groupby(key_cols, as_index=False)
        .aggregate({name: "sum" for name in [*count_cols, "weighted_mean_log_size"]})
    )
    # NOTE(review): `weighted_mean_log_size` is not dropped here, so it is
    # written to the pooled output files but not to the by-clade one --
    # confirm this is intended.
    pooled["mean_log_size"] = numpy.where(
        pooled["actual_count"] > 0,
        pooled["weighted_mean_log_size"] / pooled["actual_count"],
        0,
    )
    return pooled


# (keep clades separate?, keep subsets separate?, output CSV)
for by_clade, by_subset, outfile in [
    (False, False, snakemake.output.aamut_all),
    (True, False, snakemake.output.aamut_by_clade),
    (False, True, snakemake.output.aamut_by_subset),
]:
    df = aa_counts_df

    if not by_clade:
        assert "all" not in df["clade"].unique()
        df = _pool_over_clades(df)

    if not by_subset:
        # Keep only the rows for the 'all' subset; the column is then constant.
        df = df.loc[lambda frame: frame["subset"] == "all"].drop(columns="subset")

    # Fitness effect: natural log of actual over expected counts, each
    # offset by the pseudocount.
    count_ratio = (
        (df["actual_count"] + fitness_pseudocount)
        / (df["expected_count"] + fitness_pseudocount)
    )
    df = df.assign(delta_fitness=numpy.log(count_ratio))

    df.to_csv(outfile, index=False, float_format="%.5g")