# Analyze correlation of amino-acid fitnesses with dN/dS values
This notebook looks at how dN/dS values from FEL correlate with the amino-acid fitness values.

In [None]:
# Pull the workflow-supplied settings off the `snakemake` object.
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake's notebook integration -- confirm.

# input CSV files
aa_fitness_csv = snakemake.input.aa_fitness
dnds_csv = snakemake.input.dnds

# output HTML file
corr_html = snakemake.output.corr_html

# rows of the fitness table with `expected_count` below this value are dropped
min_expected_count = snakemake.params.min_expected_count

In [None]:
import altair as alt

import numpy

import pandas as pd

import scipy.stats.mstats

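Get dN/dS values by first dropping entries where both dN (`beta`) and dS (`alpha`) are zero, then averaging dN and dS over timeframes for each site, and finally clipping the resulting dN/dS ratios to the range 0.05 to 20 before taking the log: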

In [None]:
# Relabel some genes in the dN/dS table; any gene not listed here keeps its
# original label.
gene_map = {
    "3C": "nsp5 (Mpro)",
    "RdRp": "nsp12 (RdRp)",
    "endornase": "nsp15",
    "exonuclease": "nsp14",
    "helicase": "nsp13",
    "leader": "nsp1",
    "methyltransferase": "nsp16",
}

# Per-(gene, site) summary with columns: gene, site, dn, log_dnds.
dnds = (
    pd.read_csv(dnds_csv)
    # keep a row unless both alpha and beta are exactly zero
    .loc[lambda d: d[["alpha", "beta"]].ne(0).any(axis=1)]
    # average alpha (dS) and beta (dN) over all rows for each site
    .groupby(["gene", "site"], as_index=False)
    [["alpha", "beta"]]
    .mean()
    .assign(
        gene=lambda d: d["gene"].map(lambda name: gene_map.get(name, name)),
        # ratio of the averaged rates, clipped to [0.05, 20] before the log so
        # that zero or infinite ratios stay finite
        log_dnds=lambda d: numpy.log(
            (d["beta"] / d["alpha"]).clip(lower=0.05, upper=20)
        ),
    )
    .drop(columns="alpha")
    .rename(columns={"beta": "dn"})
)

dnds

Read amino-acid fitnesses, and for each site compute:
 - the mean fitness of mutations
 - the number of effective amino-acids when amino acids are assigned a probability weight of $e^{fitness}$

In [None]:
def _shannon_entropy(probs):
    """Return the Shannon entropy (natural log) of the probabilities in `probs`."""
    return -(probs * numpy.log(probs)).sum()


# Per-(gene, site) summary with columns: gene, site, mean_fitness, entropy,
# n_effective.
aa_fitness = (
    pd.read_csv(aa_fitness_csv)
    # drop the ORF1a / ORF1ab entries
    .loc[lambda d: ~d["gene"].isin(["ORF1a", "ORF1ab"])]
    # drop rows whose expected count is below the configured minimum
    .loc[lambda d: d["expected_count"] >= min_expected_count]
    .rename(columns={"aa_site": "site"})
    .assign(
        # unnormalized weight exp(fitness) for each amino acid ...
        weight=lambda d: numpy.exp(d["fitness"]),
        # ... normalized so the weights at each site sum to one
        p=lambda d: (
            d["weight"] / d.groupby(["gene", "site"])["weight"].transform("sum")
        ),
    )
    .groupby(["gene", "site"], as_index=False)
    .agg(
        mean_fitness=("fitness", "mean"),
        entropy=("p", _shannon_entropy),
    )
    # effective number of amino acids is exp(entropy)
    .assign(n_effective=lambda d: numpy.exp(d["entropy"]))
)

# every gene in the dN/dS table must also be present in the fitness table
assert set(dnds["gene"].unique()) <= set(aa_fitness["gene"].unique())

aa_fitness

Merge amino-acid fitness estimates aggregated at site level with dN/dS values:

In [None]:
# Join the per-site fitness summaries with the per-site dN/dS summaries.
# `merge` defaults to an inner join, so only sites present in both tables are
# kept; `validate` raises if a (gene, site) pair is duplicated on either side.
df = aa_fitness.merge(dnds, on=["gene", "site"], validate="one_to_one")

# Correlation matrix across all genes pooled together. Select the numeric
# columns explicitly: `df` still holds the string column `gene`, and from
# pandas 2.0 `DataFrame.corr()` raises on non-numeric columns instead of
# silently dropping them.
display(df.select_dtypes(include="number").corr())

# Per-gene correlation of `mean_fitness` with `dn` and `log_dnds`, sorted by
# the `log_dnds` correlation. After `reset_index()` the unnamed inner index
# level of the grouped correlation matrix becomes the column `level_1`; keeping
# only its `mean_fitness` row leaves one row per gene.
(
    df
    .groupby("gene")
    [["mean_fitness", "dn", "log_dnds"]]
    .corr()
    .reset_index()
    .query("level_1 == 'mean_fitness'")
    [["gene", "dn", "log_dnds"]]
    .sort_values("log_dnds")
)