# Analyze fitness effects of mutations fixed in each clade

Import Python modules:

In [16]:
import altair as alt

import Bio.Seq

import pandas as pd

import yaml

# Lift Altair's default cap on the number of rows allowed in a chart's dataset.
# The return value is bound to `_` so it is not displayed as the cell's output.
_ = alt.data_transformers.disable_max_rows()

Now get variables from `snakemake`:

In [2]:
if "snakemake" in globals() or "snakemake" in locals():
    # Running inside the pipeline: every input path and parameter is
    # supplied by the injected `snakemake` object.
    aa_fitness_csv = snakemake.input.aafitness
    aamut_by_clade_csv = snakemake.input.aamut_by_clade
    clade_founder_nts_csv = snakemake.input.clade_founder_nts_csv
    min_expected_count = snakemake.params.min_expected_count
    ref = snakemake.params.ref
else:
    # Interactive debugging: no `snakemake` object exists, so use the
    # results paths relative to this notebook and read the parameters
    # directly from the pipeline configuration file.
    aa_fitness_csv = "../results/aa_fitness/aa_fitness.csv"
    aamut_by_clade_csv = "../results/aa_fitness/aamut_fitness_by_clade.csv"
    clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"

    with open("../config.yaml") as f:
        config = yaml.safe_load(f)

    min_expected_count = config["min_expected_count"]
    ref = config["clade_fixed_muts_ref"]

Read the input data:

In [3]:
# Load the three input tables, in order: amino-acid fitness estimates,
# per-clade amino-acid mutation fitness, and clade-founder nucleotides.
aa_fitness, aamut_by_clade, clade_founder_nts = (
    pd.read_csv(csv_path)
    for csv_path in (aa_fitness_csv, aamut_by_clade_csv, clade_founder_nts_csv)
)

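First we get the amino acid at each site in each gene for every clade founder, alongside the amino acid at that site in the reference clade.
We ignore ORF1a and just look at ORF1ab, since ORF1a sites are also annotated as part of ORF1ab and would otherwise be counted twice: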

In [30]:
# Translation table for every codon made up only of A, C, G and T.
codon_to_aa = {
    codon: str(Bio.Seq.Seq(codon).translate())
    for codon in (
        first + second + third
        for first in "ACGT"
        for second in "ACGT"
        for third in "ACGT"
    )
}

# The reference clade has to be one of the clade founders.
assert ref in clade_founder_nts["clade"].unique()

# Build one row per (clade, gene, codon site). The gene / codon / codon_site
# entries are ";"-delimited, so split them into lists and explode the lists
# in parallel before translating each codon.
clade_founder_aas = (
    clade_founder_nts
    .loc[:, ["clade", "gene", "codon", "codon_site"]]
    .drop_duplicates()
    .pipe(
        lambda df: df.assign(
            **{
                new_col: df[source_col].str.split(";")
                for new_col, source_col in [
                    ("gene", "gene"),
                    ("codon", "codon"),
                    ("site", "codon_site"),
                ]
            }
        )
    )
    .explode(["gene", "codon", "site"])
    .query("gene != 'ORF1a'")
    .assign(amino_acid=lambda df: df["codon"].map(codon_to_aa))
    .loc[:, ["clade", "gene", "site", "codon", "amino_acid"]]
    .drop_duplicates()
)

# Attach the reference clade's codon and amino acid at each (gene, site);
# the validation enforces a single reference row per (gene, site).
clade_founder_aas = pd.merge(
    clade_founder_aas,
    clade_founder_aas
    .query("clade == @ref")
    .loc[:, ["gene", "site", "codon", "amino_acid"]]
    .rename(columns={"codon": "ref_codon", "amino_acid": "ref_amino_acid"}),
    on=["gene", "site"],
    validate="many_to_one",
)

Now get just the amino acid mutations in each clade founder, and summarize the number of such mutations:

In [34]:
# An amino-acid mutation requires a translated amino acid in BOTH the clade
# founder and the reference. Codons missing from `codon_to_aa` (absent or
# containing characters other than A/C/G/T) are mapped to NaN, and
# `NaN != x` is always True, so a bare inequality would wrongly report such
# sites as mutations.
has_both_amino_acids = (
    clade_founder_aas[["amino_acid", "ref_amino_acid"]].notna().all(axis=1)
)
differs_from_ref = (
    clade_founder_aas["amino_acid"] != clade_founder_aas["ref_amino_acid"]
)
clade_founder_muts = clade_founder_aas[has_both_amino_acids & differs_from_ref]

# Number of amino-acid mutations relative to the reference, per clade.
(
    clade_founder_muts
    .groupby("clade")
    .aggregate(n_amino_acid_muts=pd.NamedAgg("site", "count"))
)

Unnamed: 0_level_0,n_amino_acid_muts
clade,Unnamed: 1_level_1
20A,2
20B,4
20C,4
20E,5
20F,6
20G,11
20I,18
20J,21
21C,9
21F,11


In [35]:
# NOTE(review): ad-hoc scratch calculation; what 55 and 7000 represent is not
# stated anywhere in the visible notebook. Confirm the intent, then either
# name these values and explain the ratio in markdown, or remove this cell.
55 / 7000

0.007857142857142858