# Analyze fitness effects of mutations fixed in each clade

Import Python modules:

In [1]:
import altair as alt

import Bio.Seq

import pandas as pd

import yaml

# Turn off Altair's maximum-rows limit on chart data. The return value is bound
# to `_` (presumably so the notebook cell does not echo it as output).
_ = alt.data_transformers.disable_max_rows()

Now get variables from `snakemake`:

In [2]:
if "snakemake" in globals() or "snakemake" in locals():
    # running inside the pipeline: inputs and parameters come from `snakemake`
    aa_fitness_csv = snakemake.input.aafitness
    aamut_by_clade_csv = snakemake.input.aamut_by_clade
    clade_founder_nts_csv = snakemake.input.clade_founder_nts_csv
    min_expected_count = snakemake.params.min_expected_count
    ref = snakemake.params.ref
else:
    # interactive debugging: hard-coded result paths, parameters read straight
    # from the pipeline configuration file
    aa_fitness_csv = "../results/aa_fitness/aa_fitness.csv"
    aamut_by_clade_csv = "../results/aa_fitness/aamut_fitness_by_clade.csv"
    clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"

    with open("../config.yaml") as f:
        config = yaml.safe_load(f)

    min_expected_count = config["min_expected_count"]
    ref = config["clade_fixed_muts_ref"]

Read the input data:

In [3]:
# Amino-acid fitness estimates; `aa_site` is renamed to `site` so the column
# name matches the one used for the clade-founder tables.
aa_fitness = pd.read_csv(aa_fitness_csv)
aa_fitness = aa_fitness.rename(columns={"aa_site": "site"})

# Per-clade amino-acid mutation table.
aamut_by_clade = pd.read_csv(aamut_by_clade_csv)

# Nucleotide / codon identities of each clade founder.
clade_founder_nts = pd.read_csv(clade_founder_nts_csv)

First we get the amino acid at each site of each gene for every clade founder, and pair it with the codon and amino acid at the same site in the reference clade (`ref`).
We drop ORF1a and keep only ORF1ab:

In [4]:
# Lookup table from each of the 64 ACGT codons to its translated amino acid
# (one-letter string as returned by Biopython's `translate`).
codon_to_aa = {
    codon: str(Bio.Seq.Seq(codon).translate())
    for codon in (
        first + second + third
        for first in "ACGT"
        for second in "ACGT"
        for third in "ACGT"
    )
}

# The reference clade must be one of the clade founders, because every other
# founder is compared against it. Raise explicitly instead of using `assert`:
# assertions are skipped when Python runs with `-O` and carry no message.
if ref not in clade_founder_nts["clade"].unique():
    raise ValueError(
        f"reference clade {ref!r} is not among the clades in `clade_founder_nts`"
    )

# One row per (clade, gene, codon site) holding the founder codon and its
# translation. The gene / codon / codon_site columns are treated as parallel
# ";"-separated lists, so they are split and exploded together to stay aligned.
clade_founder_aas = (
    clade_founder_nts.loc[:, ["clade", "gene", "codon", "codon_site"]]
    .drop_duplicates()
    .assign(
        gene=lambda df: df["gene"].str.split(";"),
        codon=lambda df: df["codon"].str.split(";"),
        site=lambda df: df["codon_site"].str.split(";"),
    )
    .explode(["gene", "codon", "site"])
    # ORF1a rows are discarded; only ORF1ab is analyzed
    .loc[lambda df: df["gene"] != "ORF1a"]
    .assign(
        # codons absent from `codon_to_aa` translate to NaN
        amino_acid=lambda df: df["codon"].map(codon_to_aa),
        site=lambda df: df["site"].astype(int),
    )
    .loc[:, ["clade", "gene", "site", "codon", "amino_acid"]]
    .drop_duplicates()
)

# Attach to every founder row the codon and amino acid found at the same
# (gene, site) in the reference clade `ref`.
clade_founder_aas = pd.merge(
    clade_founder_aas,
    clade_founder_aas.loc[
        clade_founder_aas["clade"] == ref,
        ["gene", "site", "codon", "amino_acid"],
    ].rename(columns={"codon": "ref_codon", "amino_acid": "ref_amino_acid"}),
    on=["gene", "site"],
    # the reference rows must have at most one entry per (gene, site)
    validate="many_to_one",
)

Now keep just the sites where a clade founder's amino acid differs from the reference clade (the amino-acid mutations in each clade founder), and count these mutations per clade:

In [5]:
# Keep only the founder sites whose amino acid differs from the reference clade's.
clade_founder_muts = clade_founder_aas[
    clade_founder_aas["amino_acid"] != clade_founder_aas["ref_amino_acid"]
]

# Number of such amino-acid differences per clade (shown as the cell output).
clade_founder_muts.groupby("clade").agg(n_amino_acid_muts=("site", "count"))

Unnamed: 0_level_0,n_amino_acid_muts
clade,Unnamed: 1_level_1
20A,2
20B,4
20C,4
20E,5
20F,6
20G,11
20I,18
20J,21
21C,9
21F,11


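Get the estimated fitness effect of each mutation from the overall fitness estimates, computed as the fitness of the clade founder's amino acid minus the fitness of the reference amino acid at that site: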

In [6]:
# Fitness effect of each clade-founder mutation, from the overall (all-clade)
# amino-acid fitness estimates. `aa_fitness` is joined twice: once for the
# founder's amino acid and once for the reference amino acid. Left joins keep
# mutations that lack an estimate (their fitness columns are NaN).
fitness_effects = (
    clade_founder_muts
    .pipe(
        pd.merge,
        aa_fitness[["gene", "site", "aa", "fitness"]].rename(
            columns={"aa": "amino_acid", "fitness": "amino_acid_fitness"},
        ),
        on=["site", "gene", "amino_acid"],
        how="left",
        validate="many_to_one",
    )
    .pipe(
        pd.merge,
        aa_fitness[["gene", "site", "aa", "fitness"]].rename(
            columns={"aa": "ref_amino_acid", "fitness": "ref_amino_acid_fitness"},
        ),
        on=["site", "gene", "ref_amino_acid"],
        how="left",
        validate="many_to_one",
    )
    .assign(
        # effect = fitness of the founder amino acid minus that of the reference
        fitness_effect=lambda df: df["amino_acid_fitness"].sub(
            df["ref_amino_acid_fitness"]
        )
    )
)

fitness_effects

Unnamed: 0,clade,gene,site,codon,amino_acid,ref_codon,ref_amino_acid,amino_acid_fitness,ref_amino_acid_fitness,fitness_effect
0,21L,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,0.44707
1,22A,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,0.44707
2,22B,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,0.44707
3,22C,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,0.44707
4,22E,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,0.44707
...,...,...,...,...,...,...,...,...,...,...
427,22A,N,413,CGT,R,AGT,S,0.92395,0.0,0.92395
428,22B,N,413,CGT,R,AGT,S,0.92395,0.0,0.92395
429,22C,N,413,CGT,R,AGT,S,0.92395,0.0,0.92395
430,22E,N,413,CGT,R,AGT,S,0.92395,0.0,0.92395


In [7]:
# Display the per-clade amino-acid mutation table read from `aamut_by_clade_csv`
# (a bare expression at the end of the cell, so the notebook renders it).
aamut_by_clade

Unnamed: 0,clade,gene,clade_founder_aa,mutant_aa,aa_site,aa_mutation,expected_count,actual_count,subset_of_ORF1ab,delta_fitness
0,20A,E,A,A,22,A22A,1.64940,2,False,0.15112
1,20A,E,A,A,32,A32A,13.04700,17,False,0.25607
2,20A,E,A,A,36,A36A,13.21700,1,False,-2.21320
3,20A,E,A,A,41,A41A,13.21700,2,False,-1.70240
4,20A,E,A,D,22,A22D,0.58420,0,False,-0.77399
...,...,...,...,...,...,...,...,...,...,...
1678588,22C,nsp1,Y,F,97,Y97F,0.42753,0,True,-0.61792
1678589,22C,nsp1,Y,H,97,Y97H,1.64450,2,True,0.15338
1678590,22C,nsp1,Y,N,97,Y97N,0.33318,0,True,-0.51064
1678591,22C,nsp1,Y,S,97,Y97S,0.15635,0,True,-0.27209
