# Analyze fitness effects of mutations fixed in each clade

Import Python modules:

In [1]:
import altair as alt

import Bio.Seq

import numpy

import pandas as pd

import yaml

# Turn off Altair's cap on the number of rows a chart's data set may have
# (see the Altair documentation on `MaxRowsError`); the return value is
# bound to `_` because it is not needed.
_ = alt.data_transformers.disable_max_rows()

Now get variables from `snakemake`:

In [2]:
# Resolve the notebook's inputs.  If a `snakemake` object is in scope (the
# notebook is being run by the pipeline) every value is read from it;
# otherwise fall back to hard-coded relative paths and `../config.yaml`.
if "snakemake" in globals() or "snakemake" in locals():
    # get variables from `snakemake` when running pipeline
    aa_fitness_csv = snakemake.input.aafitness
    aamut_by_clade_csv = snakemake.input.aamut_by_clade
    clade_founder_nts_csv = snakemake.input.clade_founder_nts_csv
    min_expected_count = snakemake.params.min_expected_count
    ref = snakemake.params.ref

else:
    # variables set manually for interactive debugging
    aa_fitness_csv = "../results/aa_fitness/aa_fitness.csv"
    aamut_by_clade_csv = "../results/aa_fitness/aamut_fitness_by_clade.csv"
    clade_founder_nts_csv = "../results/clade_founder_nts/clade_founder_nts.csv"

    # the two parameters come from the pipeline configuration file
    with open("../config.yaml") as f:
        config = yaml.safe_load(f)
    min_expected_count = config["min_expected_count"]
    ref = config["clade_fixed_muts_ref"]

Read the input data:

In [3]:
# Both fitness tables name their site column `aa_site`; rename it to `site`
# on load so it matches the column name used for merges further down.
aa_fitness, aamut_by_clade = (
    pd.read_csv(csv_path).rename(columns={"aa_site": "site"})
    for csv_path in (aa_fitness_csv, aamut_by_clade_csv)
)

# Clade-founder table is read as-is.
clade_founder_nts = pd.read_csv(clade_founder_nts_csv)

We only consider clades that had sufficient counts to estimate the effects of mutations:

In [4]:
# The clades to analyze are exactly those present in the per-clade mutation
# table, kept in order of first appearance.
clades = aamut_by_clade["clade"].drop_duplicates().tolist()

print(f"Analyzing {clades=}")

Analyzing clades=['20A', '20B', '20C', '20E', '20G', '20I', '21C', '21I', '21J', '21K', '21L', '22A', '22B', '22C']


First we get the amino acids at each position in each gene in each clade founder and the reference clade.
We ignore ORF1a and just look at ORF1ab:

In [5]:
# Lookup table from each of the 64 codons over the alphabet A/C/G/T to the
# string Biopython's `translate` returns for it.  Codons containing any other
# character are deliberately absent from the table.
codon_to_aa = {
    codon: str(Bio.Seq.Seq(codon).translate())
    for codon in (
        first + second + third
        for first in "ACGT"
        for second in "ACGT"
        for third in "ACGT"
    )
}

# The reference clade must be among the clade founders, otherwise there is
# nothing to compare the other founders against.
assert (clade_founder_nts["clade"] == ref).any()

# One row per (clade, gene, codon site) giving the founder codon and the amino
# acid it encodes.
clade_founder_aas = (
    clade_founder_nts.loc[:, ["clade", "gene", "codon", "codon_site"]]
    .drop_duplicates()
    # `gene`, `codon` and `codon_site` hold ";"-separated parallel lists
    # (presumably one entry per gene overlapping the nucleotide site -- TODO
    # confirm); split them so they can be exploded into one row per entry
    .assign(
        **{
            new_col: (lambda df, old_col=old_col: df[old_col].str.split(";"))
            for new_col, old_col in [
                ("gene", "gene"),
                ("codon", "codon"),
                ("site", "codon_site"),
            ]
        }
    )
    .explode(["gene", "codon", "site"])
    # ORF1a is ignored; only ORF1ab is kept
    .loc[lambda df: df["gene"] != "ORF1a"]
    .assign(
        # codons missing from `codon_to_aa` become NaN here
        amino_acid=lambda df: df["codon"].map(codon_to_aa),
        site=lambda df: df["site"].astype(int),
    )
    .loc[:, ["clade", "gene", "site", "codon", "amino_acid"]]
    .drop_duplicates()
)

# Codon and amino acid at every (gene, site) in the reference clade founder,
# with the columns prefixed `ref_` so they can sit next to each clade's own.
ref_aas = (
    clade_founder_aas
    .loc[lambda df: df["clade"] == ref]
    .loc[:, ["gene", "site", "codon", "amino_acid"]]
    .rename(columns={"codon": "ref_codon", "amino_acid": "ref_amino_acid"})
)

# Keep only the clades being analyzed and attach the reference identity at
# each site; `many_to_one` checks the reference has one row per (gene, site).
clade_founder_aas = (
    clade_founder_aas
    .loc[lambda df: df["clade"].isin(clades)]
    .merge(ref_aas, how="inner", on=["gene", "site"], validate="many_to_one")
)

Now get just the amino acid mutations in each clade founder, and summarize the number of such mutations:

In [6]:
# A clade founder carries a mutation wherever its amino acid differs from the
# reference clade's amino acid at the same (gene, site).
clade_founder_muts = clade_founder_aas.loc[
    lambda df: df["amino_acid"] != df["ref_amino_acid"]
]

# Number of such amino-acid mutations in each clade founder (displayed).
clade_founder_muts.groupby("clade").agg(n_amino_acid_muts=("site", "count"))

Unnamed: 0_level_0,n_amino_acid_muts
clade,Unnamed: 1_level_1
20A,2
20B,4
20C,4
20E,5
20G,11
20I,18
21C,9
21I,22
21J,26
21K,42


Get the estimated fitness effect of each mutation from the overall fitness estimates:

In [7]:
# Fitness effect of every clade-founder mutation according to the overall
# amino-acid fitness estimates: fitness of the founder amino acid minus
# fitness of the reference amino acid at the same (gene, site).
fitness_effects_tidy = (
    clade_founder_muts
    # look up the fitness of the amino acid present in the clade founder
    .merge(
        aa_fitness[["gene", "site", "aa", "fitness", "expected_count"]].rename(
            columns=dict(aa="amino_acid", fitness="amino_acid_fitness"),
        ),
        how="left",
        on=["site", "gene", "amino_acid"],
        validate="many_to_one",
    )
    # look up the fitness of the reference amino acid; both sides now have an
    # `expected_count` column, so pandas suffixes them `_x` and `_y`
    .merge(
        aa_fitness[["gene", "site", "aa", "fitness", "expected_count"]].rename(
            columns=dict(aa="ref_amino_acid", fitness="ref_amino_acid_fitness"),
        ),
        how="left",
        on=["site", "gene", "ref_amino_acid"],
        validate="many_to_one",
    )
    .assign(
        mutation=lambda df: (
            df["ref_amino_acid"] + df["site"].astype(str) + df["amino_acid"]
        ),
        fitness_effect=lambda df: (
            df["amino_acid_fitness"].sub(df["ref_amino_acid_fitness"])
        ),
        # report the smaller of the two expected counts behind the estimate
        expected_count=lambda df: numpy.minimum(
            df["expected_count_x"], df["expected_count_y"]
        ),
    )
    .drop(columns=["expected_count_x", "expected_count_y"])
)

fitness_effects_tidy

Unnamed: 0,clade,gene,site,codon,amino_acid,ref_codon,ref_amino_acid,amino_acid_fitness,ref_amino_acid_fitness,mutation,fitness_effect,expected_count
0,21L,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,S135R,0.44707,96.22
1,22A,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,S135R,0.44707,96.22
2,22B,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,S135R,0.44707,96.22
3,22C,ORF1ab,135,AGG,R,AGT,S,0.44707,0.0,S135R,0.44707,96.22
4,20C,ORF1ab,265,ATC,I,ACC,T,0.81922,0.0,T265I,0.81922,496.45
...,...,...,...,...,...,...,...,...,...,...,...,...
334,21L,N,413,CGT,R,AGT,S,0.92395,0.0,S413R,0.92395,167.67
335,22A,N,413,CGT,R,AGT,S,0.92395,0.0,S413R,0.92395,167.67
336,22B,N,413,CGT,R,AGT,S,0.92395,0.0,S413R,0.92395,167.67
337,22C,N,413,CGT,R,AGT,S,0.92395,0.0,S413R,0.92395,167.67


Now collapse this information into a wide-form data frame with one row per mutation, listing the clades whose founders carry the mutation, both on their own and alongside each clade founder's codon:

In [10]:
# Collapse the tidy table to one row per mutation, recording how many and
# which clade founders carry it (with and without the founder codon).
fitness_effects = (
    fitness_effects_tidy
    .assign(clade_codon=lambda df: df["clade"] + " (" + df["codon"] + ")")
    .groupby(
        [
            "gene",
            "site",
            "mutation",
            "ref_codon",
            "ref_amino_acid",
            "fitness_effect",
            "expected_count",
        ],
        as_index=False,
        # keep mutations whose fitness effect / expected count is missing
        dropna=False,
    )
    .agg(
        n_clades_with_mutation=("clade", "count"),
        clades_with_mutation=("clade", lambda names: "; ".join(names)),
        clade_codons_with_mutation=(
            "clade_codon",
            lambda labels: "; ".join(labels),
        ),
    )
)

# every (gene, mutation) pair should now appear exactly once
assert len(fitness_effects.groupby(["gene", "mutation"])) == len(fitness_effects)

fitness_effects

Unnamed: 0,gene,site,mutation,ref_codon,ref_amino_acid,fitness_effect,expected_count,n_clades_with_mutation,clades_with_mutation,clade_codons_with_mutation
0,E,9,T9I,ACA,T,1.03620,369.730,5,21K; 21L; 22A; 22B; 22C,21K (ATA); 21L (ATA); 22A (ATA); 22B (ATA); 22...
1,M,3,D3G,GAT,D,1.48940,134.820,1,21K,21K (GGT)
2,M,3,D3N,GAT,D,0.33533,102.120,1,22B,22B (AAT)
3,M,19,Q19E,CAA,Q,-1.00460,153.870,5,21K; 21L; 22A; 22B; 22C,21K (GAA); 21L (GAA); 22A (GAA); 22B (GAA); 22...
4,M,63,A63T,GCT,A,-1.09570,268.520,5,21K; 21L; 22A; 22B; 22C,21K (ACT); 21L (ACT); 22A (ACT); 22B (ACT); 22...
...,...,...,...,...,...,...,...,...,...,...
114,S,954,Q954H,CAA,Q,0.29488,292.690,5,21K; 21L; 22A; 22B; 22C,21K (CAT); 21L (CAT); 22A (CAT); 22B (CAT); 22...
115,S,969,N969K,AAT,N,0.37826,88.056,5,21K; 21L; 22A; 22B; 22C,21K (AAA); 21L (AAA); 22A (AAA); 22B (AAA); 22...
116,S,981,L981F,CTT,L,-1.93490,455.390,1,21K,21K (TTT)
117,S,982,S982A,TCA,S,1.72860,67.676,1,20I,20I (GCA)


Get amino-acid mutation fitness estimates by clade, polarized so that each mutation is expressed as a change away from the reference amino-acid identity at the site, and keeping track of whether each estimate is for a forward or reversion mutation in that clade.

First in tidy form:

In [12]:
# Per-clade mutation fitness estimates, re-expressed relative to the reference
# amino acid.  A row is "forward" when the clade founder has the reference
# amino acid, and "reverse" when the mutant is the reference amino acid; for
# reverse rows the mutation label is flipped and the sign of the effect
# negated so every row reads as reference -> non-reference.
fitness_effects_by_clade_tidy = (
    aamut_by_clade
    # drop rows where the "mutant" is the clade founder amino acid itself
    .loc[lambda df: df["mutant_aa"] != df["clade_founder_aa"]]
    .merge(ref_aas, how="inner", on=["gene", "site"], validate="many_to_one")
    # keep only changes that start from or end at the reference amino acid
    .loc[
        lambda df: (df["clade_founder_aa"] == df["ref_amino_acid"])
        | (df["mutant_aa"] == df["ref_amino_acid"])
    ]
    .assign(
        mutation_polarity=lambda df: numpy.where(
            df["clade_founder_aa"] == df["ref_amino_acid"],
            "forward",
            "reverse",
        ),
        mutation=lambda df: numpy.where(
            df["mutation_polarity"] == "forward",
            df["clade_founder_aa"] + df["site"].astype(str) + df["mutant_aa"],
            df["mutant_aa"] + df["site"].astype(str) + df["clade_founder_aa"],
        ),
        fitness_effect=lambda df: numpy.where(
            df["mutation_polarity"] == "forward",
            df["delta_fitness"],
            -df["delta_fitness"],
        ),
    )
    .loc[
        :,
        [
            "clade",
            "gene",
            "site",
            "mutation",
            "fitness_effect",
            "expected_count",
            "mutation_polarity",
        ],
    ]
)

fitness_effects_by_clade_tidy

Unnamed: 0,clade,gene,site,mutation,fitness_effect,expected_count,mutation_polarity
0,20A,E,22,A22D,-0.77399,0.58420,forward
1,20A,E,22,A22G,-0.36931,0.22337,forward
2,20A,E,22,A22P,-0.68952,0.49638,forward
3,20A,E,22,A22S,-3.01370,9.68100,forward
4,20A,E,22,A22T,-0.85863,3.03990,forward
...,...,...,...,...,...,...,...
840553,22C,S,917,Y917D,-0.33554,0.19935,forward
840554,22C,S,917,Y917F,-0.61792,0.42753,forward
840555,22C,S,917,Y917H,-1.45610,1.64450,forward
840556,22C,S,917,Y917N,-0.51064,0.33318,forward


Now make a wide version with just the mutations that occurred in the clade founders, grouping together the forward and reverse clade estimates.
We calculate the **weighted average** (weighted by clade expected count) for each mutation polarity:

In [27]:
# For each clade-founder mutation and polarity, combine the per-clade
# estimates into an average weighted by each clade's expected count.
fitness_effects_by_clade = (
    fitness_effects_by_clade_tidy
    # inner merge acts as a filter: only mutations seen in a clade founder
    # survive
    .merge(
        fitness_effects[["gene", "site", "mutation"]],
        how="inner",
        on=["gene", "site", "mutation"],
    )
    .assign(
        # human-readable "clade (effect)" label, effect to two decimals
        clade_effect=lambda df: (
            df["clade"] + " (" + df["fitness_effect"].map("{:.2f}".format) + ")"
        ),
        # numerator of the weighted average
        weighted_effect=lambda df: df["fitness_effect"].mul(df["expected_count"]),
    )
    .groupby(["gene", "mutation", "mutation_polarity"], as_index=False)
    .agg(
        expected_count=("expected_count", "sum"),
        weighted_effect=("weighted_effect", "sum"),
        clade_effect=("clade_effect", lambda labels: "; ".join(labels)),
    )
    # sum(effect * count) / sum(count)
    .assign(
        fitness_effect=lambda df: df["weighted_effect"].div(df["expected_count"])
    )
    .drop(columns="weighted_effect")
)

fitness_effects_by_clade

Unnamed: 0,gene,mutation,mutation_polarity,expected_count,clade_effect,fitness_effect
0,E,T9I,forward,296.41490,20A (1.00); 20B (1.32); 20C (1.27); 20E (0.94)...,1.019092
1,M,A63T,forward,79.21500,20A (0.44); 20B (-0.10); 20C (-0.39); 20E (-1....,-1.378999
2,M,D3G,forward,49.21322,20A (2.01); 20B (2.09); 20C (1.01); 20E (0.07)...,1.325150
3,M,D3N,forward,102.11730,20A (-0.35); 20B (-0.10); 20C (-1.49); 20E (0....,-0.104354
4,M,I82T,forward,34.49289,20A (2.44); 20B (1.74); 20C (1.11); 20E (-0.90...,1.479248
...,...,...,...,...,...,...
122,S,T716I,forward,468.98270,20A (0.87); 20B (1.50); 20C (1.05); 20E (0.22)...,0.205669
123,S,T95I,forward,234.78070,20A (1.95); 20B (1.82); 20C (2.08); 20E (1.80)...,1.228520
124,S,V213G,forward,4.73354,20A (-0.31); 20B (0.82); 20C (-0.19); 20E (-0....,-0.114797
125,S,W152C,forward,330.71780,20A (-0.66); 20B (-0.70); 20C (0.38); 20E (-1....,-2.054985


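### Old stuff

Note that the next cell reassigns `fitness_effects_by_clade`, replacing the weighted-average table built above with a per-clade pivot table.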

In [13]:
# Wide table with one row per (gene, site, mutation) and, for every clade, a
# `fitness_effect_<clade>`, `expected_count_<clade>` and
# `mutation_polarity_<clade>` column.
fitness_effects_by_clade = (
    fitness_effects_by_clade_tidy
    .pivot(
        index=["gene", "site", "mutation"],
        columns="clade",
        values=["fitness_effect", "expected_count", "mutation_polarity"],
    )
    # flatten the (value, clade) column MultiIndex into "value_clade" names;
    # see https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
    .pipe(
        lambda wide: wide.set_axis(
            ["_".join(pair) for pair in wide.columns], axis=1
        )
    )
    .reset_index()
)

# every (gene, mutation) pair should appear in exactly one row
assert (
    len(fitness_effects_by_clade.groupby(["gene", "mutation"]))
    == len(fitness_effects_by_clade)
)

fitness_effects_by_clade

Unnamed: 0,gene,site,mutation,fitness_effect_20A,fitness_effect_20B,fitness_effect_20C,fitness_effect_20E,fitness_effect_20G,fitness_effect_20I,fitness_effect_21C,...,mutation_polarity_20G,mutation_polarity_20I,mutation_polarity_21C,mutation_polarity_21I,mutation_polarity_21J,mutation_polarity_21K,mutation_polarity_21L,mutation_polarity_22A,mutation_polarity_22B,mutation_polarity_22C
0,E,1,M1I,-3.3118,-3.1171,-2.7134,-2.6489,-3.0222,-4.4216,-2.2467,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
1,E,1,M1K,-0.42215,-0.34909,-0.27311,-0.28987,-0.37696,-1.083,-0.19017,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
2,E,1,M1L,-0.83326,-0.6457,-0.48813,-0.53621,-0.70729,-1.6055,-0.38299,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
3,E,1,M1R,-0.31181,-0.28223,-0.19009,-0.19696,-0.27321,-0.86945,-0.14951,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
4,E,1,M1T,-1.226,-1.0952,-0.83236,-0.90089,-1.0618,-2.2221,-0.63201,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60108,S,1272,Y1272D,-0.31181,-0.28223,-0.19009,-0.19696,-0.27321,-0.86945,-0.14951,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
60109,S,1272,Y1272F,-0.60746,-0.50397,-0.35716,-0.39648,-0.55973,-1.3429,-0.2889,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
60110,S,1272,Y1272H,-0.1274,-1.0952,-0.83236,0.19772,-1.0618,0.34288,-0.63201,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward
60111,S,1272,Y1272N,-0.42215,-0.34909,-0.27311,-0.28987,-0.37696,-1.083,-0.19017,...,forward,forward,forward,forward,forward,forward,forward,forward,forward,forward


Finally, create a data frame that has both the overall average and the per-clade estimates for the mutations:

In [14]:
# Attach the per-clade columns to the overall per-mutation table.
#
# The join keys are spelled out instead of letting pandas use every column
# the two frames happen to share.  `fitness_effects_by_clade` is assigned in
# more than one cell of this notebook, and one of those versions has
# `fitness_effect` / `expected_count` columns of its own; with implicit keys
# that version would silently be joined on floating-point values.  Naming the
# keys makes such a mix-up fail with a KeyError instead, and `validate`
# asserts that each mutation matches at most one per-clade row.
mut_effects = fitness_effects.merge(
    fitness_effects_by_clade,
    how="left",
    on=["gene", "site", "mutation"],
    validate="one_to_one",
)

mut_effects

Unnamed: 0,gene,site,mutation,ref_codon,ref_amino_acid,fitness_effect,expected_count,n_clades_with_mutation,clades_with_mutation,clade_codons_with_mutation,...,mutation_polarity_20G,mutation_polarity_20I,mutation_polarity_21C,mutation_polarity_21I,mutation_polarity_21J,mutation_polarity_21K,mutation_polarity_21L,mutation_polarity_22A,mutation_polarity_22B,mutation_polarity_22C
0,E,9,T9I,ACA,T,1.03620,369.730,5,21K; 21L; 22A; 22B; 22C,21K (ATA); 21L (ATA); 22A (ATA); 22B (ATA); 22...,...,forward,forward,forward,forward,forward,,,,,
1,M,3,D3G,GAT,D,1.48940,134.820,1,21K,21K (GGT),...,forward,forward,forward,forward,forward,,forward,forward,,forward
2,M,3,D3N,GAT,D,0.33533,102.120,1,22B,22B (AAT),...,forward,forward,forward,forward,forward,,forward,forward,,forward
3,M,19,Q19E,CAA,Q,-1.00460,153.870,5,21K; 21L; 22A; 22B; 22C,21K (GAA); 21L (GAA); 22A (GAA); 22B (GAA); 22...,...,forward,forward,forward,forward,forward,,,,,
4,M,63,A63T,GCT,A,-1.09570,268.520,5,21K; 21L; 22A; 22B; 22C,21K (ACT); 21L (ACT); 22A (ACT); 22B (ACT); 22...,...,forward,forward,forward,forward,forward,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,S,954,Q954H,CAA,Q,0.29488,292.690,5,21K; 21L; 22A; 22B; 22C,21K (CAT); 21L (CAT); 22A (CAT); 22B (CAT); 22...,...,forward,forward,forward,forward,forward,reverse,reverse,reverse,reverse,reverse
115,S,969,N969K,AAT,N,0.37826,88.056,5,21K; 21L; 22A; 22B; 22C,21K (AAA); 21L (AAA); 22A (AAA); 22B (AAA); 22...,...,forward,forward,forward,forward,forward,reverse,reverse,reverse,reverse,reverse
116,S,981,L981F,CTT,L,-1.93490,455.390,1,21K,21K (TTT),...,forward,forward,forward,forward,forward,reverse,forward,forward,forward,forward
117,S,982,S982A,TCA,S,1.72860,67.676,1,20I,20I (GCA),...,forward,,forward,forward,forward,forward,forward,forward,forward,forward


In [15]:
# Spot-check the rows for site 614 (in any gene) of the wide-form
# per-mutation table; the result is only displayed, not stored.
fitness_effects.query("site == 614")

Unnamed: 0,gene,site,mutation,ref_codon,ref_amino_acid,fitness_effect,expected_count,n_clades_with_mutation,clades_with_mutation,clade_codons_with_mutation
103,S,614,D614G,GAT,D,,,14,20A; 20B; 20C; 20E; 20G; 20I; 21C; 21I; 21J; 2...,20A (GGT); 20B (GGT); 20C (GGT); 20E (GGT); 20...
