# Get mutations in library strains relative to MRCA

In [1]:
import collections
import json

import Bio.Phylo
import Bio.SeqIO

import pandas as pd

First, build a tree from the Nextstrain JSON file:

In [2]:
# Load the Nextstrain JSON file. JSON text is UTF-8, so decode it explicitly
# rather than relying on the platform's default text encoding.
with open("h3n2_ha.json", encoding="utf-8") as f:
    data = json.load(f)

# Recursively convert a Nextstrain JSON node into a Bio.Phylo clade
def build_phylo_tree(node):
    """Return a `Bio.Phylo.BaseTree.Clade` for `node` and all of its descendants.

    `node` is a dict taken from the JSON tree. Its "name" entry (empty string
    when absent) becomes the clade name, its "branch_attrs" entry is attached
    unchanged as `clade.branch_attrs`, and every entry under "children" (when
    that key is present) is converted the same way and stored in `clade.clades`.

    Raises `KeyError` if `node` has no "branch_attrs" entry.
    """
    subtree = Bio.Phylo.BaseTree.Clade(name=node.get("name", ""))
    subtree.branch_attrs = node["branch_attrs"]
    # A node without a "children" key ends the recursion
    if "children" not in node:
        return subtree
    subtree.clades = list(map(build_phylo_tree, node["children"]))
    return subtree

# Build the phylo tree, starting from the root node stored under the JSON's "tree" key
tree = build_phylo_tree(data["tree"])

Rename the one problematic strain in the tree, which is wrongly named "A/SOUTHAFRICA/R07876/202023" rather than "A/SOUTHAFRICA/R07876/2023":

In [3]:
# Patch the mistyped year ("202023" -> "2023") on every clade carrying the bad name
for misnamed_clade in tree.find_clades(name="A/SOUTHAFRICA/R07876/202023"):
    misnamed_clade.name = "A/SOUTHAFRICA/R07876/2023"

Now we want to find the common ancestor of all of the library strains from 2022-2023 (these are the recent strains):

In [4]:
# Map each recent (2022-2023) library strain, as named in the titers file, to the
# name it is looked up under in the tree
recent_strains = (
    pd.read_csv("../H3_seqneut_titers.csv")
    .loc[:, ["virus"]]
    .drop_duplicates()
    # apply the same fix for the misnamed strain that was applied to the tree
    .replace({"virus": {"A/SOUTHAFRICA/R07876/202023": "A/SOUTHAFRICA/R07876/2023"}})
    # the year is the last "/"-separated field of the strain name
    .assign(year=lambda df: df["virus"].str.split("/").str[-1])
    # keep only names whose last field is exactly four digits
    .loc[lambda df: df["year"].str.fullmatch(r"\d{4}")]
    .assign(year=lambda df: df["year"].astype(int))
    .loc[lambda df: df["year"].between(2022, 2023)]
    # ignore Thailand/8/2022, an egg-based vaccine strain
    .loc[lambda df: df["virus"] != "A/Thailand/8/2022"]
    # tree names are taken to be the titer names with underscores stripped
    .assign(treename=lambda df: df["virus"].str.replace("_", ""))
    .set_index("virus")["treename"]
    .to_dict()
)

# Reverse lookup: tree name -> strain name as written in the titers file
recent_strains_inv = {val: key for (key, val) in recent_strains.items()}

# Stripping underscores could make two titer names collide on one tree name, in
# which case the reverse lookup would silently keep only one of them
assert len(recent_strains_inv) == len(recent_strains), "tree names are not unique"

print(f"Got {len(recent_strains)} recent strains with titers")

Got 62 recent strains with titers


Get the most recent common ancestor (MRCA) of the strains with titers:

In [5]:
# Names of every tip in the full tree
terminal_names = [leaf.name for leaf in tree.get_terminals()]

# Tree names of the recent strains whose common ancestor we want
get_mrca = list(recent_strains.values())

# Every requested strain must be a tip of the tree; on failure, show the missing names
assert set(get_mrca) <= set(terminal_names), set(get_mrca).difference(terminal_names)

# Clade at the most recent common ancestor of those strains
mrca_tree = tree.common_ancestor(get_mrca)
print(f"{mrca_tree.count_terminals()=}, {tree.count_terminals()=}")

mrca_tree.count_terminals()=1719, tree.count_terminals()=1733


Now get the mutations in each library strain relative to the MRCA:

In [6]:
def condense_muts(muts):
    """Collapse successive mutations at the same site into one net change.

    `muts` is an iterable of strings shaped `<from><site><to>` (for example
    "A24G"), given in the order they occurred. The result is a list ordered by
    site number. A site mutated once keeps its original string. A site mutated
    several times is reported as first `<from>` to last `<to>`, and is omitted
    entirely when those two are the same (a net reversion).
    """
    # Group the mutation strings by site, preserving their order of occurrence
    grouped = {}
    for mutation in muts:
        grouped.setdefault(int(mutation[1:-1]), []).append(mutation)

    condensed = []
    for position in sorted(grouped):
        chain = grouped[position]
        if len(chain) == 1:
            condensed.append(chain[0])
            continue
        # Several changes at this site: compare the initial and final identities
        initial, final = chain[0][0], chain[-1][-1]
        if initial != final:
            condensed.append(f"{initial}{position}{final}")
    return condensed

# Index the tips under the MRCA by name once, for exact-name lookup.
# `find_clades(name=...)` interprets a string value as a regular expression, so
# a strain name containing regex metacharacters could match the wrong clade or
# none at all; it also re-traverses the tree for every strain.
tips_by_name = {}
for tip in mrca_tree.get_terminals():
    tips_by_name.setdefault(tip.name, tip)  # keep the first tip if a name repeats

records = []
for strain in get_mrca:
    node = tips_by_name[strain]
    # Mutations accumulated along the path, kept separately per mutation type
    path_muts = {"nuc": [], "HA1": [], "HA2": []}
    # get_path() lists the clades from below the MRCA down to the tip (the MRCA
    # clade itself is excluded), so only changes that arose after the MRCA count
    for ancestor in mrca_tree.get_path(node):
        # A branch may carry no "mutations" entry, or lack some mutation types
        branch_muts = ancestor.branch_attrs.get("mutations", {})
        for mut_type, mut_list in path_muts.items():
            mut_list.extend(branch_muts.get(mut_type, []))
    records.append(
        (
            # Report the strain under the name used in the titers file
            recent_strains_inv[strain],
            condense_muts(path_muts["nuc"]),
            condense_muts(path_muts["HA1"]),
            condense_muts(path_muts["HA2"]),
        )
    )

# One row per strain: the three condensed mutation lists plus their lengths as counts
muts = pd.DataFrame(
    records,
    columns=[
        "strain",
        "nucleotide_mutations_list",
        "HA1_mutations_list",
        "HA2_mutations_list",
    ],
).assign(
    nucleotide_mutations=lambda df: df["nucleotide_mutations_list"].map(len),
    HA1_protein_mutations=lambda df: df["HA1_mutations_list"].map(len),
    # total protein count = HA1 count (the column assigned just above) + HA2 count
    protein_mutations=lambda df: (
        df["HA1_protein_mutations"] + df["HA2_mutations_list"].map(len)
    ),
)

muts.to_csv("HA_mutations_from_MRCA.csv", index=False)

muts

Unnamed: 0,strain,nucleotide_mutations_list,HA1_mutations_list,HA2_mutations_list,nucleotide_mutations,HA1_protein_mutations,protein_mutations
0,A/AbuDhabi/6753/2023,"[A24G, G213A, G222A, A352G, C413A, A426G, T449...","[E50K, D53N, N96S, K121E, I140K, H156S, I192F,...",[N49S],21,8,9
1,A/Bangkok/P3599/2023,"[A24G, G95A, G213A, G222A, A352G, C413A, T449C...","[E50K, D53N, N96S, I140K, H156S, I192F, Q197H,...",[N49S],25,8,9
2,A/Bangkok/P3755/2023,"[A24G, G213A, G222A, A305G, A352G, C413A, A429...","[E50K, D53N, N96S, N122D, I140K, S144N, H156S,...",[N49S],24,10,11
3,A/Bhutan/0006/2023,"[T56A, G63A, G101A, T208C, A209G, G222A, A352G...","[I48T, D53N, N96S, H156S, Q173R, I192F]",[N49S],21,6,7
4,A/Bhutan/0845/2023,"[A24G, A138G, G213A, G222A, A352G, C413A, T449...","[I25V, E50K, D53N, N96S, I140K, H156S, I192F, ...","[V18M, N49S]",23,8,10
...,...,...,...,...,...,...,...
57,A/TECPAN/017FLU/2023,"[G213A, T300G, G392A, T484A, T650C, A665G, T74...","[E50K, F79V, I140K, I242M]",[V18M],12,4,5
58,A/Townsville/68/2023,"[A24G, T89C, G213A, G222A, A352G, C413A, T449C...","[E50K, D53N, N96S, I140K, H156S, I192F, I223V,...","[D46N, N49S]",23,8,10
59,A/Victoria/1033/2023,"[A24G, G213A, G222A, A352G, C413A, T449C, T484...","[E50K, D53N, N96S, I140K, H156S, I192F, I223V,...",[N49S],20,8,9
60,A/Wisconsin/27/2023,"[A24G, G213A, G222A, A352G, C413A, T449C, A452...","[E50K, D53N, N96S, I140K, H156S, I192F, I223V,...",[N49S],21,8,9
