# Get mutations in library strains relative to MRCA

In [1]:
import collections
import json

import Bio.Phylo
import Bio.SeqIO

import pandas as pd

First get a tree:

In [2]:
# Load the Nextstrain JSON file. The encoding is given explicitly so that parsing
# does not depend on the platform's default text encoding.
with open("h3n2_ha.json", encoding="utf-8") as f:
    data = json.load(f)

# Build tree with branch attributes
def build_phylo_tree(node):
    """Recursively convert a nested JSON tree node into a `Bio.Phylo` clade.

    Parameters
    ----------
    node : dict
        Tree node. The keys 'name', 'branch_attrs' and 'children' are all
        optional; 'children', when present, is a list of nodes of the same form.

    Returns
    -------
    Bio.Phylo.BaseTree.Clade
        Clade named after the node ('' when the node has no 'name'). The node's
        'branch_attrs' value (or an empty dict when absent) is attached as
        `clade.branch_attrs`, and one child clade is created per entry of
        'children', in the same order.
    """
    clade = Bio.Phylo.BaseTree.Clade(name=node.get("name", ""))
    # Default to an empty dict so a node without branch annotations does not raise
    # KeyError here; 'name' and 'children' are already treated as optional.
    clade.branch_attrs = node.get("branch_attrs", {})
    # If the node has children, create them recursively
    if "children" in node:
        clade.clades = [build_phylo_tree(child) for child in node["children"]]
    return clade

# Build the phylo tree from the JSON data. `tree` is the root clade returned by
# `build_phylo_tree` for the top-level "tree" entry of the JSON.
tree = build_phylo_tree(data["tree"])

Now we want to find the common ancestor of all of the library strains from 2022-2023 (these are the recent strains):

In [3]:
# Map each 2022-2023 virus name in the titer file to its name in the tree
# (the same name with underscores removed).
recent_strains = (
    pd.read_csv("../H3_seqneut_titers.csv")
    [["virus"]]
    .drop_duplicates()
    .assign(year=lambda x: x["virus"].str.split("/").str[-1])
    # Keep only names whose last "/"-separated field is a 4-digit year. The regex is
    # a raw string applied directly to the column: writing '\d{4}' inside a normal
    # string (as a `.query` expression) is an invalid escape sequence and emits a
    # SyntaxWarning. `na=False` drops rows where the year could not be extracted.
    .loc[lambda x: x["year"].str.fullmatch(r"\d{4}", na=False)]
    .assign(year=lambda x: x["year"].astype(int))
    .query("(year >= 2022) and (year <= 2023)")
    # literal (non-regex) removal of underscores
    .assign(treename=lambda x: x["virus"].str.replace("_", "", regex=False))
    .set_index("virus")
    ["treename"]
    .to_dict()
)

# Inverse mapping: tree name -> virus name in the titer file
recent_strains_inv = {val: key for (key, val) in recent_strains.items()}

# Two viruses collapsing to the same tree name would silently drop one of them from
# the inverse mapping, so fail loudly instead.
assert len(recent_strains_inv) == len(recent_strains), (
    "multiple viruses map to the same tree name after removing underscores"
)

print(f"Got {len(recent_strains)} recent strains with titers")

Got 62 recent strains with titers


  .query("year.str.fullmatch('\d{4}')")


Get the most recent common ancestor (MRCA) of the strains with titers:

In [4]:
terminal_names = [n.name for n in tree.get_terminals()]

# these two strains are known to be missing from the tree, but we do not have
# growth rates for them so allow this
allow_missing = {'A/Saint-Petersburg/RII-MH144113/2023', 'A/SouthAfrica/KO56863/2023'}

# tree names of the recent strains for which the MRCA is computed
get_mrca = [s for s in recent_strains.values() if s not in allow_missing]

# every remaining strain must be a tip of the tree; report the offenders if not
missing_from_tree = sorted(set(get_mrca) - set(terminal_names))
assert not missing_from_tree, f"strains not found among tree tips: {missing_from_tree}"

mrca_tree = tree.common_ancestor(get_mrca)
print(f"{mrca_tree.count_terminals()=}, {tree.count_terminals()=}")

mrca_tree.count_terminals()=2474, tree.count_terminals()=2680


Now get the mutations along the path from the MRCA to each of these strains:

In [5]:
def condense_muts(muts):
    """Collapse a list of mutations into at most one net change per site.

    Each mutation is a string whose first character is the starting state, whose
    last character is the ending state, and whose middle is an integer site
    (for example "A24G").

    Sites hit once keep their mutation string unchanged. For sites hit several
    times, the starting state of the first entry and the ending state of the last
    entry (in input order) are combined into a single mutation; if those two
    states are equal, the site is dropped from the result.

    Returns the condensed mutations as a list sorted by site number.
    """
    grouped = collections.defaultdict(list)
    for mutation in muts:
        grouped[int(mutation[1:-1])].append(mutation)

    net_changes = []
    for position in sorted(grouped):
        site_muts = grouped[position]
        if len(site_muts) == 1:
            # single hit: keep the original string as-is
            net_changes.append(site_muts[0])
            continue
        first_state = site_muts[0][0]
        last_state = site_muts[-1][-1]
        # multiple hits that end where they started cancel out
        if first_state != last_state:
            net_changes.append(f"{first_state}{position}{last_state}")
    return net_changes

# One record per strain: (titer-file name, condensed nuc / HA1 / HA2 mutation lists)
records = []
for strain in get_mrca:
    # Match on exact name equality. Passing `name=strain` as a keyword makes
    # Bio.Phylo treat the string as a regular expression, so a strain name
    # containing regex metacharacters (".", "(", "+", ...) could match the wrong
    # node or no node at all.
    matches = list(mrca_tree.find_clades(lambda clade: clade.name == strain))
    assert len(matches) == 1, f"expected one node named {strain!r}, found {len(matches)}"
    node = matches[0]
    # Mutations accumulated on the branches leading from the MRCA down to this
    # strain, kept in path order so that `condense_muts` can resolve repeated hits.
    path_muts = {"nuc": [], "HA1": [], "HA2": []}
    for ancestor in mrca_tree.get_path(node):
        # a branch without a "mutations" entry contributes nothing
        branch_muts = ancestor.branch_attrs.get("mutations", {})
        for mut_type, mut_list in path_muts.items():
            mut_list.extend(branch_muts.get(mut_type, []))
    records.append(
        (
            recent_strains_inv[strain],
            condense_muts(path_muts["nuc"]),
            condense_muts(path_muts["HA1"]),
            condense_muts(path_muts["HA2"]),
        )
    )

muts = pd.DataFrame(
    records,
    columns=["strain", "nucleotide_mutations_list", "HA1_mutations_list", "HA2_mutations_list"],
).assign(
    nucleotide_mutations=lambda x: x["nucleotide_mutations_list"].map(len),
    HA1_protein_mutations=lambda x: x["HA1_mutations_list"].map(len),
    # protein_mutations is the HA1 count plus the HA2 count
    protein_mutations=lambda x: x["HA1_mutations_list"].map(len) + x["HA2_mutations_list"].map(len),
)

muts.to_csv("HA_mutations_from_MRCA.csv", index=False)

muts

Unnamed: 0,strain,nucleotide_mutations_list,HA1_mutations_list,HA2_mutations_list,nucleotide_mutations,HA1_protein_mutations,protein_mutations
0,A/Abu_Dhabi/6753/2023,"[A24G, G213A, G222A, A352G, C413A, A426G, T449...","[E50K, D53N, N96S, K121E, I140K, H156S, I192F,...",[N49S],21,8,9
1,A/Bangkok/P3599/2023,"[A24G, G95A, G213A, G222A, A352G, C413A, T449C...","[E50K, D53N, N96S, I140K, H156S, I192F, Q197H,...",[N49S],25,8,9
2,A/Bangkok/P3755/2023,"[A24G, G213A, G222A, A305G, A352G, C413A, A429...","[E50K, D53N, N96S, N122D, I140K, S144N, H156S,...",[N49S],24,10,11
3,A/Bhutan/0006/2023,"[T56A, G63A, G101A, T208C, A209G, G222A, A352G...","[I48T, D53N, N96S, H156S, Q173R, I192F]",[N49S],21,6,7
4,A/Bhutan/0845/2023,"[A24G, A138G, G213A, G222A, A352G, C413A, T449...","[I25V, E50K, D53N, N96S, I140K, H156S, I192F, ...","[V18M, N49S]",23,8,10
5,A/Bhutan/FLU-BTG-00988/2022,"[G143A, T182G, G222A, G312A, A352G, T449C, A48...","[D53N, E83K, N96S, I140M, H156S, I192F]","[N49S, S113A, I149M]",19,6,9
6,A/Brisbane/429/2023,"[T47C, A223G, T287C, C333A, A376G, T484A, C531...","[D53G, D104G, I140K, H156S, I214T, K276R, R299K]",[],16,7,7
7,A/Busan/1301/2023,"[A223G, C333A, G422A, C531A, A532G, G632A, A89...","[D53G, H156S, K276R]",[],11,3,3
8,A/California/81/2023,"[A24G, G213A, G222A, A275G, A306G, A352G, C413...","[E50K, D53N, N81D, N96S, I140K, H156S, I192F, ...",[N49S],21,8,9
9,A/Catalonia/2041146NS/2023,"[C48A, A80G, G143A, G222A, A352G, T449C, A485G...","[D53N, N96S, I140M, H156S, I192F]",[N49S],14,5,6
