# Investigating BA.2 and BA.4 / BA.5 origins in the sc2ts ARG

In [1]:
# Don't need this if sc2ts and the arg visualizer are already installed
!pip uninstall -y tskit_arg_visualizer
!pip install -q tskit tszip git+http://github.com/jeromekelleher/sc2ts git+https://github.com/kitchensjn/tskit_arg_visualizer

Found existing installation: tskit_arg_visualizer 0.0.2
Uninstalling tskit_arg_visualizer-0.0.2:
  Successfully uninstalled tskit_arg_visualizer-0.0.2


In [2]:
import collections
import os

import sc2ts
import numpy as np
import tszip
import tskit


First we load the "find_problematic_v2-2022-09-01.ts.il.tsz" file, which contains the first-pass ARG, in which all mutations (except deletions) have been used in inference. We use the "imputed lineages" (`.il`) version, which is output by the following script:

```
python scripts/run_lineage_imputation.py data/consensus_mutations.json.bz data/find_problematic_v2-2022-09-01.ts.tsz
```

In [3]:
# Get the Viridian ARG

ts_dir = "../data"
filename = "find_problematic_v2-2022-09-01.ts.il.tsz"
ts = tszip.decompress(os.path.join(ts_dir, filename))
print(
    f"Loaded {ts.nbytes/1e6:0.1f} megabyte SARS-CoV2 genealogy of {ts.num_samples} strains",
    f"({ts.num_trees} trees, {ts.num_mutations} mutations over {ts.sequence_length} basepairs).",
    f"Last collection date is {ts.node(ts.samples()[-1]).metadata['date']}",
)
ti = sc2ts.TreeInfo(ts)

Loaded 468.9 megabyte SARS-CoV2 genealogy of 462049 strains (174 trees, 868199 mutations over 29904.0 basepairs). Last collection date is 2022-09-01


Counting descendants :   0%|          | 0/545076 [00:00<?, ?it/s]

Indexing metadata    :   0%|          | 0/545076 [00:00<?, ?it/s]

Classifying mutations:   0%|          | 0/868199 [00:00<?, ?it/s]

## BA.2

Identify the recombination nodes in the ancestry of all the BA.2xxx samples. Also include the pango X- lineages, other than XA, XB and XC, as we don't know if they are descendants

In [4]:
focal = "BA.2"

exclude_pango_X = ("XA", "XB", "XC", "XF", "XS")  # Hnad-checks that these are not BA.2 recombinants

samples = [
    ti.pango_lineage_samples[pango]
    for pango in ti.pango_lineage_samples.keys()
    if (pango.startswith("X") and not pango in exclude_pango_X) or pango.startswith(focal)]
small_ts, node_map = ts.simplify(np.concatenate(samples), keep_unary=True, map_nodes=True)
print(f"Simplified ts focussing on {focal} (plus X lineages) has {small_ts.num_trees} trees with breakpoints at {small_ts.breakpoints(as_array=True)}")

Simplified ts focussing on BA.2 (plus X lineages) has 40 trees with breakpoints at [    0.  3264.  4321.  5100.  6843.  8393.  9856.  9866. 10447. 10518.
 11537. 14619. 14857. 15009. 15240. 15521. 15960. 18078. 19955. 19999.
 21294. 21618. 21721. 22599. 22674. 22786. 22792. 22916. 22917. 23202.
 25416. 26028. 26060. 26491. 26858. 27012. 27259. 27382. 28245. 29729.
 29904.]


There are a lot of breakpoints: some of these might be later on in the BA.2 outbreak, corresponding to e.g. pango X lineages. List them by time

In [5]:
from tqdm.auto import tqdm

reverse_map = {v: k for k, v in enumerate(node_map) if v != tskit.NULL}
focal_times = small_ts.nodes_time[node_map[ti.pango_lineage_samples[focal]]]
focal_re_info = {} # save time since start, num older than focal and, max num desc 
for nd in small_ts.nodes():
    if nd.flags & sc2ts.NODE_IS_RECOMBINANT:
        focal_re_info[reverse_map[nd.id]] = (ts.max_time-nd.time, np.sum(focal_times > nd.time), collections.Counter())
for k, v in tqdm(focal_re_info.items()):
    seen = set()
    for tree in ts.trees():
    # this is v. inefficient: we should really be doing ARG traversals
        for u in tree.samples(k):
            if u not in seen:
                v[2][ts.node(u).metadata["Viridian_pangolin"]] += 1
                seen.add(u)
for k in sorted(focal_re_info.keys(), key = lambda x: focal_re_info[x][0]):
    v = focal_re_info[k]
    print(f"Recomb node {k} @ {v[0]:.2g} days from Wuhan has {v[1]} older {focal} samples and {v[2].total()} descendant samples ({v[2].most_common(10)})")

  0%|          | 0/70 [00:00<?, ?it/s]

Recomb node 346185 @ 7.3e+02 days from Wuhan has 0 older BA.2 samples and 130048 descendant samples ([('BA.2', 37800), ('BA.2.12.1', 18036), ('BA.5.2.1', 9662), ('BA.5.1', 7577), ('BA.5.2', 5102), ('BA.2.9', 4502), ('BA.5.5', 4207), ('BA.4.1', 3514), ('BA.2.3', 3329), ('BA.5.6', 2407)])
Recomb node 351805 @ 7.5e+02 days from Wuhan has 37 older BA.2 samples and 1 descendant samples ([('XM', 1)])
Recomb node 359353 @ 7.6e+02 days from Wuhan has 123 older BA.2 samples and 217 descendant samples ([('XE', 217)])
Recomb node 359567 @ 7.6e+02 days from Wuhan has 123 older BA.2 samples and 6 descendant samples ([('XJ', 5), ('BA.2', 1)])
Recomb node 376984 @ 7.8e+02 days from Wuhan has 913 older BA.2 samples and 57 descendant samples ([('BA.2', 39), ('XZ', 12), ('XAP', 6)])
Recomb node 385437 @ 7.9e+02 days from Wuhan has 2147 older BA.2 samples and 14 descendant samples ([('XQ', 9), ('BA.2', 3), ('XR', 2)])
Recomb node 385978 @ 7.9e+02 days from Wuhan has 2302 older BA.2 samples and 1 descenda

It appears as if there is only one major recombination node, 346185, which originates the BA.2 outbreak (no BA.2 nodes older than this). Check out that node and its breakpoint:

In [6]:
print(ts.node(346185))
re_edges = ts.edges_child == 346185
parents = np.unique(ts.edges_parent[re_edges])
print(
    f"Has {np.sum(re_edges)} parents ({', '.join([ts.node(p).metadata['Imputed_Viridian_pangolin'] for p in parents])}), "
    f"and breakpoints at {set(ts.edges_left[re_edges]) & set(ts.edges_right[re_edges])}"
)

Node(id=346185, flags=8388608, time=248.00000699999998, population=-1, individual=-1, metadata={'Imputed_Viridian_pangolin': 'Unknown (R)', 'sc2ts': {'date_added': '2022-01-01', 'group_id': '5b918451bd7ccb36af4cb9908a1fe972'}})
Has 2 parents (B.1, BA.1.1), and breakpoints at {np.float64(22674.0)}


And look at the subgraph

In [7]:
import tskit_arg_visualizer as argviz
d3arg = argviz.D3ARG.from_ts(ts, progress=True)

Edges:   0%|          | 0/545434 [00:00<?, ?it/s]

Sites:   0%|          | 0/29903 [00:00<?, ?it/s]

Nodes:   0%|          | 0/545076 [00:00<?, ?it/s]

In [8]:
%%javascript
// necessary incantation to get the tskit_arg_visualizer to display on some systems
var script = document.createElement('script');
script.type = 'text/javascript';
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js';
document.head.appendChild(script);

<IPython.core.display.Javascript object>

In [9]:
re_node = 346185
d3arg.reset_all_node_styles()
d3arg.set_node_styles([{"id": nd.id, "fill": "red"} for nd in ts.nodes() if nd.flags & sc2ts.NODE_IS_RECOMBINANT])
d3arg.set_node_labels({
    nd.id: " ".join([
        # For recombination nodes, list their most common Pango descendant types
        ", ".join([x[0] for x in focal_re_info[nd.id][2].most_common(3)]) if nd.id in focal_re_info else nd.metadata.get("Imputed_Viridian_pangolin", ""),
        f"({nd.metadata.get('strain', '#' + str(nd.id))})"
    ]) for nd in ts.nodes()
})
d3arg.nodes.loc[np.isin(d3arg.nodes["id"], ts.samples()), "symbol"] = 'd3.symbolSquare'
d3arg.nodes.loc[:, "stroke_width"] = 2

parent_levels, child_levels  = 1, 20  # Only show immediate children, but show all parents up to the root

d3arg.draw_node(
    re_node,
    degree=(child_levels, parent_levels),
    height=600,
    width=600,
    show_mutations=True,
    #include_mutation_labels=True,
)

## BA.4/BA.5

The BA.4 and BA.5 outbreaks are concurrent and interesting, as they may be recombinants

In [10]:
focal = "BA.4"  # this also picks up the major BA.5 node

samples = [
    ti.pango_lineage_samples[pango]
    for pango in ti.pango_lineage_samples.keys()
    if (pango.startswith("X") and not pango in exclude_pango_X) or pango.startswith(focal)]
small_ts, node_map = ts.simplify(np.concatenate(samples), keep_unary=True, map_nodes=True)
print(f"Simplified ts focussing on {focal} (plus X lineages) has {small_ts.num_trees} trees with breakpoints at {small_ts.breakpoints(as_array=True)}")

Simplified ts focussing on BA.4 (plus X lineages) has 24 trees with breakpoints at [    0.  1453.  1627.  4321.  8393. 11537. 15240. 15960. 19955. 20883.
 21618. 22599. 22674. 22917. 25416. 25810. 26028. 26060. 26529. 26858.
 27012. 27128. 27259. 27382. 29904.]


In [11]:
from tqdm.auto import tqdm

reverse_map = {v: k for k, v in enumerate(node_map) if v != tskit.NULL}
focal_times = small_ts.nodes_time[node_map[ti.pango_lineage_samples[focal]]]
focal_re_info = {} # save time since start, num older than focal and, max num desc 
for nd in small_ts.nodes():
    if nd.flags & sc2ts.NODE_IS_RECOMBINANT:
        focal_re_info[reverse_map[nd.id]] = (ts.max_time-nd.time, np.sum(focal_times > nd.time), collections.Counter())
for k, v in tqdm(focal_re_info.items()):
    seen = set()
    for tree in ts.trees():
    # this is v. inefficient: we should really be doing ARG traversals
        for u in tree.samples(k):
            if u not in seen:
                v[2][ts.node(u).metadata["Viridian_pangolin"]] += 1
                seen.add(u)
for k in sorted(focal_re_info.keys(), key = lambda x: focal_re_info[x][0]):
    v = focal_re_info[k]
    print(f"Recomb node {k} @ {v[0]:.2g} days from Wuhan has {v[1]} older {focal} samples and {v[2].total()} descendant samples ({v[2].most_common(10)})")

  0%|          | 0/43 [00:00<?, ?it/s]

Recomb node 346185 @ 7.3e+02 days from Wuhan has 0 older BA.4 samples and 130048 descendant samples ([('BA.2', 37800), ('BA.2.12.1', 18036), ('BA.5.2.1', 9662), ('BA.5.1', 7577), ('BA.5.2', 5102), ('BA.2.9', 4502), ('BA.5.5', 4207), ('BA.4.1', 3514), ('BA.2.3', 3329), ('BA.5.6', 2407)])
Recomb node 351805 @ 7.5e+02 days from Wuhan has 0 older BA.4 samples and 1 descendant samples ([('XM', 1)])
Recomb node 359353 @ 7.6e+02 days from Wuhan has 0 older BA.4 samples and 217 descendant samples ([('XE', 217)])
Recomb node 359567 @ 7.6e+02 days from Wuhan has 0 older BA.4 samples and 6 descendant samples ([('XJ', 5), ('BA.2', 1)])
Recomb node 376984 @ 7.8e+02 days from Wuhan has 0 older BA.4 samples and 57 descendant samples ([('BA.2', 39), ('XZ', 12), ('XAP', 6)])
Recomb node 385437 @ 7.9e+02 days from Wuhan has 0 older BA.4 samples and 14 descendant samples ([('XQ', 9), ('BA.2', 3), ('XR', 2)])
Recomb node 388645 @ 7.9e+02 days from Wuhan has 0 older BA.4 samples and 6 descendant samples ([

It appears as if node 435205, with 46786 descendant samples is the one that is the origin of all the BA.5x samples. Let's look at it

In [12]:
ts.node(435205)
re_edges = ts.edges_child == 435205
parents = np.unique(ts.edges_parent[re_edges])
print(
    f"Has {np.sum(re_edges)} parents ({', '.join([ts.node(p).metadata['Imputed_Viridian_pangolin'] for p in parents])}), "
    f"and breakpoints at {set(ts.edges_left[re_edges]) & set(ts.edges_right[re_edges])}"
)

Has 2 parents (BA.1, BA.4), and breakpoints at {np.float64(26858.0)}


In [13]:
re_node = 435205
d3arg.reset_all_node_styles()
#d3arg.set_node_styles([{"id":u,"fill":"magenta"} for u in allX] + [{"id":mrca,"fill":"red"}])
d3arg.set_node_styles([{"id": nd.id, "fill": "red"} for nd in ts.nodes() if nd.flags & sc2ts.NODE_IS_RECOMBINANT])
d3arg.set_node_labels({
    nd.id: " ".join([
        # For recombination nodes, list their most common Pango descendant types
        ", ".join([x[0] for x in focal_re_info[nd.id][2].most_common(3)]) if nd.id in focal_re_info else nd.metadata.get("Imputed_Viridian_pangolin", ""),
        f"({nd.metadata.get('strain', '#' + str(nd.id))})"
    ]) for nd in ts.nodes()
})
d3arg.nodes.loc[np.isin(d3arg.nodes["id"], ts.samples()), "symbol"] = 'd3.symbolSquare'
d3arg.nodes.loc[:, "stroke_width"] = 2
parent_levels, child_levels  = 1, 20  # Only show immediate children, but show all parents up to the root
d3arg.draw_node(
    re_node,
    degree=(child_levels, parent_levels),
    height=800,
    width=1000,
    show_mutations=True,
    #y_axis_scale="time",
    #include_mutation_labels=True,
)

Note that the upper recombination node (red) here is the same as the one in the previous plot, that is the origin of the BA.2 outbreak. According to this ARG, BA.4 is a (non-recombinant) descendant of those BA.2 strains, whereas BA.5 is a recombination of BA.4 and BA.1