# Saltational subgraphs

A notebook to plot the subgraphs of a few of the major "saltational" outbreaks, specifically Alpha, Delta, and Omicron (BA.1 / BA.2).

Note that there is no automated way to save each subgraph to an SVG file, so this needs to be done for the
Alpha, Delta, and Omicron subgraphs by manually clicking on the "Download as" button, and moving/renaming
the resulting file to subgraph-Alpha.svg, subgraph-Delta.svg, etc. as appropriate. You can then run the following to get pdfs:

```
python src/makepdf.py figures/subgraph-*.svg
```

In [1]:
import tszip
import sc2ts
import numpy as np
import pandas as pd

import nb_utils
from nb_utils import DATA_DIR

# Load the compressed ARG, build the per-node table keyed by sample ID,
# and load the accompanying dataset via the notebook utilities.
trees_path = DATA_DIR / "sc2ts_viridian_v1.2.trees.tsz"
ts = tszip.load(trees_path)
df = sc2ts.node_data(ts).set_index("sample_id")
ds = nb_utils.load_dataset()

In [2]:
# Build the ARG visualizer from the tree sequence - this can take a few minutes
arg = nb_utils.D3ARG_viz(ts, df, pangolin_field="pango", progress=False)
arg.set_sc2ts_node_labels(progress=False)

# Hand-picked node IDs whose labels are overridden with a "<lineage>-root" name
root_labels = {
    86456: "Alpha-root",
    200039: "Delta-root",
    851246: "BA.1-root",
    822854: "BA.2-root",
    1189192: "BA.5-root",
}
for node_id, root_label in root_labels.items():
    arg.d3arg.nodes.loc[arg.d3arg.nodes.id == node_id, 'label'] = root_label

arg.set_sc2ts_node_styles()

In [3]:
def make_mut_array(muts):
    """
    Convert mutation strings into a record array of positions and derived states.

    Each string is either ``<position><derived>`` (e.g. ``"11288-"``) or
    ``<ancestral><position><derived>`` (e.g. ``"C913T"``). A leading
    non-numeric character is dropped, the final character is taken as the
    derived state, and everything in between is parsed as the integer position.

    Returns a numpy record array with fields ``positions`` and
    ``derived_states``, in the iteration order of ``muts``.
    """
    positions = []
    derived_states = []
    for mutation in muts:
        # Drop the ancestral state when the string does not start with a digit
        core = mutation if mutation[0].isnumeric() else mutation[1:]
        positions.append(int(core[:-1]))
        derived_states.append(core[-1])
    return np.rec.fromarrays([positions, derived_states], names="positions,derived_states")

def add_muts_to_set(mut, s):
    """
    Parse one mutation description and add the mutation(s) it encodes to ``s``.

    Hyphens are first converted to underscores and surrounding whitespace is
    removed. The cleaned text is then handled as follows:

    - blank text: ignored
    - comma-separated list (``"G28280C, A28281T"``): each non-empty item is
      added after stripping whitespace
    - ends with ``del`` (``"21765-21770 del"``, ``"28271 del"``): one
      ``"<pos>-"`` entry is added per site in the inclusive range
    - ends with ``ins``: skipped, with a message printed
    - contains ``(``: skipped, with a message printed
    - contains ``_`` (``"21987_21995"``): treated as a deletion range, with a
      message printed
    - anything else: added as-is

    Parameters
    ----------
    mut : str
        The mutation description.
    s : set
        The set to update in place.
    """
    # Strip once up front so that every branch sees clean text. Without this,
    # padded input such as "C913T " would be stored with its whitespace and
    # padded "... del " would miss the deletion branch entirely.
    mut = mut.replace("-", "_").strip()
    if not mut:
        # e.g. a blank line in a multi-line mutation listing
        return
    if "," in mut:
        for item in mut.split(","):
            item = item.strip()
            if item:
                s.add(item)
    elif mut.endswith("del"):
        # int() tolerates the whitespace left before the "del" suffix
        bounds = [int(bound) for bound in mut[:-3].split("_")]
        for site in range(bounds[0], bounds[-1] + 1):
            s.add(f"{site}-")
    elif mut.endswith("ins"):
        print(f"Skipping insertion {mut}")
    elif "(" in mut:
        print(f"Skipping {mut}")
    elif "_" in mut:
        print(f"Treating {mut} as a deletion")
        bounds = [int(bound) for bound in mut.split("_")]
        for site in range(bounds[0], bounds[1] + 1):
            s.add(f"{site}-")
    else:
        s.add(mut)


## Alpha

Mutations from https://virological.org/t/preliminary-genomic-characterisation-of-an-emergent-sars-cov-2-lineage-in-the-uk-defined-by-a-novel-set-of-spike-mutations/563


In [4]:
# Alpha-defining mutations, one entry per line (ranges marked "del" are deletions)
muts = """C913T
C3267T
C5388A
C5986T
T6954C
11288_11296 del
C14676T
C15279T
T16176C
21765-21770 del
21991-21993 del
A23063T
C23271A
C23604A
C23709T
T24506G
G24914C
T26801C
C27972T
G28048T
A28111G
G28280C, A28281T, T28282A
C28977T"""

# Expand every listed entry into individual mutations
Alpha_muts = set()
for entry in muts.splitlines():
    add_muts_to_set(entry, Alpha_muts)

# Additional nodes to include in the plot, drawn in light grey
extra_nodes = [86035, 4396, 4552]
alpha_node_ids = df.loc[df.pango == "B.1.1.7", 'node_id']
alpha_highlights = {
    "lightgrey": extra_nodes,
    arg.highlight_colour: alpha_node_ids,
}

arg.plot_pango_subgraph(
    ["B.1.1.7"],
    include=extra_nodes,
    restrict_to_first=2,
    parent_levels=10,
    child_levels=0,
    y_axis_scale="rank",
    highlight_nodes=alpha_highlights,
    highlight_mutations=make_mut_array(Alpha_muts),
    label_mutations=True,
    tree_highlighting=False,
    show_title=False,
    height=1000,
    width=800,
    positions_file="layout_data/subgraph-Alpha.json",
    save_filename="subgraph-Alpha",
)

## Delta

This saltational lineage was broken up by long branch splitting of an original long branch between B.1.617 and B.1. The list of mutations is taken from https://www.medrxiv.org/content/10.1101/2021.08.05.21261642v2.full.pdf and https://github.com/cov-lineages/constellations/blob/main/constellations/definitions/cB.1.617.2.json


In [6]:


# Delta-defining mutations, one entry per line (ranges marked "del" are deletions)
muts = """G210T
G15451A
C16466T
C21618G
22029-22034 del
T22917G
C22995A
C23604G
G24410A
C25469T
T26767C
T27638C
C27752T
28248-28253 del
28271 del
A28461G
G28881T
G29402T
G29742T"""

# Expand every listed entry into individual mutations
Delta_muts = set()
for m in muts.split("\n"):
    add_muts_to_set(m, Delta_muts)

# IDs of nodes carrying the NODE_IS_UNCONDITIONALLY_INCLUDED flag. This does not
# depend on the Pango prefix, so compute it once rather than once per prefix.
unconditional_nodes = np.where(ts.nodes_flags & sc2ts.NODE_IS_UNCONDITIONALLY_INCLUDED != 0)[0]

# For each prefix, the lowest-ID flagged node whose Pango label starts with it.
# Note that the "B.1.617" prefix also matches B.1.617.1 and B.1.617.2 labels.
seed_nodes = [
    np.min([u for u in unconditional_nodes if ts.node(u).metadata['pango'].startswith(p)])
    for p in ("B.1.617", "B.1.617.1", "B.1.617.2")
]

keep_ids = [18017, 220186, 238161]
arg.plot_pango_subgraph(
    ["B.1.617.2"],
    restrict_to_first=1,
    include=keep_ids + seed_nodes,
    parent_levels=10,
    child_levels=0,
    y_axis_scale="rank",
    highlight_nodes={
        'lightgrey': keep_ids,
        arg.highlight_colour: df.loc[df.pango=="B.1.617.2", 'node_id'],
        # seed_nodes is already a flat list of node IDs; pass it directly (as the
        # Omicron plot does) rather than wrapping it in a second list
        'red': seed_nodes,
    },
    highlight_mutations=make_mut_array(Delta_muts),
    height=1000,
    width=800,
    positions_file="layout_data/subgraph-Delta.json",
    label_mutations=True,
    tree_highlighting=False,
    show_title=False,
    save_filename="subgraph-Delta",
)

## Omicron (BA.1 / BA.2)

This saltational lineage was broken up by long branch splitting of an original long branch between BA.1 and B.1.1. The list of mutations is taken from https://github.com/cov-lineages/pango-designation/issues/361.

In [7]:
# Read in the mutations file, which is complex as mutations can be one of 3 classes
# (BA.1, BA.2, or both)
def conv(val):
    """Return True when ``val`` is 'Y' or 'y', and False for anything else."""
    return val in ("Y", "y")

# One mutation set per class; each CSV row must be flagged for exactly one of them
pangos = ("BA.2", "BA.1", "B.1.1.529")
Omicron_muts = {name: set() for name in pangos}

mut_df = pd.read_csv(
    DATA_DIR / "Omicron_BA.1_BA.2_mutations.csv",
    # The per-class columns hold Y/y flags, converted to booleans by conv
    converters=dict.fromkeys(pangos, conv),
    dtype={"notes": str},
    keep_default_na=False,
)

for _, row in mut_df.iterrows():
    flagged = [name for name in pangos if row[name]]
    assert len(flagged) == 1
    # Rows whose notes mention "B.1" are left out
    if "B.1" in row.notes:
        continue
    # The mutation description is in the first column
    add_muts_to_set(row.iloc[0], Omicron_muts[flagged[0]])

# Mutations found in both the BA.1 and BA.2 sets are moved to the B.1.1.529 set
s = Omicron_muts["BA.2"] & Omicron_muts["BA.1"]
if s:
    print(f"Moving joint mutations {s} in BA.1 and BA.2")
    Omicron_muts["BA.2"] -= s
    Omicron_muts["BA.1"] -= s
    Omicron_muts["B.1.1.529"] |= s

Skipping G21987A (outlier); 21987_21995 (main)
Treating 21987_21995 as a deletion
Skipping insertion 22205GAGCCAGAAins
Moving joint mutations {'C22674T'} in BA.1 and BA.2


In [8]:
# Flagged (unconditionally included) nodes whose Pango label starts with "BA"
unconditional_nodes = np.where(ts.nodes_flags & sc2ts.NODE_IS_UNCONDITIONALLY_INCLUDED != 0)[0]
seed_nodes = [
    u for u in unconditional_nodes if ts.node(u).metadata['pango'].startswith("BA")
]

# Samples to keep in the plot, looked up by sample ID, plus the seed nodes
keep_samples = [
    "ERR7858953",
    "SRR17051902",
    "ERR7551924",
    "ERR7552008",
    "SRR17041376",
    "SRR17461792",
    "SRR19117184",
    "ERR9447482",
]
keep_ids = list(df.loc[keep_samples, "node_id"]) + seed_nodes

# Colours to match those in https://github.com/cov-lineages/pango-designation/issues/361
newcols = {"BA.1": "LightSteelBlue", "BA.2": "Moccasin", "B.1.1.529": "DarkSeaGreen"}

# Colour -> nodes of that Pango lineage, with the seed nodes in red
omicron_highlights = {
    colour: df.loc[df.pango == pango, 'node_id'] for pango, colour in newcols.items()
}
omicron_highlights['red'] = seed_nodes

# Colour -> mutations of the matching class
omicron_mutations = {
    colour: make_mut_array(Omicron_muts[pango]) for pango, colour in newcols.items()
}

arg.plot_pango_subgraph(
    ["BA.1", "BA.2"],
    restrict_to_first=1,
    include=keep_ids + [848636],
    parent_levels=10,
    child_levels=0,
    y_axis_scale="rank",
    highlight_nodes=omicron_highlights,
    highlight_mutations=omicron_mutations,
    label_mutations=True,
    tree_highlighting=False,
    show_title=False,
    height=1000,
    width=800,
    positions_file="layout_data/subgraph-Omicron.json",
    save_filename="subgraph-Omicron",
)

## Overview

This is the tree of the major lineages and their sister taxa (note that it is not used in the supplementary). The two recombination nodes in the subgraph are both likely to be artefactual: the one labelled BA.2 does not pass QC, and the one labelled BA.5-root would have been removed by long branch splitting had it been performed at the time of matching.


In [9]:
# Have a quick look at the tree of the major lineages and their sister taxa.
# Note that this is not used in the supplementary. The two recombination nodes are both
# likely to be artefactual: the one labelled BA.2 does not pass QC, and the BA.5 one
# would have been removed by long branch splitting had it been performed at the time of matching.

import tskit_arg_visualizer as argviz
import json
from pathlib import Path
# Hand-picked Pango lineages to show in the overview plot
outbreaks = [
    "A",
    "B",
    "B.1.1.7",
    "B.1.617.2",
    "BA.1",
    "BA.2",
    "BA.4",
    "BA.4.1",
    "BA.5",
]

arg.plot_pango_subgraph(
    outbreaks,
    restrict_to_first=1,
    parent_levels=10,
    child_levels=0,
    y_axis_scale="rank",
    highlight_colour="gray",
    label_mutations=False,
    height=1000,
    width=800,
    positions_file="layout_data/SimplifiedMajorLineages.json",
)