# Pango-X recombinant origins in the sc2ts ARG

In [None]:
import collections

import sc2ts
import numpy as np
import tskit
import tszip
import numpy as np
from IPython.display import HTML
import warnings

import nb_utils

# NB - tag the cells with progressbars with `remove_cell`, and export only the output cells to PDF via:
# jupyter nbconvert --to webpdf --no-prompt --no-input --PDFExporter.scale_factor=0.8  --TagRemovePreprocessor.remove_cell_tags='{"remove_cell"}' --PDFExporter.margin_left=0.2cm --PDFExporter.margin_right=0.2cm Viridian-PangoX.ipynb

In [None]:
# Load the compressed Viridian ARG from disk
ts = tszip.load("../data/sc2ts_v1_2023-02-21_pp_dels_bps_pango_dated_mmps.trees.tsz")

# Node data from the ARG, re-indexed by sample_id and joined with two columns of the
# dataset metadata (assumes the metadata frame is indexed by sample ID — TODO confirm)
ds = sc2ts.Dataset("../data/viridian_mafft_2024-10-14_v1.vcz")
viridian_metadata = ds.metadata.as_dataframe(["Date_tree", "Viridian_pangolin"])
df = sc2ts.node_data(ts).set_index("sample_id").join(viridian_metadata)

# Number of nodes carrying the recombinant flag
num_recombinants = np.sum((ts.nodes_flags & sc2ts.NODE_IS_RECOMBINANT) != 0)
print(
    f"Loaded ARG has {ts.num_nodes} nodes, {ts.num_edges} edges, "
    f"{ts.num_mutations} mutations, {num_recombinants} recomb. events"
)

In [None]:
# Set which pango designation to use: try "pangolin" (native sc2ts) or "Viridian_pangolin" (assigned by viridian)
# This is the name of the `df` column used for all lineage lookups in the cells below.
# NOTE(review): the hint above mentions "pangolin", but the value actually set is "pango" —
# confirm which column name the native sc2ts designation uses.
Pango = "pango"

### List out the pango-X nodes

In [None]:
# Sample nodes whose designation (under the chosen scheme) starts with "X"
is_x_sample = np.logical_and(df.is_sample, df[Pango].str.startswith("X"))
dfX = df[is_x_sample]
# Map every designation to the list of node IDs of its samples
pango_lineage_samples = df[df.is_sample].groupby(Pango)['node_id'].apply(list).to_dict()
pangoFullX = np.unique(dfX[Pango])
# Names without a "." are the main pango-X lineages; names with one are sublineages
pangoX = [name for name in pangoFullX if "." not in name]
pangoSubX = [name for name in pangoFullX if "." in name]
table_header = (
    f'<tr><th>{len(pangoX)} main pango-X lineages</th>'
    f'<th>{len(pangoSubX)} sub pango-X lineages</th></tr>'
)
table_body = f'<tr><td>{", ".join(pangoX)}</td><td>{", ".join(pangoSubX)}</td></tr>'
display(HTML(f'<table>{table_header}{table_body}</table>'))

In [None]:
# State the provenance of the consensus mutations, then read them from the local file
# via the nb_utils helper. The result is passed to the ARG visualiser further down.
print("Consensus mutations for each lineage taken from https://covidcg.org")
lineage_defining_muts = nb_utils.read_in_mutations("../data/consensus_mutations.json.bz2")

In [None]:
# Find most recent RE node above all samples of each type
from tqdm.auto import tqdm

# MRC_RE maps each main pango-X lineage to (RE node ID, node time) for the youngest
# recombination (RE) node found at or above the MRCA of all the lineage's samples in
# any local tree; it stays (None, inf) if no such node is found.
MRC_RE = {pango: (None, np.inf) for pango in pangoX}
recombination_nodes = set(np.where(ts.nodes_flags & sc2ts.NODE_IS_RECOMBINANT)[0])
nodes_time = ts.nodes_time
for tree in ts.trees():
    for x in pangoX:
        samples = pango_lineage_samples[x]
        if len(samples) == 0:
            continue
        # tree.mrca needs at least two nodes, so a single sample is its own starting point
        u = samples[0] if len(samples) == 1 else tree.mrca(*samples)
        # Walk rootwards until an RE node is hit or we run off the top of the tree.
        # NULL is tested first so that a NULL MRCA (samples with no common ancestor in
        # this tree) is never handed to tree.parent().
        while u != tskit.NULL and u not in recombination_nodes:
            u = tree.parent(u)
        # Smaller node time = more recent, so keep the youngest RE node seen in any tree
        if u != tskit.NULL and nodes_time[u] < MRC_RE[x][1]:
            MRC_RE[x] = (u, nodes_time[u])

In [None]:
# Gather, per pango-X lineage, every sample that descends from its candidate RE node in
# any local tree. This is slow because each tree has to be visited in turn.
samples = {pango: set() for pango in pangoX}
# Only lineages for which an RE node was actually found need to be traversed
re_node_by_pango = {
    pango: re_node for pango, (re_node, _) in MRC_RE.items() if re_node is not None
}
for tree in tqdm(ts.trees()):
    for pango, re_node in re_node_by_pango.items():
        samples[pango].update(tree.samples(re_node))

In [None]:
# For each main pango-X lineage, count the designations of all the samples that descend
# from its candidate RE node.
pango_counts = {pango: collections.Counter() for pango in pangoX}
# Reverse lookup: sample node ID -> designation
sample_to_pango = {
    s: p for p, sample_ids in pango_lineage_samples.items() for s in sample_ids
}
for pango, sample_set in samples.items():
    pango_counts[pango].update(sample_to_pango[s] for s in sample_set)

# Seemingly missing from Viridian QCed data
# Others past XBH not added here
# A None entry marks "no samples". setdefault is used so that a lineage which *is* present
# (e.g. under a different choice of `Pango`) keeps its real counts instead of being
# overwritten with None.
for missing_pango in (
    "XD", "XK", "XT", "XV", "XAB", "XAH", "XAK",
    "XAQ", "XAR", "XAT", "XAW", "XAY", "XBA", "XBC",
):
    pango_counts.setdefault(missing_pango, None)

In [None]:
# Build and display an HTML table with one row per pango-X lineage, showing its candidate
# RE node and the designations of the samples descending from that node.
tot_pango_x_re = []  # RE nodes below which the lineage itself is the most common pango-X
pango_x_nodes = collections.defaultdict(set)  # RE node ID -> pango-X lineages assigned to it
td = '<td style="padding: 0.5px 10px">'
th = '<th style="padding: 0.5px 10px; font-style: italic">'
html = '<table style="font-size: 8pt"><tr><td colspan="2" style="font-size: smaller">Bold if the dominant pango</td></tr>'
html += f'<tr>{th}RE node</th>{th}PangoX</th>{th}# descendants</th>{th}Most common</th></tr>'
# Sort on name length first so that two-letter names (XA..XZ) precede three-letter ones (XAA..)
for pango in sorted(pango_counts, key=lambda x: (len(x), x)):
    counts = pango_counts[pango]
    # A None counter marks a lineage flagged as missing; treat it like one with no samples
    # rather than failing on None below.
    if counts is None or not pango_lineage_samples.get(pango):
        # Single style attribute: browsers ignore a second `style` on the same element
        html += f'<tr>{td}</td>{td}<i>{pango}</i></td><td style="padding: 1px; text-align: center" colspan="2">not in dataset</td></tr>'
        continue
    tot = sum(counts.values())  # same as Counter.total(), without requiring Python 3.10
    p = counts[pango]
    # The lineage is assigned to this RE node only if it makes up more than 0.1% of the
    # samples below the node (p > 0 also guards the division when tot is 0)
    is_recomb = (p > 0 and p / tot > 0.001)
    re_nd = ""
    pg = f'<s>{pango}</s>'  # struck through unless assigned to the RE node
    if is_recomb:
        re_node = MRC_RE[pango][0]
        pango_x_nodes[re_node].add(pango)
        most_common_X = max((x for x in counts if x.startswith("X")), key=counts.get)
        re_nd = str(re_node)
        pg = f'{pango}'
        if most_common_X == pango:
            # Bold when this lineage is the dominant pango-X below the RE node
            tot_pango_x_re.append(re_node)
            re_nd = f'<b>{re_nd}</b>'
            pg = f'<b>{pango}</b>'
    html += f'<tr>{td}{re_nd}</td>{td}{pg}</td>{td}{tot} of which {p} {pango}</td>{td}{counts.most_common(3)}</td></tr>'
html += "</table>"
display(HTML(html))
print(len(pango_x_nodes),
      "total pango X recombinant origins of which",
      len(tot_pango_x_re),
      "include all descendants of the dominant group (exceptions: XM and XBB)")
print("Exceptions = RE nodes:", set(pango_x_nodes.keys()) - set(tot_pango_x_re))
print("RE node for Pangos", pango_x_nodes)

# Setup

In [None]:
import json
from tqdm.auto import tqdm
from pathlib import Path
import msprime

def issue(issue_number):
    """Return an HTML snippet linking to an issue in the sc2ts-paper GitHub repository.

    ``issue_number`` is interpolated both into the link target and the link text.
    """
    url = f"https://github.com/jeromekelleher/sc2ts-paper/issues/{issue_number}"
    link = f'<a href="{url}">issue #{issue_number}</a>'
    return "See GitHub sc2ts-paper " + link


In [None]:
arg = nb_utils.D3ARG_viz(ts, df, lineage_defining_muts, pangolin_field=Pango)

In [None]:
# Apply the standard sc2ts node labels, then override the labels of three specific nodes
# (selected by node ID) with starred names, and finally apply the standard node styles.
# NOTE(review): presumably these IDs are key ancestral nodes of the named lineages — confirm.
arg.set_sc2ts_node_labels()
arg.d3arg.nodes.loc[arg.d3arg.nodes.id == 200039, 'label'] = "*DELTA*"
arg.d3arg.nodes.loc[arg.d3arg.nodes.id == 822854, 'label'] = "*BA.2*"
arg.d3arg.nodes.loc[arg.d3arg.nodes.id == 1189192, 'label'] = "*BA.5*"
arg.set_sc2ts_node_styles()

# Pango-X Subgraphs

In [None]:
# Scale all the viz versions for print, so that a standard 750 x 1000 subgraph fits onto one side of A4
display(HTML("<style>@media print {.d3arg {zoom: 0.8}}</style>"));
def txt(html, right="15em", top="15em", width="300px"):
    """Wrap ``html`` in an absolutely positioned ``<div>``.

    ``right``, ``top`` and ``width`` are inserted verbatim as CSS values into the
    div's inline style.
    """
    style = f"position: absolute; z-index:1;  right: {right}; top: {top}; width: {width}"
    return f'<div style="{style}">{html}</div>'

In [None]:
arg.plot_pango_subgraph("XA", width=750, height=1000, parent_pangos=("B.1.1.7", "B.1.177.18"))

In [None]:
# XB has too many samples so we collapse some
# Start from every sample below node 223239 in the first tree ...
exclude = np.array(list(ts.first().samples(223239)))
# ... but drop node 223230 from the exclusion list, so it stays in the plot
exclude = exclude[exclude != 223230]

arg.plot_pango_subgraph("XB", exclude=exclude, parent_pangos=["B.1.243"])

In [None]:
arg.plot_pango_subgraph("XC", parent_pangos=["AY.29", "B.1.1.7"])

In [None]:
html = '''<p>Some weirdness going on with deletions just on the RHS of the breakpoint (see copying table below).
    Could these be misaligned?</p>
    <p>The 2 recombination nodes to the bottom right may be spurious.
    Possible alignment problems with the deletion here?</p>''' +  issue(337)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        ["XE", "XH"],
        txt(html),
        include=[1212052, 1177107],
        restrict_to_first=20,
        parent_pangos=["BA.1.17.2", "BA.2"],
        child_levels=0,
        parent_levels=5,
        height=700,  # make room for the copying pattern
        y_axis_scale="rank",
    )
HTML(
    "<style>table.copying-table {font-size: 8px; @media print {zoom: 0.6}} table.copying-table .pattern td {font-size: 0.5em; width:0.3em}</style>" +
    sc2ts.info.CopyingTable(ts, 965353).html(show_bases=None)
)

In [None]:
arg.plot_pango_subgraph("XF", txt("<p>Looks clean</p>"), y_axis_scale="rank", height=800, parent_pangos=["AY.4", "BA.1"])

In [None]:
html = (
    "<p>Some dodgy reverted deletions on the LHS branch. We probably got the breakpoint wrong, and it should be to the LHS of 6513</p>"
)

arg.plot_pango_subgraph("XG", txt(html, "25em"), parent_pangos=["BA.1.17", "BA.2.9"], y_axis_scale="rank")

In [None]:
arg.plot_pango_subgraph("XJ",include=[1090786], parent_pangos=["BA.1.17.2", "BA.2"])

In [None]:
arg.plot_pango_subgraph(["XL"], parent_pangos=["BA.1.17.2", "BA.2"])

In [None]:
html = "Multiple origins of XM, but one dominant one (node 1003220)"

pangos = ["XM", "XAL"]
colours = ['#332288', '#88CCEE', '#44AA99', '#117733', '#999933', '#DDCC77']  # from https://personal.sron.nl/~pault/
arg.plot_pango_subgraph(
    pangos, txt(html),
    parent_pangos=["BA.1.1", "BA.2"],
    child_levels=0,
    highlight_nodes={c: pango_lineage_samples[pX] for c, pX in zip(colours, pangos)},
    y_axis_scale="rank",
    height=700,
)

HTML(
    "<style>table.copying-table {font-size: 8px; @media print {zoom: 0.6}} table.copying-table .pattern td {font-size: 0.5em; width:0.3em}</style>" +
    sc2ts.info.CopyingTable(ts, 1003220).html(child_label="1003219", show_bases=None)
)

In [None]:
html = "No recombination node." + issue(358)

arg.plot_pango_subgraph("XN",
    txt(html, right="25em"),
    parent_levels=6,
    parent_pangos=["BA.2.23"],
    y_axis_scale="rank", oldest_y_label="2021-10"
    
)

In [None]:
html = "Apparently some missing deletion, but I can't see it. " + issue(345)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XP",
        txt(html),
        parent_levels=12,
        parent_pangos=["BA.1.1"],
        y_axis_scale="rank",
    )

In [None]:
# Plot several pango-X lineages together in one subgraph, highlighting each in its own colour.
# NOTE(review): this rebinds the notebook-global `pangoX` (earlier the list of all main
# pango-X lineages), so re-running the earlier analysis cells after this one will only
# consider these six lineages.
pangoX = ["XQ", "XR", "XU", "XAA", "XAG", "XAM"]
html = (
    "Displayed are a maximum of 10 samples from each group to display, plus some extra BA.2 samples that appear nested. " +
    issue(338)
)
# Extra node IDs passed via `include` in addition to the lineage samples
extras1 = [1249828, 1228294, 1219946, 1197469, 1182958, 1182957, 1161394, 1146404, 1146405]
extras2 = [2521553, 1152676, 1150120, 2513694, 2477211, 2466117, 2448160, 2449100]
extras3 = [1126313, 2534274, 2534275, 1141965, 2508149, 1105611, 1142202, 1111753]

colours = ['#332288', '#88CCEE', '#44AA99', '#117733', '#999933', '#DDCC77']  # from https://personal.sron.nl/~pault/


arg.plot_pango_subgraph(
    pangoX,
    txt(html, right="20em"),
    include=extras1 + extras2 + extras3 + [1200258, 1158324],
    restrict_to_first=10,
    parent_levels=7,
    child_levels=0,
    parent_pangos=["BA.1.1.15", "BA.2.9"],
    # one colour per lineage: colour -> list of that lineage's sample node IDs
    highlight_nodes={c: pango_lineage_samples[pX] for c, pX in zip(colours, pangoX)},
    y_axis_scale="rank",
)

In [None]:
html = (
    "Something weird here, as there are two recombination nodes only separated by a deletion. " +
    "Apparently some XS samples have the deletion, and some don't? " +
    issue(287)
)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XS",
        txt(html, top="9em"),
        parent_levels=6,
        parent_pangos=["AY.103", "BA.1.1"],
        y_axis_scale="rank",
        oldest_y_label="2021-06",
        height=600,
    )

In [None]:
arg.plot_pango_subgraph(
    "XW",
    parent_levels=6,
    parent_pangos=["BA.1.1.15", "BA.2"],
    oldest_y_label="2021-10",
    y_axis_scale="rank",
)

In [None]:
arg.plot_pango_subgraph("XY",
    parent_levels=6,
    parent_pangos=["BA.1.1", "BA.2"],
    oldest_y_label="2021-10",
    y_axis_scale="rank",
)

In [None]:
colours = ['#332288', '#88CCEE', '#44AA99', '#999933', '#DDCC77']  # from https://personal.sron.nl/~pault/
pangoX = ["XZ", "XAC", "XAD", "XAE", "XAP"]
html = issue(339)

cmap = {c: pango_lineage_samples[pX] for c, pX in zip(colours, pangoX)}
extra_BA_2 = [964554, 2340545, 2372712, 1056883] #, 1192387, 1112147, 1145629]
cmap.update({'lightgrey': extra_BA_2})

arg.plot_pango_subgraph(
    pangoX,
    txt(html),
    include=extra_BA_2,
    parent_levels=9,
    highlight_nodes=cmap,
    height=700,
)

HTML(
    "<style>table.copying-table {font-size: 8px; @media print {zoom: 0.6}} table.copying-table .pattern td {font-size: 0.5em; width:0.3em}</style>" +
    sc2ts.info.CopyingTable(ts, 964555).html(show_bases=None)
)

In [None]:
html = issue(360)
arg.plot_pango_subgraph(
    "XAF",
    txt(html),
    include=[1177107],
    parent_levels=10,
    child_levels=10,
    parent_pangos=["BA.2", "BA.1"],
    height=700,
    y_axis_scale="rank",
)

HTML(
    "<style>table.copying-table {font-size: 8px; @media print {zoom: 0.6}} table.copying-table .pattern td {font-size: 0.5em; width:0.3em}</style>" +
    sc2ts.info.CopyingTable(ts, 1177107).html(show_bases=None)
)

In [None]:
html = issue(352)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XAJ",
        txt(html),
        parent_levels=16,
        parent_pangos=["BA.2.12"],
        y_axis_scale="rank",
    )

In [None]:
# Plot XAN and XAV together, highlighting the samples of each lineage in its own colour.
pangoX = ["XAN", "XAV"]
# Pair the colours with the lineages plotted in THIS cell. The previous version zipped
# against `pangos`, a leftover from the XM/XAL cell, so the wrong samples were highlighted.
cmap = {c: pango_lineage_samples[pX] for c, pX in zip(colours, pangoX)}

arg.plot_pango_subgraph(
    pangoX,
    txt(issue(353)),
    parent_levels=10,
    parent_pangos=["BA.5.1", "BA.5.1.24"],
    highlight_nodes=cmap,
    oldest_y_label="2021-10",
)

In [None]:
# Plot the XAS subgraph together with an explanatory note linking to the tracking issue.
html = (
    "Complex: multiple origins, but main clade does not seem to be a recombinant. "
    "2 single-sample clades are, however. " + issue(340))

with warnings.catch_warnings():
    # Silence any warnings raised while plotting, as the other plotting cells do
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XAS",
        # The note was built but never passed to the plot; pass it as the sibling cells do
        txt(html),
        parent_levels=4,
        parent_pangos=["BA.4"],
        oldest_y_label="2021-11",
        y_axis_scale="rank",
    )

In [None]:
pangoX = ["XAU", "XN"]
cmap = {c: pango_lineage_samples[pX] for c, pX in zip(colours, ["XAU", "XN"])}

arg.plot_pango_subgraph(
    pangoX,
    txt(issue(348)),
    include=[1137492, 887654],
    parent_levels=20, child_levels=0,
    parent_pangos=["BA.2"],
    highlight_nodes=cmap,
    y_axis_scale="rank",
)

In [None]:
arg.plot_pango_subgraph(
    "XAV",
    txt(issue(354)),
    parent_levels=10, child_levels=1,
    parent_pangos=["BA.2"],
    oldest_y_label="2021-11",
)

In [None]:
html = (
    "There are a lot of XAZ samples, but they all form a clade, so just pick the first 20 for viz. " +
    issue(356)
)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XAZ",
        txt(html),
        restrict_to_first=20,
        parent_levels=10, child_levels=0,
        parent_pangos=["BA.2"],
        oldest_y_label="2021-10",
    )

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        "XBB",
        restrict_to_first=5,
        include=[1408964, 1396838, 1404568, 1423196, 1398292, 2681617, 1409763],
        parent_levels=8,
        parent_pangos=["BA.2.10", "BM.1.1.1"],
        oldest_y_label="2021-11",
        y_axis_scale="rank",
    )

In [None]:
arg.plot_pango_subgraph(
    "XBE",
    txt(issue(351)),
    include = [2661358],
    parent_levels=15,
    parent_pangos=["BA.5.2"],
    oldest_y_label="2021-10",
)

In [None]:
colours = ['#332288', '#88CCEE', '#44AA99', '#999933', '#DDCC77']  # from https://personal.sron.nl/~pault/
pangoX = ["XBQ", "XBK", "XBK.1", "CJ.1.3"]
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    arg.plot_pango_subgraph(
        pangoX,
        txt(issue(349)),
        include = [1363939, 1342796],
        parent_levels=10,
        parent_pangos=["BM.1.1.1", "BM.1.1", "BA.2", "BA.2.75", "BA.2.75.3"],
        highlight_nodes={c: pango_lineage_samples[pX] for c, pX in zip(colours, pangoX)},
        oldest_y_label="2021-10",
    )

## Extra notes below

In [None]:
# NOTE(review): the per-recombinant summary table below is commented out, so the breakpoint
# interval and parent pangos that the summary line used to read from `row` are unavailable.
# Those fields are left out of the output until the table is restored.
#df = ti.recombinants_summary()
#df.set_index("recombinant", inplace=True)


# Here we document some investigations into Pango X assignments that appear not to have re nodes
reasons_for_no_re = dict(
    XP="Dependent on a deletion which sc2ts does not use",
    XN="Definitely not a recombinant according to sc2ts. See https://github.com/jeromekelleher/sc2ts-paper/issues/285",
    XAU="Definitely not a recombinant according to sc2ts. See https://github.com/jeromekelleher/sc2ts-paper/issues/285",
    XAZ="Not a recombinant. Nearest RE node is #1189192 which is a BA.5 with 112911 descendants",
    XAJ="Probably not a recombinant. See https://github.com/jeromekelleher/sc2ts-paper/issues/285",
    XAS="Complex: multiple origins, but main clade does not seem to be a recombinant. 2 single-sample clades are, however",
    XAV="Possibly not a recombinant: see https://github.com/jeromekelleher/sc2ts-paper/issues/285",
    XBQ="Probably not a recombinant. See https://github.com/jeromekelleher/sc2ts-paper/issues/285",
    XBK="Probably not a recombinant. See https://github.com/jeromekelleher/sc2ts-paper/issues/285",
)

# Invert pango_x_nodes (RE node -> set of lineages) into lineage -> RE node. This lookup
# was referenced but never defined, which raised NameError (not the KeyError being caught).
pango_x_to_node = {
    lineage: node for node, lineages in pango_x_nodes.items() for lineage in lineages
}

display(HTML("<h3>SUMMARY</h3>"))
display(HTML("<h4>PangoX with a valid RE node</h4>"))
display(HTML("<dl>"))

for pango in sorted(pango_counts, key=lambda x: (len(x), x)):
    if pango_counts[pango] is None:
        display(HTML(f"&nbsp;&nbsp;&nbsp;{pango}: No samples in Viridian which passed QC"))
        continue
    if pango not in pango_x_to_node:
        # No RE node assigned: show the documented reason, if there is one
        display(HTML(f"&nbsp;&nbsp;&nbsp;{pango}: {reasons_for_no_re.get(pango, 'No clear associated re node')}"))
        continue
    node = pango_x_to_node[pango]
    # Sets are unordered and have no .index(); sort to give each lineage sharing this RE
    # node a stable position
    others = sorted(pango_x_nodes[node], key=lambda x: (len(x), x))
    position = f" ({others.index(pango) + 1}/{len(others)})" if len(others) > 1 else ""
    display(HTML(f"* {pango}: re_node {node}{position}"))
        
