# Details of Pango recombinants

This notebook performs the analysis required for the Results subsection on recombinant Pango lineages in the Long ARG.

In [28]:
import sys
import collections

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tskit
import tszip

sys.path.append("../../sc2ts/")
import sc2ts.utils


In [5]:
%%time
# Decompress the tszip-compressed tree sequence; the bare `ts` on the last
# line makes the notebook render its summary.
ts = tszip.decompress("../data/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo-gisaid-il.ts.tsz")
ts

CPU times: user 588 ms, sys: 262 ms, total: 851 ms
Wall time: 587 ms


Tree Sequence,Unnamed: 1
Trees,958
Sequence Length,29904.0
Time Units,days
Sample Nodes,657239
Total Size,494.0 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,785539,24.0 MiB,
Individuals,0,24 Bytes,
Migrations,0,8 Bytes,
Mutations,1062072,40.2 MiB,✅
Nodes,783231,422.5 MiB,✅
Populations,0,8 Bytes,
Provenances,2,1.8 KiB,
Sites,29422,1.3 MiB,✅


In [6]:
# Build the TreeInfo helper for the loaded tree sequence.
ti = sc2ts.utils.TreeInfo(ts)


def report(*args, **kwargs):
    """
    Display the result of ``ti.node_report`` for the given arguments.

    All positional and keyword arguments are forwarded unchanged to
    ``ti.node_report``; every item it returns is passed to ``display``.
    """
    report_items = ti.node_report(*args, **kwargs)
    display(*report_items)


ti

Counting descendants : 100%|██████████████████████| 783231/783231 [00:00<00:00, 2393813.25it/s]
Indexing metadata    : 100%|████████████████████████| 783231/783231 [00:13<00:00, 60151.69it/s]
Classifying mutations: 100%|█████████████████████| 1062072/1062072 [00:10<00:00, 101537.12it/s]


Unnamed: 0,property,value
0,latest_sample,2022-06-30
1,max_submission_delay,29 days
2,samples,657239
3,nodes,783231
4,mc_nodes,37749
5,pr_nodes,34358
6,re_nodes,2078
7,recombinants,2078
8,mutations,1062072
9,recurrent,50099


In [12]:
# Number of samples per recombinant ("X"-prefixed) Pango lineage, in the order
# the lineages appear in ti.pango_lineage_samples.
x_lineage_sample_count = {
    lineage: len(samples)
    for lineage, samples in ti.pango_lineage_samples.items()
    if lineage.startswith("X")
}
x_lineage_names = list(x_lineage_sample_count)
x_lineage_sample_count

{'XA': 5,
 'XB': 58,
 'XC': 5,
 'XD': 4,
 'XH': 11,
 'XF': 2,
 'XS': 6,
 'XZ': 92,
 'XG': 32,
 'XAF': 36,
 'XN': 69,
 'XE': 170,
 'XM': 48,
 'XV': 3,
 'XK': 6,
 'XAB': 50,
 'XT': 1,
 'XL': 10,
 'XR': 8,
 'XP': 2,
 'XQ': 14,
 'XAD': 6,
 'XAH': 14,
 'XW': 11,
 'XAA': 8,
 'XY': 6,
 'XU': 3,
 'XAC': 27,
 'XJ': 3,
 'XAE': 9,
 'XAG': 17,
 'XAJ': 12,
 'XAK': 1}

In [13]:
# Collect every sample assigned to an X lineage (lineage by lineage) and build
# the per-sample recombinant report for them.
all_x_samples = [
    sample
    for lineage in x_lineage_names
    for sample in ti.pango_lineage_samples[lineage]
]
df = ti.recombinant_samples_report(all_x_samples)
df

Unnamed: 0,recombinant,direct,path_length,node,strain,pango,parents,children,descendants,date,delay,qc,mutations,reversions,immediate_reversions,child_mutations,child_reversions
0,172374,True,1,172373,Wales/ALDP-125C4D7/2021,XA,1,0,1,2021-02-06,12 days,0000,0,0,0,0,0
1,172374,False,1,182624,Wales/LIVE-DFCFFE/2021,XA,1,0,1,2021-02-14,15 days,1000,0,0,0,0,0
2,172374,False,1,221655,England/ALDP-142CC21/2021,XA,1,2,3,2021-03-17,14 days,0000,1,0,0,3,0
3,172374,False,2,227648,England/ALDP-1458CD1/2021,XA,1,0,1,2021-03-22,11 days,0000,2,0,0,0,0
4,172374,False,2,228656,Wales/PHWC-PYBIFF/2021,XA,1,0,1,2021-03-23,13 days,0100,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,731513,False,2,772350,Germany/BY-RKI-I-911253/2022,XAJ,1,0,1,2022-06-21,28 days,0000,5,3,0,0,0
745,774725,True,1,774724,Mexico/BCN_LANGEBIO_IMSS_9303/2022,XAJ,1,0,1,2022-06-23,20 days,0100,3,1,0,0,0
746,555388,False,20,782398,England/LSPA-3EAC5B5/2022,XAJ,1,0,1,2022-06-30,7 days,0000,0,0,0,0,0
747,555388,False,20,782399,England/LSPA-3EAC56A/2022,XAJ,1,0,1,2022-06-30,7 days,1000,0,0,0,0,0


# General recombinant information

In [44]:
# Index the records yielded by ti.combine_recombinant_info() by their `node` attribute.
recombinants = {rec.node: rec for rec in ti.combine_recombinant_info()}


In [45]:
# Number of distinct `node` keys in the recombinants index.
len(recombinants)

2078

# Single origin pango x lineages

These are the X lineages in which all of the samples trace back to the same recombination node.



In [22]:
# Map each X lineage to the set of recombinant nodes reported for its samples.
lineage_re_nodes = collections.defaultdict(set)
for pango, re_node in zip(df["pango"], df["recombinant"]):
    lineage_re_nodes[pango].add(re_node)
lineage_re_nodes

defaultdict(set,
            {'XA': {172374},
             'XB': {206466, 285181, 394059},
             'XC': {251176, 292030, 322219, 363172},
             'XD': {573905, 638775},
             'XH': {582054},
             'XF': {588841},
             'XS': {589269, 636566, 676648},
             'XZ': {555388,
              623997,
              628656,
              636903,
              641595,
              648753,
              651744,
              676531,
              693927,
              697157,
              697596,
              704521,
              705521,
              716084,
              746741,
              756841},
             'XG': {609881},
             'XAF': {582054, 611036, 677584},
             'XN': {555388,
              615119,
              668317,
              674332,
              677977,
              700575,
              703379,
              731665},
             'XE': {555388,
              582054,
              621567,
              629086,
     

In [85]:
# Union of the recombinant nodes across all X lineages.
all_re_nodes = set().union(*lineage_re_nodes.values())
len(all_re_nodes)

89

Remove XP as it's a known outlier:


In [24]:
# Drop the XP outlier. pop() with a default is used instead of `del` so the
# cell can be re-run safely: `del` raises KeyError once "XP" is already gone.
# The trailing semicolon stops the notebook from echoing the removed value.
lineage_re_nodes.pop("XP", None);

In [63]:
# Keep only the X lineages associated with exactly one recombinant node, and
# map each of them to that node.
single_origin_lineages = {
    lineage: next(iter(re_nodes))
    for lineage, re_nodes in lineage_re_nodes.items()
    if len(re_nodes) == 1
}
single_origin_lineages

{'XA': 172374,
 'XH': 582054,
 'XF': 588841,
 'XG': 609881,
 'XT': 637876,
 'XL': 638141,
 'XR': 639472,
 'XW': 663539,
 'XAA': 635896,
 'XY': 677584,
 'XAC': 682411,
 'XAE': 701181,
 'XAG': 635896,
 'XAK': 555388}

We remove XAK because it's an outlier:
- The causal sample is BA.2.9 (all others agree with the lineage in question, or are close by).
- It's inferred to have 127227 descendants, probably tracing back to a spurious recombination.
- There's only a single sample.

In [64]:
# Drop the XAK outlier. pop() with a default is used instead of `del` so the
# cell can be re-run safely: `del` raises KeyError once "XAK" is already gone.
# The trailing semicolon stops the notebook from echoing the removed value.
single_origin_lineages.pop("XAK", None);

In [58]:
# Parent lineages listed for each recombinant ("X") Pango lineage, keyed by lineage name.
# Taken from https://github.com/cov-lineages/pango-designation/blob/master/pango_designation/alias_key.json
# NOTE(review): a trailing "*" presumably means "this lineage or any of its
# descendants", and entries with more than two items presumably give the parent
# of each successive genome segment — confirm against the pango-designation docs.
# The table is broader than this dataset: lineages from XAL onwards do not
# appear in x_lineage_sample_count.
pango_x_aliases = {
    "XA": ["B.1.1.7","B.1.177"],
    "XB": ["B.1.634","B.1.631"],
    "XC": ["AY.29","B.1.1.7"],
    "XD": ["B.1.617.2*","BA.1*"],
    "XE": ["BA.1*","BA.2*"],
    "XF": ["B.1.617.2*","BA.1*"],
    "XG": ["BA.1*","BA.2*"],
    "XH": ["BA.1*","BA.2*"],
    "XJ": ["BA.1*","BA.2*"],
    "XK": ["BA.1*","BA.2*"],
    "XL": ["BA.1*","BA.2*"],
    "XM": ["BA.1.1*","BA.2*"],
    "XN": ["BA.1*","BA.2*"],
    "XP": ["BA.1.1*","BA.2*"],
    "XQ": ["BA.1.1*","BA.2*"],
    "XR": ["BA.1.1*","BA.2*"],
    "XS": ["B.1.617.2*","BA.1.1*"],
    "XT": ["BA.2*","BA.1*"],
    "XU": ["BA.1*","BA.2*"],
    "XV": ["BA.1*","BA.2*"],
    "XW": ["BA.1*","BA.2*"],
    "XY": ["BA.1*","BA.2*"],
    "XZ": ["BA.2*","BA.1*"],
    "XAA": ["BA.1*","BA.2*"],
    "XAB": ["BA.1*","BA.2*"],
    "XAC": ["BA.2*","BA.1*","BA.2*"],
    "XAD": ["BA.2*","BA.1*"],
    "XAE": ["BA.2*","BA.1*"],
    "XAF": ["BA.1*","BA.2*"],
    "XAG": ["BA.1*","BA.2*"],
    "XAH": ["BA.2*","BA.1*"],
    "XAJ": ["BA.2.12.1*","BA.4*"],
    "XAK": ["BA.2*","BA.1*","BA.2*"],
    "XAL": ["BA.1*","BA.2*"],
    "XAM": ["BA.1.1","BA.2.9"],
    "XAN": ["BA.2*","BA.5.1"],
    "XAP": ["BA.2*","BA.1*"],
    "XAQ": ["BA.1*","BA.2*"],
    "XAR": ["BA.1*","BA.2*"],
    "XAS": ["BA.5*","BA.2*"],
    "XAT": ["BA.2.3.13","BA.1*"],
    "XAU": ["BA.1.1*","BA.2.9*"],
    "XAV": ["BA.2*","BA.5*"],
    "XAW": ["BA.2*","AY.122"],
    "XAY": ["BA.2*","AY.45","BA.2*","AY.45","BA.2*"],
    "XAZ": ["BA.2.5","BA.5","BA.2.5"],
    "XBA": ["BA.2*","AY.45","BA.2*","AY.45","BA.2*"],
    "XBB": ["BJ.1","BM.1.1.1"],
    "XBC": ["BA.2*","B.1.617.2*","BA.2*","B.1.617.2*"],
    "XBD": ["BA.2.75.2","BF.5"],
    "XBE": ["BA.5.2","BE.4.1"],
    "XBF": ["BA.5.2.3","CJ.1"],
    "XBG": ["BA.2.76","BA.5.2"],
    "XBH": ["BA.2.3.17","BA.2.75.2"],
    "XBJ": ["BA.2.3.20","BA.5.2"],
    "XBK": ["BA.5.2","CJ.1"],
    "XBL": ["XBB.1","BA.2.75","XBB.1"],
    "XBM": ["BA.2.76","BF.3"],
    "XBN": ["BA.2.75","XBB.3"],
    "XBP": ["BA.2.75*","BQ.1*"],
    "XBQ": ["BA.5.2","CJ.1"],
    "XBR": ["BA.2.75","BQ.1"],
    "XBS": ["BA.2.75","BQ.1"],
    "XBT": ["BA.5.2.34","BA.2.75","BA.5.2.34"],
    "XBU": ["BA.2.75.3","BQ.1","BA.2.75.3"],
    "XBV": ["CR.1","XBB.1"],
    "XBW": ["XBB.1.5","BQ.1.14"],
    "XBY": ["BR.2.1","XBF"],
    "XBZ": ["BA.5.2*","EF.1.3"],
    "XCA": ["BA.2.75*","BQ.1*"]
}

In [67]:
def get_sample_composition(u):
    """
    Count the Pango lineages of the samples descending from node ``u``.

    The samples below ``u`` are gathered from every tree of the global tree
    sequence ``ts``, so each sample is counted once however many trees it
    appears under ``u`` in. Returns a ``collections.Counter`` keyed by each
    sample's ``"Nextclade_pango"`` metadata value.
    """
    descendant_samples = set()
    for tree in ts.trees():
        descendant_samples.update(tree.samples(u))

    lineage_counts = collections.Counter()
    for sample in descendant_samples:
        lineage_counts[ts.node(sample).metadata["Nextclade_pango"]] += 1
    return lineage_counts

# Example: lineage composition of the samples below node 172374, the recombinant node recorded for XA.
get_sample_composition(172374)

Counter({'XA': 5})

In [99]:
# One row per single-origin X lineage: the parent lineages imputed in the ARG,
# the parents listed in pango_x_aliases, the number of samples of that lineage,
# and the five most common lineages among the samples below its recombinant node.
data = []
for lineage, re_node in single_origin_lineages.items():
    rec = recombinants[re_node]
    lineage_counts = get_sample_composition(re_node)
    data.append(
        {
            "lineage": lineage,
            "parent_lineages": rec.arg_info.parent_imputed_lineages,
            "pango_parents": pango_x_aliases[lineage],
            "n": x_lineage_sample_count[lineage],
            "descendant_lineages": dict(lineage_counts.most_common(5)),
        }
    )
df_summary = pd.DataFrame(data).sort_values("lineage").set_index("lineage")
df_summary

Unnamed: 0_level_0,parent_lineages,pango_parents,n,descendant_lineages
lineage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XA,"[B.1.177.18, B.1.1.7]","[B.1.1.7, B.1.177]",5,{'XA': 5}
XAA,"[BA.1, BA.2.9]","[BA.1*, BA.2*]",8,"{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'..."
XAC,"[BA.2.3, BA.1.17.2]","[BA.2*, BA.1*, BA.2*]",27,{'XAC': 27}
XAE,"[BA.2, BA.1]","[BA.2*, BA.1*]",9,{'XAE': 9}
XAG,"[BA.1, BA.2.9]","[BA.1*, BA.2*]",17,"{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'..."
XF,"[AY.4, BA.1]","[B.1.617.2*, BA.1*]",2,{'XF': 2}
XG,"[BA.1.17, BA.2]","[BA.1*, BA.2*]",32,"{'XG': 32, 'XAB': 1}"
XH,"[BA.1.20, BA.2.9]","[BA.1*, BA.2*]",11,"{'XAF': 34, 'XH': 11, 'B.1.1.529': 3, 'XE': 3,..."
XL,"[BA.1.17.2, BA.2]","[BA.1*, BA.2*]",10,"{'XL': 10, 'XAB': 1, 'XU': 1, 'XQ': 1}"
XR,"[BA.1.1, BA.2]","[BA.1.1*, BA.2*]",8,"{'XQ': 9, 'XR': 8, 'XAB': 2}"


In [100]:
# Print the summary table as LaTeX source so it can be copied verbatim.
print(df_summary.to_latex())

\begin{tabular}{lllrl}
\toprule
{} &         parent\_lineages &          pango\_parents &   n &                                descendant\_lineages \\
lineage &                         &                        &     &                                                    \\
\midrule
XA      &   [B.1.177.18, B.1.1.7] &     [B.1.1.7, B.1.177] &   5 &                                          \{'XA': 5\} \\
XAA     &          [BA.1, BA.2.9] &         [BA.1*, BA.2*] &   8 &  \{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'... \\
XAC     &     [BA.2.3, BA.1.17.2] &  [BA.2*, BA.1*, BA.2*] &  27 &                                        \{'XAC': 27\} \\
XAE     &            [BA.2, BA.1] &         [BA.2*, BA.1*] &   9 &                                         \{'XAE': 9\} \\
XAG     &          [BA.1, BA.2.9] &         [BA.1*, BA.2*] &  17 &  \{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'... \\
XF      &            [AY.4, BA.1] &    [B.1.617.2*, BA.1*] &   2 &                                        

In [56]:
# Show the arg_info of one recombinant record as an example of its fields.
# The original cell read `recombs[0]`, but `recombs` is never defined in this
# notebook and the cell fails on a fresh kernel. The first entry of
# `recombinants` is used instead; it is presumably the same record, since the
# dict was built in iteration order — confirm.
next(iter(recombinants.values())).arg_info

ArgRecombinant(breakpoints=[0, 379, 29904], parents=[1748, 3829], parent_imputed_lineages=['B.1.371', 'B.1.320'], mrcas=[1484])

In [26]:
# Number of single-origin X lineages.
# NOTE(review): the saved output (14) appears to predate the removal of XAK
# earlier in the notebook; a top-to-bottom run should give one fewer — confirm.
len(single_origin_lineages)

14

In [32]:
# Number of distinct recombinant nodes among the single-origin lineages. It is
# smaller than the lineage count when lineages share a node (XAA and XAG both
# map to 635896).
len(set(single_origin_lineages.values()))

13

In [36]:
# Sample count for each single-origin X lineage, sorted by lineage name.
# NOTE(review): the saved output still lists XAK, so it appears to predate the
# removal of XAK from single_origin_lineages — confirm on a fresh run.
data = [
    {"pango": lineage, "num_samples": x_lineage_sample_count[lineage]}
    for lineage in single_origin_lineages
]
df_sl_summary = pd.DataFrame(data).sort_values("pango")
df_sl_summary

Unnamed: 0,pango,num_samples
0,XA,5
8,XAA,8
10,XAC,27
11,XAE,9
12,XAG,17
13,XAK,1
2,XF,2
3,XG,32
1,XH,11
5,XL,10


In [37]:
# Print the per-lineage sample-count table as LaTeX source so it can be copied verbatim.
print(df_sl_summary.to_latex())

\begin{tabular}{llr}
\toprule
{} & pango &  num\_samples \\
\midrule
0  &    XA &            5 \\
8  &   XAA &            8 \\
10 &   XAC &           27 \\
11 &   XAE &            9 \\
12 &   XAG &           17 \\
13 &   XAK &            1 \\
2  &    XF &            2 \\
3  &    XG &           32 \\
1  &    XH &           11 \\
5  &    XL &           10 \\
6  &    XR &            8 \\
4  &    XT &            1 \\
7  &    XW &           11 \\
9  &    XY &            6 \\
\bottomrule
\end{tabular}

