# Details of Pango recombinants

This notebook performs the analysis required for the Results section on recombinant Pango lineages in the Long ARG.

In [1]:
import sys
import collections

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tskit
import tszip

sys.path.append("../../sc2ts/")
import sc2ts.utils


In [2]:
%%time
# Decompress the tree sequence that the rest of this notebook analyses; the
# bare `ts` on the last line displays its summary table.
ts = tszip.decompress("../data/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo-gisaid-il.ts.tsz")
ts

CPU times: user 496 ms, sys: 321 ms, total: 818 ms
Wall time: 533 ms


Tree Sequence,Unnamed: 1
Trees,958
Sequence Length,29904.0
Time Units,days
Sample Nodes,657239
Total Size,494.0 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,785539,24.0 MiB,
Individuals,0,24 Bytes,
Migrations,0,8 Bytes,
Mutations,1062072,40.2 MiB,✅
Nodes,783231,422.5 MiB,✅
Populations,0,8 Bytes,
Provenances,2,1.8 KiB,
Sites,29422,1.3 MiB,✅


In [3]:
# Wrap the tree sequence in the sc2ts helper used throughout this notebook.
ti = sc2ts.utils.TreeInfo(ts)


def report(*args, **kwargs):
    """
    Display everything returned by ``ti.node_report``.

    All positional and keyword arguments are forwarded unchanged to
    ``ti.node_report``; each item it returns is passed to ``display``.
    """
    panels = ti.node_report(*args, **kwargs)
    display(*panels)


ti

Counting descendants : 100%|███████████████████████████████████████████████| 783231/783231 [00:00<00:00, 2391728.84it/s]
Indexing metadata    : 100%|█████████████████████████████████████████████████| 783231/783231 [00:12<00:00, 61925.49it/s]
Classifying mutations: 100%|██████████████████████████████████████████████| 1062072/1062072 [00:10<00:00, 101233.16it/s]


Unnamed: 0,property,value
0,latest_sample,2022-06-30
1,max_submission_delay,29 days
2,samples,657239
3,nodes,783231
4,mc_nodes,37749
5,pr_nodes,34358
6,re_nodes,2078
7,recombinants,2078
8,mutations,1062072
9,recurrent,50099


In [4]:
# Look-up table from recombinant node ID to its combined recombinant record.
recombinants = {
    info.node: info
    for info in ti.combine_recombinant_info()
}

In [5]:
# Pango lineages whose name starts with "X" (the recombinant "X lineages"
# studied in this notebook), and the number of samples assigned to each.
x_lineage_names = [
    name for name in ti.pango_lineage_samples.keys() if name.startswith("X")
]
x_lineage_sample_count = {
    name: len(ti.pango_lineage_samples[name]) for name in x_lineage_names
}
x_lineage_sample_count

{'XA': 5,
 'XB': 58,
 'XC': 5,
 'XD': 4,
 'XH': 11,
 'XF': 2,
 'XS': 6,
 'XZ': 92,
 'XG': 32,
 'XAF': 36,
 'XN': 69,
 'XE': 170,
 'XM': 48,
 'XV': 3,
 'XK': 6,
 'XAB': 50,
 'XT': 1,
 'XL': 10,
 'XR': 8,
 'XP': 2,
 'XQ': 14,
 'XAD': 6,
 'XAH': 14,
 'XW': 11,
 'XAA': 8,
 'XY': 6,
 'XU': 3,
 'XAC': 27,
 'XJ': 3,
 'XAE': 9,
 'XAG': 17,
 'XAJ': 12,
 'XAK': 1}

In [6]:
# Pool the samples of every X lineage (in lineage order) and build the
# per-sample recombinant report for them.
all_x_samples = [
    sample
    for name in x_lineage_names
    for sample in ti.pango_lineage_samples[name]
]
df = ti.recombinant_samples_report(all_x_samples)
df

Unnamed: 0,recombinant,direct,path_length,node,strain,pango,parents,children,descendants,date,delay,qc,mutations,reversions,immediate_reversions,child_mutations,child_reversions
0,172374,True,1,172373,Wales/ALDP-125C4D7/2021,XA,1,0,1,2021-02-06,12 days,0000,0,0,0,0,0
1,172374,False,1,182624,Wales/LIVE-DFCFFE/2021,XA,1,0,1,2021-02-14,15 days,1000,0,0,0,0,0
2,172374,False,1,221655,England/ALDP-142CC21/2021,XA,1,2,3,2021-03-17,14 days,0000,1,0,0,3,0
3,172374,False,2,227648,England/ALDP-1458CD1/2021,XA,1,0,1,2021-03-22,11 days,0000,2,0,0,0,0
4,172374,False,2,228656,Wales/PHWC-PYBIFF/2021,XA,1,0,1,2021-03-23,13 days,0100,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,731513,False,2,772350,Germany/BY-RKI-I-911253/2022,XAJ,1,0,1,2022-06-21,28 days,0000,5,3,0,0,0
745,774725,True,1,774724,Mexico/BCN_LANGEBIO_IMSS_9303/2022,XAJ,1,0,1,2022-06-23,20 days,0100,3,1,0,0,0
746,555388,False,20,782398,England/LSPA-3EAC5B5/2022,XAJ,1,0,1,2022-06-30,7 days,0000,0,0,0,0,0
747,555388,False,20,782399,England/LSPA-3EAC56A/2022,XAJ,1,0,1,2022-06-30,7 days,1000,0,0,0,0,0


# Filter out singleton nodes

In [7]:
# Distinct values of the `recombinant` column of the X-lineage sample report.
recombination_nodes = df["recombinant"].unique()
recombination_nodes

array([172374, 206466, 285181, 394059, 251176, 292030, 322219, 363172,
       573905, 638775, 582054, 588841, 589269, 636566, 676648, 555388,
       623997, 628656, 636903, 641595, 648753, 651744, 676531, 693927,
       697157, 697596, 704521, 705521, 716084, 746741, 756841, 609881,
       611036, 677584, 615119, 668317, 674332, 677977, 700575, 703379,
       731665, 621567, 629086, 642711, 674229, 687046, 699692, 705501,
       729195, 749143, 756466, 625355, 640831, 642277, 711560, 733196,
       630214, 638733, 752475, 630235, 653674, 676545, 751632, 633497,
       635896, 638141, 663092, 679355, 639472, 699376, 733742, 738307,
       637876,     -1, 666230, 696838, 739545, 649001, 674321, 729963,
       663539, 682411, 701594, 741659, 701181, 722544, 745522, 749460,
       731513, 774725])

In [8]:
# Count how many of the unique `recombinant` values are -1.
# NOTE(review): -1 appears to be the value reported when a sample has no
# recombination node in its history — confirm against recombinant_samples_report.
sum(recombination_nodes == -1)

1

There is one sample that has no recombinants in its history (an XP sample; see elsewhere for discussion). It shows up as the `-1` entry in the array above; remove that entry from the list of recombination nodes.

In [9]:
# Keep only non-negative node IDs, which drops the -1 entry seen above.
recombination_nodes = recombination_nodes[recombination_nodes >= 0]
recombination_nodes

array([172374, 206466, 285181, 394059, 251176, 292030, 322219, 363172,
       573905, 638775, 582054, 588841, 589269, 636566, 676648, 555388,
       623997, 628656, 636903, 641595, 648753, 651744, 676531, 693927,
       697157, 697596, 704521, 705521, 716084, 746741, 756841, 609881,
       611036, 677584, 615119, 668317, 674332, 677977, 700575, 703379,
       731665, 621567, 629086, 642711, 674229, 687046, 699692, 705501,
       729195, 749143, 756466, 625355, 640831, 642277, 711560, 733196,
       630214, 638733, 752475, 630235, 653674, 676545, 751632, 633497,
       635896, 638141, 663092, 679355, 639472, 699376, 733742, 738307,
       637876, 666230, 696838, 739545, 649001, 674321, 729963, 663539,
       682411, 701594, 741659, 701181, 722544, 745522, 749460, 731513,
       774725])

In [10]:
# Descendant-sample count for each recombination node, in the same order as
# `recombination_nodes`.
# NOTE(review): presumably the maximum number of descendant samples of the node
# over all trees — confirm against sc2ts.utils.TreeInfo.
ti.nodes_max_descendant_samples[recombination_nodes]

array([     5,     68,      8,      1,      6,      4,      1,      1,
            3,      3,     51,      2,      1,      4,      1, 127227,
            1,     13,      2,     73,      1,      1,      2,      3,
            1,      1,      1,      1,      1,      1,      1,     33,
            1,      7,      1,      2,      2,      1,      1,      7,
            3,    155,      2,      1,      1,      3,      2,      1,
            1,      1,      1,      3,      2,     40,      1,      4,
            9,      1,      1,      1,      1,      1,      3,      1,
           67,     13,      2,      3,     19,     19,      1,      2,
            5,      1,      2,      1,      4,      1,      2,     13,
           27,      2,      1,      9,     13,      5,      2,      2,
            1], dtype=int32)

In [11]:
# Recombination nodes whose descendant-sample count is exactly 1.
singleton_recombination_nodes = recombination_nodes[ti.nodes_max_descendant_samples[recombination_nodes] == 1]
singleton_recombination_nodes

array([394059, 322219, 363172, 589269, 676648, 623997, 648753, 651744,
       697157, 697596, 704521, 705521, 716084, 746741, 756841, 611036,
       615119, 677977, 700575, 642711, 674229, 705501, 729195, 749143,
       756466, 711560, 638733, 752475, 630235, 653674, 676545, 633497,
       733742, 666230, 739545, 674321, 741659, 774725])

In [12]:
# Number of singleton recombination nodes.
singleton_recombination_nodes.shape

(38,)

In [13]:
# Rows of the sample report whose recombination node is a singleton.
df_singleton = df.loc[df["recombinant"].isin(singleton_recombination_nodes)]
df_singleton

Unnamed: 0,recombinant,direct,path_length,node,strain,pango,parents,children,descendants,date,delay,qc,mutations,reversions,immediate_reversions,child_mutations,child_reversions
59,394059,True,1,394058,USA/NY-PRL-2021_0809_00K06/2021,XB,1,0,1,2021-08-08,5 days,10,6,0,0,0,0
66,322219,True,1,322218,USA/NC-CDC-LC0070730/2021,XC,1,0,1,2021-06-07,11 days,0,9,3,0,0,0
67,363172,True,1,363171,Sweden/4088533864/2021,XC,1,0,1,2021-07-12,21 days,0,7,1,0,0,0
85,589269,True,1,589268,USA/AZ-ASPHL-0290/2022,XS,1,0,1,2022-01-13,19 days,1000,13,3,0,0,0
90,676648,True,1,676647,Peru/LIM-INS-16397/2022,XS,1,0,1,2022-03-31,21 days,0,9,0,0,0,0
92,623997,True,1,623996,England/MILK-36C3556/2022,XZ,1,0,1,2022-02-13,14 days,0,2,0,0,0,0
97,648753,True,1,648752,Australia/VIC45939/2022,XZ,1,0,1,2022-03-07,17 days,0,4,1,0,0,0
99,651744,True,1,651743,Australia/VIC46095/2022,XZ,1,0,1,2022-03-09,15 days,100,4,0,0,0,0
117,697157,True,1,697156,Scotland/QEUH-3D91BEF/2022,XZ,1,0,1,2022-04-18,8 days,0,3,0,0,0,0
118,697596,True,1,697595,Croatia/35852/2022,XZ,1,0,1,2022-04-18,16 days,1000,2,0,0,0,0


In [14]:
# Number of singleton-node samples per Pango lineage, most frequent first.
collections.Counter(df_singleton["pango"]).most_common(100)

[('XZ', 10),
 ('XE', 6),
 ('XN', 3),
 ('XK', 3),
 ('XC', 2),
 ('XS', 2),
 ('XV', 2),
 ('XAB', 2),
 ('XQ', 2),
 ('XB', 1),
 ('XAF', 1),
 ('XM', 1),
 ('XAH', 1),
 ('XJ', 1),
 ('XAJ', 1)]

In [15]:
# Filter out the singleton nodes: keep only recombination nodes whose
# descendant-sample count is greater than 1.
recombination_nodes = recombination_nodes[ti.nodes_max_descendant_samples[recombination_nodes] > 1]
recombination_nodes

array([172374, 206466, 285181, 251176, 292030, 573905, 638775, 582054,
       588841, 636566, 555388, 628656, 636903, 641595, 676531, 693927,
       609881, 677584, 668317, 674332, 703379, 731665, 621567, 629086,
       687046, 699692, 625355, 640831, 642277, 733196, 630214, 751632,
       635896, 638141, 663092, 679355, 639472, 699376, 738307, 637876,
       696838, 649001, 729963, 663539, 682411, 701594, 701181, 722544,
       745522, 749460, 731513])

# Artefactual recombinant

In [16]:

# Recombination nodes with more than 1000 descendant samples; far larger than
# any X lineage's sample count, so such a node is treated below as artefactual.
recombination_nodes[ti.nodes_max_descendant_samples[recombination_nodes] > 1000]

array([555388])

In [17]:
# Node ID of the single recombination node found above to have more than
# 1000 descendant samples.
u = 555388

In [18]:
# Rows of the sample report that trace back to the artefactual node `u`.
df_dodgy = df.loc[df["recombinant"] == u]
df_dodgy

Unnamed: 0,recombinant,direct,path_length,node,strain,pango,parents,children,descendants,date,delay,qc,mutations,reversions,immediate_reversions,child_mutations,child_reversions
91,555388,False,8,609189,France/OCC-2201311067/2022,XZ,1,0,1,2022-01-31,11 days,1000,0,0,0,0,0
96,555388,False,12,642924,Germany/NW-RKI-I-625140/2022,XZ,1,0,1,2022-03-02,26 days,0000,4,0,0,0,0
100,555388,False,13,652360,Turkey/HSGM-GS728/2022,XZ,1,3,4,2022-03-10,29 days,0000,1,1,0,5,1
106,555388,False,13,672817,Slovakia/BA_22_00022659/2022,XZ,1,0,1,2022-03-28,25 days,0000,9,4,0,0,0
107,555388,False,14,673306,Switzerland/FR-ETHZ-36981952/2022,XZ,1,4,6,2022-03-28,22 days,0000,4,4,0,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,555388,False,19,758379,England/LSPA-3E52D89/2022,XAJ,1,1,2,2022-06-09,8 days,0000,1,0,0,0,0
743,555388,False,20,770189,England/PHEC-YYFS1MG/2022,XAJ,1,0,1,2022-06-19,19 days,0000,0,0,0,0,0
746,555388,False,20,782398,England/LSPA-3EAC5B5/2022,XAJ,1,0,1,2022-06-30,7 days,0000,0,0,0,0,0
747,555388,False,20,782399,England/LSPA-3EAC56A/2022,XAJ,1,0,1,2022-06-30,7 days,1000,0,0,0,0,0


In [19]:
# Number of samples per Pango lineage among those tracing back to `u`.
collections.Counter(df_dodgy["pango"]).most_common(100)

[('XN', 53),
 ('XZ', 16),
 ('XAJ', 6),
 ('XE', 1),
 ('XAD', 1),
 ('XAH', 1),
 ('XAK', 1)]

In [20]:
# Drop the artefactual recombinant. The bound is inclusive so that this filter
# is the exact complement of the `> 1000` test used to flag it: a node with
# exactly 1000 descendant samples was never flagged, so it must be kept.
recombination_nodes = recombination_nodes[ti.nodes_max_descendant_samples[recombination_nodes] <= 1000]
recombination_nodes

array([172374, 206466, 285181, 251176, 292030, 573905, 638775, 582054,
       588841, 636566, 628656, 636903, 641595, 676531, 693927, 609881,
       677584, 668317, 674332, 703379, 731665, 621567, 629086, 687046,
       699692, 625355, 640831, 642277, 733196, 630214, 751632, 635896,
       638141, 663092, 679355, 639472, 699376, 738307, 637876, 696838,
       649001, 729963, 663539, 682411, 701594, 701181, 722544, 745522,
       749460, 731513])

In [21]:
# Sample report restricted to the recombination nodes that survived filtering.
dfx = df.loc[df["recombinant"].isin(recombination_nodes)]
dfx

Unnamed: 0,recombinant,direct,path_length,node,strain,pango,parents,children,descendants,date,delay,qc,mutations,reversions,immediate_reversions,child_mutations,child_reversions
0,172374,True,1,172373,Wales/ALDP-125C4D7/2021,XA,1,0,1,2021-02-06,12 days,0000,0,0,0,0,0
1,172374,False,1,182624,Wales/LIVE-DFCFFE/2021,XA,1,0,1,2021-02-14,15 days,1000,0,0,0,0,0
2,172374,False,1,221655,England/ALDP-142CC21/2021,XA,1,2,3,2021-03-17,14 days,0000,1,0,0,3,0
3,172374,False,2,227648,England/ALDP-1458CD1/2021,XA,1,0,1,2021-03-22,11 days,0000,2,0,0,0,0
4,172374,False,2,228656,Wales/PHWC-PYBIFF/2021,XA,1,0,1,2021-03-23,13 days,0100,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,722544,False,1,726588,USA/NC-CDC-LC0635753/2022,XAJ,1,3,6,2022-05-13,13 days,0000,2,0,0,5,2
739,745522,True,1,745521,USA/UT-UPHL-220611523412/2022,XAJ,1,0,1,2022-05-29,16 days,0001,4,1,0,0,0
740,745522,False,1,747677,USA/UT-UPHL-220624962006/2022,XAJ,1,0,1,2022-05-31,27 days,0000,5,4,0,0,0
741,749460,True,1,749459,USA/UT-UPHL-220619166831/2022,XAJ,1,1,2,2022-06-01,20 days,0000,6,5,0,0,0


In [22]:
# Number of X-lineage samples remaining after filtering.
len(dfx)

630

In [23]:
# Per-lineage sample counts within the filtered report `dfx`.
filtered_lineage_sample_count = collections.Counter(dfx["pango"])


# Single origin pango x lineages

These are the X lineages in which all of the (filtered) samples trace back to the same recombination node.



In [24]:
# For each Pango lineage in the filtered report, collect the set of
# recombination nodes that its samples trace back to.
lineage_re_nodes = collections.defaultdict(set)
for record in dfx[["pango", "recombinant"]].itertuples(index=False):
    lineage_re_nodes[record.pango].add(record.recombinant)
lineage_re_nodes

defaultdict(set,
            {'XA': {172374},
             'XB': {206466, 285181},
             'XC': {251176, 292030},
             'XD': {573905, 638775},
             'XH': {582054},
             'XF': {588841},
             'XS': {636566},
             'XZ': {628656, 636903, 641595, 676531, 693927},
             'XG': {609881},
             'XAF': {582054, 677584},
             'XN': {668317, 674332, 703379, 731665},
             'XE': {582054, 621567, 629086, 687046, 699692},
             'XM': {625355, 640831, 642277, 733196},
             'XV': {630214},
             'XK': {751632},
             'XAB': {609881,
              635896,
              638141,
              639472,
              663092,
              679355,
              699376,
              738307},
             'XT': {637876},
             'XL': {638141},
             'XR': {639472},
             'XQ': {635896, 639472, 696838},
             'XAD': {641595, 649001},
             'XAH': {641595, 729963},
           

In [25]:
# Union of the recombination nodes over all lineages.
all_re_nodes = set().union(*lineage_re_nodes.values())
len(all_re_nodes)

50

In [26]:
# Lineages with exactly one recombination node, mapped to that node.
single_origin_lineages = {
    lineage: next(iter(re_nodes))
    for lineage, re_nodes in lineage_re_nodes.items()
    if len(re_nodes) == 1
}
single_origin_lineages

{'XA': 172374,
 'XH': 582054,
 'XF': 588841,
 'XS': 636566,
 'XG': 609881,
 'XV': 630214,
 'XK': 751632,
 'XT': 637876,
 'XL': 638141,
 'XR': 639472,
 'XW': 663539,
 'XAA': 635896,
 'XY': 677584,
 'XAC': 682411,
 'XAE': 701181,
 'XAG': 635896}

In [27]:
# Parent lineages of each Pango recombinant ("X") lineage, keyed by lineage name.
# Used below as the `pango_parents` column of the summary table.
# Taken from https://github.com/cov-lineages/pango-designation/blob/master/pango_designation/alias_key.json
# NOTE(review): a trailing "*" presumably denotes the lineage together with its
# sublineages, and entries with more than two parents presumably list the
# source of successive genome segments — confirm against the Pango documentation.
pango_x_aliases = {
    "XA": ["B.1.1.7","B.1.177"],
    "XB": ["B.1.634","B.1.631"],
    "XC": ["AY.29","B.1.1.7"],
    "XD": ["B.1.617.2*","BA.1*"],
    "XE": ["BA.1*","BA.2*"],
    "XF": ["B.1.617.2*","BA.1*"],
    "XG": ["BA.1*","BA.2*"],
    "XH": ["BA.1*","BA.2*"],
    "XJ": ["BA.1*","BA.2*"],
    "XK": ["BA.1*","BA.2*"],
    "XL": ["BA.1*","BA.2*"],
    "XM": ["BA.1.1*","BA.2*"],
    "XN": ["BA.1*","BA.2*"],
    "XP": ["BA.1.1*","BA.2*"],
    "XQ": ["BA.1.1*","BA.2*"],
    "XR": ["BA.1.1*","BA.2*"],
    "XS": ["B.1.617.2*","BA.1.1*"],
    "XT": ["BA.2*","BA.1*"],
    "XU": ["BA.1*","BA.2*"],
    "XV": ["BA.1*","BA.2*"],
    "XW": ["BA.1*","BA.2*"],
    "XY": ["BA.1*","BA.2*"],
    "XZ": ["BA.2*","BA.1*"],
    "XAA": ["BA.1*","BA.2*"],
    "XAB": ["BA.1*","BA.2*"],
    "XAC": ["BA.2*","BA.1*","BA.2*"],
    "XAD": ["BA.2*","BA.1*"],
    "XAE": ["BA.2*","BA.1*"],
    "XAF": ["BA.1*","BA.2*"],
    "XAG": ["BA.1*","BA.2*"],
    "XAH": ["BA.2*","BA.1*"],
    "XAJ": ["BA.2.12.1*","BA.4*"],
    "XAK": ["BA.2*","BA.1*","BA.2*"],
    "XAL": ["BA.1*","BA.2*"],
    "XAM": ["BA.1.1","BA.2.9"],
    "XAN": ["BA.2*","BA.5.1"],
    "XAP": ["BA.2*","BA.1*"],
    "XAQ": ["BA.1*","BA.2*"],
    "XAR": ["BA.1*","BA.2*"],
    "XAS": ["BA.5*","BA.2*"],
    "XAT": ["BA.2.3.13","BA.1*"],
    "XAU": ["BA.1.1*","BA.2.9*"],
    "XAV": ["BA.2*","BA.5*"],
    "XAW": ["BA.2*","AY.122"],
    "XAY": ["BA.2*","AY.45","BA.2*","AY.45","BA.2*"],
    "XAZ": ["BA.2.5","BA.5","BA.2.5"],
    "XBA": ["BA.2*","AY.45","BA.2*","AY.45","BA.2*"],
    "XBB": ["BJ.1","BM.1.1.1"],
    "XBC": ["BA.2*","B.1.617.2*","BA.2*","B.1.617.2*"],
    "XBD": ["BA.2.75.2","BF.5"],
    "XBE": ["BA.5.2","BE.4.1"],
    "XBF": ["BA.5.2.3","CJ.1"],
    "XBG": ["BA.2.76","BA.5.2"],
    "XBH": ["BA.2.3.17","BA.2.75.2"],
    "XBJ": ["BA.2.3.20","BA.5.2"],
    "XBK": ["BA.5.2","CJ.1"],
    "XBL": ["XBB.1","BA.2.75","XBB.1"],
    "XBM": ["BA.2.76","BF.3"],
    "XBN": ["BA.2.75","XBB.3"],
    "XBP": ["BA.2.75*","BQ.1*"],
    "XBQ": ["BA.5.2","CJ.1"],
    "XBR": ["BA.2.75","BQ.1"],
    "XBS": ["BA.2.75","BQ.1"],
    "XBT": ["BA.5.2.34","BA.2.75","BA.5.2.34"],
    "XBU": ["BA.2.75.3","BQ.1","BA.2.75.3"],
    "XBV": ["CR.1","XBB.1"],
    "XBW": ["XBB.1.5","BQ.1.14"],
    "XBY": ["BR.2.1","XBF"],
    "XBZ": ["BA.5.2*","EF.1.3"],
    "XCA": ["BA.2.75*","BQ.1*"]
}

In [28]:
def get_sample_composition(u):
    """
    Count the Pango lineages of the samples descending from node ``u``.

    The descendant samples are pooled over every tree in the notebook-level
    tree sequence ``ts``, so each sample is counted once however many trees it
    appears under ``u`` in. Returns a Counter keyed by the samples'
    ``Nextclade_pango`` metadata value.
    """
    descendant_samples = set()
    for tree in ts.trees():
        descendant_samples.update(set(tree.samples(u)))

    composition = collections.Counter()
    for sample in descendant_samples:
        composition[ts.node(sample).metadata["Nextclade_pango"]] += 1
    return composition

# Example: 172374 is the recombination node found above for lineage XA.
get_sample_composition(172374)

Counter({'XA': 5})

In [29]:
# One summary row per single-origin lineage: the parent lineages recorded for
# its recombination node, the parents listed in the Pango alias table, the
# filtered sample count, and the five most common lineages below the node.
data = []
for lin, node in single_origin_lineages.items():
    rec = recombinants[node]
    composition = get_sample_composition(node)
    data.append(
        {
            "lineage": lin,
            "parent_lineages": rec.arg_info.parent_imputed_lineages,
            "pango_parents": pango_x_aliases[lin],
            "n": filtered_lineage_sample_count[lin],
            "descendant_lineages": dict(composition.most_common(5)),
        }
    )
df_summary = pd.DataFrame(data).sort_values("lineage").set_index("lineage")
df_summary

Unnamed: 0_level_0,parent_lineages,pango_parents,n,descendant_lineages
lineage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XA,"[B.1.177.18, B.1.1.7]","[B.1.1.7, B.1.177]",5,{'XA': 5}
XAA,"[BA.1, BA.2.9]","[BA.1*, BA.2*]",8,"{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'..."
XAC,"[BA.2.3, BA.1.17.2]","[BA.2*, BA.1*, BA.2*]",27,{'XAC': 27}
XAE,"[BA.2, BA.1]","[BA.2*, BA.1*]",9,{'XAE': 9}
XAG,"[BA.1, BA.2.9]","[BA.1*, BA.2*]",17,"{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'..."
XF,"[AY.4, BA.1]","[B.1.617.2*, BA.1*]",2,{'XF': 2}
XG,"[BA.1.17, BA.2]","[BA.1*, BA.2*]",32,"{'XG': 32, 'XAB': 1}"
XH,"[BA.1.20, BA.2.9]","[BA.1*, BA.2*]",11,"{'XAF': 34, 'XH': 11, 'B.1.1.529': 3, 'XE': 3,..."
XK,"[BA.1.1.1, BA.2]","[BA.1*, BA.2*]",3,{'XK': 3}
XL,"[BA.1.17.2, BA.2]","[BA.1*, BA.2*]",10,"{'XL': 10, 'XAB': 1, 'XU': 1, 'XQ': 1}"


In [30]:
# Number of single-origin lineages in the summary table.
len(df_summary)

16

In [31]:
# Print the summary table as LaTeX source so that it can be copied out.
print(df_summary.to_latex())

\begin{tabular}{lllrl}
\toprule
{} &              parent\_lineages &          pango\_parents &   n &                                descendant\_lineages \\
lineage &                              &                        &     &                                                    \\
\midrule
XA      &        [B.1.177.18, B.1.1.7] &     [B.1.1.7, B.1.177] &   5 &                                          \{'XA': 5\} \\
XAA     &               [BA.1, BA.2.9] &         [BA.1*, BA.2*] &   8 &  \{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'... \\
XAC     &          [BA.2.3, BA.1.17.2] &  [BA.2*, BA.1*, BA.2*] &  27 &                                        \{'XAC': 27\} \\
XAE     &                 [BA.2, BA.1] &         [BA.2*, BA.1*] &   9 &                                         \{'XAE': 9\} \\
XAG     &               [BA.1, BA.2.9] &         [BA.1*, BA.2*] &  17 &  \{'XAB': 39, 'XAG': 17, 'XAA': 8, 'XQ': 2, 'XU'... \\
XF      &                 [AY.4, BA.1] &    [B.1.617.2*, BA.1*] &   2 &

# Multi origin lineages

There are no multi-origin lineages that are fully monophyletic.

In [32]:
# For every lineage with more than one recombination node, compare the summed
# descendant-sample counts of those nodes with the lineage's filtered sample
# count. A lineage is recorded as monophyletic only when the two agree.
multi_monophyletic = []
for lin, nodes in lineage_re_nodes.items():
    if len(nodes) <= 1:
        continue
    total_descendants = sum(ti.nodes_max_descendant_samples[u] for u in nodes)
    lineage_samples = filtered_lineage_sample_count[lin]
    print(lin, total_descendants, lineage_samples)
    if total_descendants == lineage_samples:
        multi_monophyletic.append(lin)
multi_monophyletic

XB 76 57
XC 10 3
XD 6 4
XZ 93 66
XAF 58 35
XN 14 13
XE 213 163
XM 49 47
XAB 158 48
XQ 88 12
XAD 77 5
XAH 75 12
XU 82 3
XJ 5 2
XAJ 22 5


[]