# Pango events within the ARG

In this notebook we examine how well the ARG reflects evolutionary events implicit in the pango naming system.


## Summary

Of the 2058 distinct pango lineages in the ARG, 1474 of these (comprising 693278 samples) match perfectly, with unique origination events in the ARG where all samples assigned a given lineage descend from the first node assigned that lineage. A further 253 lineages (600565 samples) match perfectly when we count the descendants of the  parent of the first node (accounting for polytomies in which multiple originating nodes for a given lineage are siblings). We then have 332 lineages (1189039 samples) where the difference in the number descendants of the first node's parent is < 100. The remaining 33 lineages (391795) are dominated by a few large lineages such as BA.1.1 (155595 samples) and AY.4.2 (54607 samples) which have multiple non-sibling origins within the ARG.


In [1]:
import sc2ts
import tszip
import pathlib
import numpy as np
import pandas as pd
import concurrent.futures as cf
from tqdm.notebook import tqdm

datadir = pathlib.Path("../data")

## Code

In [2]:
ts = tszip.load(datadir / "sc2ts_viridian_v1.1.trees.tsz")


In [3]:
df_node = sc2ts.node_data(ts).set_index("node_id")
df_node["node_time"] = ts.nodes_time
df_node

Unnamed: 0_level_0,pango,sample_id,scorpio,is_sample,is_recombinant,num_mutations,max_descendant_samples,date,node_time
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,B,Vestigial_ignore,.,False,False,0,0,2019-11-26,1183.997288
1,B,Wuhan/Hu-1/2019,.,False,False,0,2482157,2019-12-26,1153.000000
2,A,SRR11772659,.,True,False,1,255,2020-01-19,1129.000000
3,B,SRR11397727,.,True,False,0,1,2020-01-24,1124.000000
4,B,SRR11397730,.,True,False,0,1,2020-01-24,1124.000000
...,...,...,...,...,...,...,...,...,...
2744708,AY.3,,Delta (B.1.617.2-like),False,False,5,11,2021-03-20,703.504247
2744709,BA.1.1,,Omicron (BA.1-like),False,False,1,7,2022-01-03,414.000000
2744710,BA.2.1,,Omicron (BA.2-like),False,False,1,128,2021-12-30,418.409007
2744711,BA.5.1,,Omicron (BA.5-like),False,False,2,3,2022-06-01,265.165622


We sort the nodes by the descending samples first, and then by node time. This should guarantee that the first node in the dataframe for each pango lineage is the "majority" node for that pango.

In [4]:
dfn_sorted = df_node.sort_values(["max_descendant_samples", "node_time"], ascending=False)
dfn_sorted

Unnamed: 0_level_0,pango,sample_id,scorpio,is_sample,is_recombinant,num_mutations,max_descendant_samples,date,node_time
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B,Wuhan/Hu-1/2019,.,False,False,0,2482157,2019-12-26,1153.000000
27,B,,.,False,False,1,2477500,2019-12-26,1153.000000
12,B.1,SRR11597205,.,True,False,2,2477495,2020-01-28,1120.016020
59,B.1,,.,False,False,1,2477489,2020-01-28,1120.016020
98,B.1.1,,.,False,False,3,1218787,2020-01-28,1120.000082
...,...,...,...,...,...,...,...,...,...
2689016,XBB.1.5.62,ERR10937891,Omicron (XBB.1.5-like),True,False,0,1,2023-02-20,1.000000
2689017,FD.1,ERR10937893,Omicron (XBB.1.5-like),True,False,0,1,2023-02-20,1.000000
2689018,CH.1.1.3,ERR10937945,Omicron (BA.2-like),True,False,0,1,2023-02-20,1.000000
2689019,CH.1.1.3,ERR10937969,Omicron (BA.2-like),True,False,0,1,2023-02-20,1.000000


In [5]:
dfn_pango = dfn_sorted.reset_index().groupby(["pango"]).first()
dfn_pango

Unnamed: 0_level_0,node_id,sample_id,scorpio,is_sample,is_recombinant,num_mutations,max_descendant_samples,date,node_time
pango,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,9,,.,False,False,2,1085,2019-12-26,1153.000000
A.1,227,,.,False,False,2,245,2020-01-22,1126.832540
A.2,530,,.,False,False,4,234,2020-02-01,1116.000000
A.2.2,1190,,.,False,False,1,65,2020-02-28,1089.781309
A.2.3,1186,,.,False,False,2,79,2020-02-23,1094.280592
...,...,...,...,...,...,...,...,...,...
XW,1159411,,Omicron (BA.2-like),False,True,1,32,2022-03-10,348.296203
XY,1187989,,Omicron (Unassigned),False,True,2,23,2022-03-16,342.196661
XZ,1163537,,Omicron (BA.2-like),False,False,1,48,2022-03-06,352.818252
Y.1,55861,,.,False,False,1,36,2020-08-18,917.000000


In [6]:

def worker(work):
    pango, row = work
    df_pango = df_node[df_node.pango == pango]
    samples = df_pango[df_pango.is_sample].index
    root = row["node_id"]
    tracked_samples = []
    parent_tracked_samples = []
    for tree in ts.trees(tracked_samples=samples):
        tracked_samples.append(tree.num_tracked_samples(root))
        parent = tree.parent(root)
        if parent != -1:
            parent_tracked_samples.append(tree.num_tracked_samples(parent))
        else:
            parent_tracked_samples.append(0)
            
    return {
        "pango": pango,
        "root": root,
        "total_samples": len(samples),
        "max_descendants": np.max(tracked_samples),
        "min_descendants": np.min(tracked_samples),
        "parent_max_descendants": np.max(parent_tracked_samples),
        "parent_min_descendants": np.min(parent_tracked_samples),
    }
    
# Note: set things up this way with an eye to using concurrent.futures,
# but it was totally GIL-blocked, seemingly. Not worth setting up
# process level parallelism.
data = []
for work in tqdm(dfn_pango.iterrows(), total=dfn_pango.shape[0]):
    result = worker(work)
    data.append(result)
        
df_pango_events = pd.DataFrame(data)
df_pango_events

  0%|          | 0/2058 [00:00<?, ?it/s]

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants
0,A,9,225,225,225,225,225
1,A.1,227,245,245,245,245,245
2,A.2,530,47,47,47,47,47
3,A.2.2,1190,65,65,65,65,65
4,A.2.3,1186,79,79,79,79,79
...,...,...,...,...,...,...,...
2053,XW,1159411,32,32,32,32,32
2054,XY,1187989,23,23,23,23,23
2055,XZ,1163537,48,48,48,48,48
2056,Y.1,55861,36,36,36,36,36


In [7]:
total = df_pango_events["total_samples"]
diff = (total - df_pango_events["max_descendants"]).abs() 
diff_parent = (total - df_pango_events["parent_max_descendants"]).abs() 
df_pango_events["diff"] = diff
df_pango_events["diff_parent"] = diff_parent
df_pango_events["relative_diff"] = diff / total
df_pango_events["relative_diff_parent"] = diff_parent / total

In [8]:
df_pango_events

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
0,A,9,225,225,225,225,225,0,0,0.0,0.0
1,A.1,227,245,245,245,245,245,0,0,0.0,0.0
2,A.2,530,47,47,47,47,47,0,0,0.0,0.0
3,A.2.2,1190,65,65,65,65,65,0,0,0.0,0.0
4,A.2.3,1186,79,79,79,79,79,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2053,XW,1159411,32,32,32,32,32,0,0,0.0,0.0
2054,XY,1187989,23,23,23,23,23,0,0,0.0,0.0
2055,XZ,1163537,48,48,48,48,48,0,0,0.0,0.0
2056,Y.1,55861,36,36,36,36,36,0,0,0.0,0.0


In [9]:
perfect = df_pango_events[df_pango_events["diff"] == 0]
perfect

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
0,A,9,225,225,225,225,225,0,0,0.0,0.0
1,A.1,227,245,245,245,245,245,0,0,0.0,0.0
2,A.2,530,47,47,47,47,47,0,0,0.0,0.0
3,A.2.2,1190,65,65,65,65,65,0,0,0.0,0.0
4,A.2.3,1186,79,79,79,79,79,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2053,XW,1159411,32,32,32,32,32,0,0,0.0,0.0
2054,XY,1187989,23,23,23,23,23,0,0,0.0,0.0
2055,XZ,1163537,48,48,48,48,48,0,0,0.0,0.0
2056,Y.1,55861,36,36,36,36,36,0,0,0.0,0.0


In [10]:
perfect.shape

(1474, 11)

In [11]:
perfect.total_samples.sum()

np.int64(693278)

# Consider the effects of polytomies 

In [12]:
perfect_for_parent = df_pango_events[(df_pango_events["diff"] > 0) & (df_pango_events["diff_parent"] == 0)]
perfect_for_parent

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
15,A.5,818,47,46,46,47,47,1,0,0.021277,0.0
19,AA.3,50820,28,27,27,28,28,1,0,0.035714,0.0
33,AM.1,97590,2,1,1,2,2,1,0,0.500000,0.0
43,AY.103,266229,67055,67021,67015,67055,67050,34,0,0.000507,0.0
46,AY.105,264390,222,220,219,222,222,2,0,0.009009,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1989,XBB.1.5.9,2715635,14,12,12,14,14,2,0,0.142857,0.0
1991,XBB.1.5.91,1430871,4,3,3,4,4,1,0,0.250000,0.0
2001,XBB.2,1396939,96,90,90,96,96,6,0,0.062500,0.0
2025,XBF.2,1429629,4,2,2,4,4,2,0,0.500000,0.0


In [13]:
perfect_for_parent.shape

(253, 11)

In [14]:
perfect_for_parent.total_samples.sum()

np.int64(600565)

# The rest

In [15]:
not_perfect = df_pango_events[df_pango_events["diff_parent"] != 0]
not_perfect

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
6,A.2.5.1,255097,3,2,2,2,2,1,1,0.333333,0.333333
7,A.2.5.2,325703,4,3,3,3,3,1,1,0.250000,0.250000
40,AY.100,280335,18621,17693,17426,17693,17426,928,928,0.049836,0.049836
50,AY.109,380418,316,305,304,305,304,11,11,0.034810,0.034810
54,AY.112,407132,137,45,45,45,45,92,92,0.671533,0.671533
...,...,...,...,...,...,...,...,...,...,...,...
1984,XBB.1.5.79,1430422,3,2,2,2,2,1,1,0.333333,0.333333
1994,XBB.1.5.96,1429403,16,8,8,8,8,8,8,0.500000,0.500000
2006,XBB.2.4,1434117,4,3,3,3,3,1,1,0.250000,0.250000
2007,XBB.2.5,1420329,33,30,30,30,30,3,3,0.090909,0.090909


In [16]:
not_perfect.total_samples.sum()

np.int64(1189039)

## Lineages that are pretty close 

And have a reasonable number of samples

In [17]:
close_to_right = not_perfect[(not_perfect["diff_parent"] < 100)]
close_to_right

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
6,A.2.5.1,255097,3,2,2,2,2,1,1,0.333333,0.333333
7,A.2.5.2,325703,4,3,3,3,3,1,1,0.250000,0.250000
50,AY.109,380418,316,305,304,305,304,11,11,0.034810,0.034810
54,AY.112,407132,137,45,45,45,45,92,92,0.671533,0.671533
59,AY.116,291940,97,92,91,95,95,5,2,0.051546,0.020619
...,...,...,...,...,...,...,...,...,...,...,...
1984,XBB.1.5.79,1430422,3,2,2,2,2,1,1,0.333333,0.333333
1994,XBB.1.5.96,1429403,16,8,8,8,8,8,8,0.500000,0.500000
2006,XBB.2.4,1434117,4,3,3,3,3,1,1,0.250000,0.250000
2007,XBB.2.5,1420329,33,30,30,30,30,3,3,0.090909,0.090909


In [18]:
close_to_right.shape

(299, 11)

In [19]:
close_to_right.total_samples.sum()

np.int64(797244)

## Important stuff that's a long way off

In [20]:
important = not_perfect[not_perfect["diff_parent"] >= 100]
important.sort_values("total_samples", ascending=False)

Unnamed: 0,pango,root,total_samples,max_descendants,min_descendants,parent_max_descendants,parent_min_descendants,diff,diff_parent,relative_diff,relative_diff_parent
977,BA.1.1,2728818,155595,13829,13811,13829,13811,141766,141766,0.911122,0.911122
142,AY.4.2,280324,54607,54499,54344,54500,54345,108,107,0.001978,0.001959
1010,BA.1.17.2,827900,38302,27513,27507,27527,27521,10789,10775,0.281682,0.281317
1003,BA.1.15,820352,21492,19362,19360,19364,19362,2130,2128,0.099107,0.099014
1144,BA.2.9,2727955,19401,13040,13040,13836,13836,6361,5565,0.32787,0.286841
40,AY.100,280335,18621,17693,17426,17693,17426,928,928,0.049836,0.049836
987,BA.1.1.18,818916,14887,5951,5945,5951,5945,8936,8936,0.600255,0.600255
1056,BA.2.3,934105,14704,14582,14581,14596,14595,122,108,0.008297,0.007345
1027,BA.2.10,902972,10067,9128,9126,9171,9169,939,896,0.093275,0.089004
1013,BA.1.20,852614,4420,4030,4030,4038,4038,390,382,0.088235,0.086425


In [21]:
important.shape

(33, 11)

In [22]:
important.total_samples.sum()

np.int64(391795)

In [23]:
df_pango_events.to_csv(datadir / "pango_events_in_arg.csv", index=False)