# Generate the table for displaying Pango X events

In [190]:

import numpy as np
import pandas as pd
import tszip
import sc2ts

In [191]:
# Load the Pango X event table (one row per event root) and display it.
dfe = pd.read_csv(filepath_or_buffer="../data/pango_x_events.csv")
dfe

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango
0,1083412,XG,3,R,3,{},1083412,0.0,0.0,XG
1,1118099,XAE,7,S,9,{},964555,2.0,47.0,XAE
2,1159411,XW,1,R,32,{},1159411,0.0,0.0,XW
3,1422955,XBQ,1,I,14,{},-1,inf,inf,XBQ
4,1183815,XAA,3,I,17,{},1058654,6.0,33.0,XAA
5,965352,XE,2,S,1116,{},965353,1.0,19.0,XE
6,1425824,XBK,1,I,7,{},-1,inf,inf,XBK
7,946761,XF,1,R,16,{},946761,0.0,0.0,XF
8,1092789,XP,1,S,45,{},-1,inf,inf,XP
9,1265115,XAS,1,S,77,{},-1,inf,inf,XAS


In [192]:
# Load the recombinant table and key it by the "recombinant" column so that
# it can be looked up by recombinant id.
dfr = (
    pd.read_csv(filepath_or_buffer="../data/recombinants.csv")
    .set_index(keys="recombinant")
)
dfr

Unnamed: 0_level_0,sample_id,num_descendant_samples,num_samples,distinct_sample_pango,interval_left,interval_right,num_mutations,Viridian_amplicon_scheme,Artic_primer_version,date_added,...,parent_mrca_pango,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts
recombinant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1280342,ERR9939974,1,1,1,695,958,1,COVID-ARTIC-V4.1,.,2022-06-27,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,5,2,16,False,8
663484,SRR20259474,1,1,1,510,1222,1,COVID-AMPLISEQ-V1,.,2021-10-21,...,B.1.617.2,Delta (B.1.617.2-like),838.212323,2020-11-05,False,2,2,16,False,5
1356368,ERR10219711,2,1,1,695,1453,1,COVID-ARTIC-V4.1,.,2022-08-30,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,4,1,16,False,5
1253364,ERR9848224,855,1,1,695,1627,1,COVID-ARTIC-V4.1,.,2022-05-30,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,4,1,54,False,7
1279026,ERR9940192,662,1,1,695,1627,0,COVID-ARTIC-V4.1,.,2022-06-26,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,3,2,8,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375056,SRR21797337,1,1,1,26276,29729,1,COVID-ARTIC-V4.1,.,2022-09-19,...,BA.2,Omicron (BA.2-like),447.000000,2021-12-01,False,2,22,2,False,5
1373412,SRR21794908,2,1,1,28331,29729,2,COVID-ARTIC-V4.1,.,2022-09-17,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,5,25,2,False,7
1418709,ERR10708209,37,12,1,28682,29729,8,COVID-ARTIC-V4.1,4.1alt,2022-12-17,...,B.1.1.529,Probable Omicron (Unassigned),957.981880,2020-07-09,False,8,23,2,False,10
1436032,ERR10933490,2,1,1,27916,29729,3,COVID-ARTIC-V4.1,4.1alt,2023-02-13,...,BA.2,Omicron (BA.2-like),447.000000,2021-12-01,True,6,27,2,False,7


In [193]:
# Per-recombinant difference between the k1000_muts and num_mutations columns.
dfr["mutations_averted"] = dfr["k1000_muts"].sub(dfr["num_mutations"])

In [194]:
# Attach the recombinant columns to each event by looking up the event's
# closest_recombinant value in dfr's index (left join: events whose
# closest_recombinant has no match in dfr get NaN in the joined columns).
#
# Any dfr columns already present on dfe are dropped first. On a first run
# this is a no-op; on a re-run of this cell it prevents the "columns overlap"
# error that joining the same frame twice would otherwise raise.
dfe = dfe.drop(columns=dfr.columns, errors="ignore").join(
    dfr, on="closest_recombinant"
)

# List the events

List the events that are monophyletic for the Pango lineage in question. This holds for almost all of them: XBB.1 and XM are the exceptions.

We also exclude XAC and XAD, which are each split across several events and are subsets of XZ+.

In [195]:
# Number of events per root Pango lineage; keep only the lineages that
# appear in more than one event.
counts = dfe.loc[:, "root_pango"].value_counts()
multiple = counts.loc[counts.gt(1)]
multiple

root_pango
XM     4
XAC    4
XAD    2
Name: count, dtype: int64

In [196]:
# Show every event whose root lineage occurs more than once.
dfe.loc[dfe["root_pango"].isin(multiple.index)]

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango,...,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts,mutations_averted
12,938900,XM,2,R,1,{},938900,0.0,0.0,XM,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,26.0,14.0,True,27.0,27.0
13,1003220,XM,0,R,26,"{'BA.2': 16, 'XAL': 3}",1003220,0.0,0.0,XM,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,22.0,22.0,True,7.0,6.0
14,1158127,XM,1,R,1,{},1158127,0.0,0.0,XM,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,27.0,17.0,True,4.0,4.0
15,1158182,XM,6,R,1,{},1158182,0.0,0.0,XM,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,33.0,23.0,True,10.0,7.0
16,1223586,XAC,1,I,7,{},964555,5.0,66.343249,XAC,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
17,2740307,XAC,2,I,9,{},964555,5.0,84.170422,XAC,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
18,1214068,XAC,2,S,1,{},964555,5.0,94.0,XAC,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
19,1235679,XAC,1,S,1,{},964555,5.0,114.0,XAC,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
45,1108280,XAD,5,S,1,{},964555,4.0,44.0,XAD,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
46,1237984,XAD,7,S,1,{},964555,5.0,117.0,XAD,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0


These are messy - let's leave everything except the dominant XM out and see where we go. XAC and XAD are all under the same recombinant, which we'll probably analyse separately.

In [197]:
# Drop every XM, XAC and XAD event, then append back the single XM event
# rooted at node 1003220 (the dominant XM event).
pango_events_tbl = pd.concat(
    [
        dfe.loc[~dfe["root_pango"].isin(["XM", "XAC", "XAD"])],
        dfe.loc[dfe["root"].eq(1003220)],
    ]
)
pango_events_tbl

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango,...,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts,mutations_averted
0,1083412,XG,3,R,3,{},1083412,0.0,0.0,XG,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,9.0,38.0,True,7.0,4.0
1,1118099,XAE,7,S,9,{},964555,2.0,47.0,XAE,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
2,1159411,XW,1,R,32,{},1159411,0.0,0.0,XW,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,5.0,48.0,True,5.0,4.0
3,1422955,XBQ,1,I,14,{},-1,inf,inf,XBQ,...,,,,,,,,,,
4,1183815,XAA,3,I,17,{},1058654,6.0,33.0,XAA,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,5.0,8.0,43.0,True,7.0,7.0
5,965352,XE,2,S,1116,{},965353,1.0,19.0,XE,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,15.0,33.0,True,7.0,6.0
6,1425824,XBK,1,I,7,{},-1,inf,inf,XBK,...,,,,,,,,,,
7,946761,XF,1,R,16,{},946761,0.0,0.0,XF,...,.,1120.01602,2020-01-28,True,6.0,6.0,75.0,True,6.0,5.0
8,1092789,XP,1,S,45,{},-1,inf,inf,XP,...,,,,,,,,,,
9,1265115,XAS,1,S,77,{},-1,inf,inf,XAS,...,,,,,,,,,,


In [198]:
# Events whose root_type is "R".
is_r_root = pango_events_tbl["root_type"].eq("R")
recombs_tbl = pango_events_tbl.loc[is_r_root]
recombs_tbl

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango,...,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts,mutations_averted
0,1083412,XG,3,R,3,{},1083412,0.0,0.0,XG,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,9.0,38.0,True,7.0,4.0
2,1159411,XW,1,R,32,{},1159411,0.0,0.0,XW,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,5.0,48.0,True,5.0,4.0
7,946761,XF,1,R,16,{},946761,0.0,0.0,XF,...,.,1120.01602,2020-01-28,True,6.0,6.0,75.0,True,6.0,5.0
23,1396207,XBB,14,R,6452,{'BA.2': 1},1396207,0.0,0.0,XBB,...,Omicron (BA.2-like),536.774383,2021-09-03,True,6.0,14.0,10.0,True,19.0,6.0
24,1429711,XBB.1,1,R,2,{},1429711,0.0,0.0,XBB,...,Omicron (BA.2-like),536.774383,2021-09-03,True,15.0,21.0,27.0,True,20.0,19.0
25,1000242,XS,1,R,17,{},1000242,0.0,0.0,XS,...,.,1120.01602,2020-01-28,True,7.0,16.0,76.0,True,13.0,11.0
26,1420166,XBR,2,R,1,{},1420166,0.0,0.0,XBR,...,Omicron (BA.2-like),474.57444,2021-11-04,True,13.0,25.0,26.0,True,24.0,22.0
27,1187989,XY,2,R,23,{},1187989,0.0,0.0,XY,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,22.0,31.0,True,8.0,5.0
29,1034619,XL,1,R,64,{},1034619,0.0,0.0,XL,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,10.0,40.0,True,9.0,8.0
30,1058654,XQ,3,R,55,"{'BA.2': 37, 'XR': 17, 'XAA': 17, 'XU': 1, 'XA...",1058654,0.0,0.0,XQ,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,5.0,8.0,43.0,True,7.0,7.0


In [199]:
# Events whose root_type is not "R" but which do have a closest recombinant
# (closest_recombinant != -1), grouped by that recombinant and ordered by
# closest_recombinant_time within each group.
close_recombs = (
    pango_events_tbl.loc[
        pango_events_tbl["root_type"].ne("R")
        & pango_events_tbl["closest_recombinant"].ne(-1)
    ]
    .sort_values(by=["closest_recombinant", "closest_recombinant_time"])
)
close_recombs

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango,...,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts,mutations_averted
1,1118099,XAE,7,S,9,{},964555,2.0,47.0,XAE,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
22,1163537,XZ,1,I,48,{},964555,4.0,48.181748,XZ,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
38,1216836,XAP,1,I,20,{},964555,4.0,67.371983,XAP,...,Probable Omicron (Unassigned),957.98188,2020-07-09,False,4.0,43.0,5.0,True,6.0,6.0
5,965352,XE,2,S,1116,{},965353,1.0,19.0,XE,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,15.0,33.0,True,7.0,6.0
42,1098084,XH,7,S,2,{},965353,1.0,55.0,XH,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,15.0,33.0,True,7.0,6.0
40,966904,XJ,2,S,68,{'BA.2': 3},966905,1.0,5.0,XJ,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,4.0,19.0,28.0,True,6.0,5.0
47,1264107,XAL,2,I,3,{},1003220,3.0,65.566058,XAL,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,3.0,22.0,22.0,True,7.0,6.0
33,1148222,XR,4,I,17,{'BA.2': 1},1058654,3.0,11.0,XR,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,5.0,8.0,43.0,True,7.0,7.0
4,1183815,XAA,3,I,17,{},1058654,6.0,33.0,XAA,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,5.0,8.0,43.0,True,7.0,7.0
50,1240312,XAM,3,I,21,{},1058654,6.0,42.885654,XAM,...,Probable Omicron (Unassigned),957.98188,2020-07-09,True,5.0,8.0,43.0,True,7.0,7.0


In [200]:
# Events with no closest recombinant (closest_recombinant == -1), ordered by
# root_mutations and then by pango_samples.
non_recombs = (
    pango_events_tbl.loc[pango_events_tbl["closest_recombinant"].eq(-1)]
    .sort_values(by=["root_mutations", "pango_samples"])
)
non_recombs

Unnamed: 0,root,root_pango,root_mutations,root_type,pango_samples,non_pango_samples,closest_recombinant,closest_recombinant_path_len,closest_recombinant_time,pango,...,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts,mutations_averted
6,1425824,XBK,1,I,7,{},-1,inf,inf,XBK,...,,,,,,,,,,
28,1231548,XAU,1,S,8,{},-1,inf,inf,XAU,...,,,,,,,,,,
3,1422955,XBQ,1,I,14,{},-1,inf,inf,XBQ,...,,,,,,,,,,
8,1092789,XP,1,S,45,{},-1,inf,inf,XP,...,,,,,,,,,,
9,1265115,XAS,1,S,77,{},-1,inf,inf,XAS,...,,,,,,,,,,
36,1061700,XN,1,S,120,{'BA.2': 1},-1,inf,inf,XN,...,,,,,,,,,,
44,223239,XB,2,I,192,{},-1,inf,inf,XB,...,,,,,,,,,,
37,1276376,XAJ,11,I,18,{},-1,inf,inf,XAJ,...,,,,,,,,,,


In [202]:
 # Build a single LaTeX tabular from the three event tables and print it.
#
# Each table is rendered separately with DataFrame.to_latex and the pieces are
# stitched together: the first table keeps its \begin{tabular}, \toprule and
# header row, later tables contribute only their \midrule and data rows, and
# the closing two lines of the last table are appended once at the end.

# DataFrame column -> header text in the LaTeX table. The dict order is the
# column order of the output.
col_name_map = {
    "root_pango": "pango",
    "pango_samples": "samples",
    "non_pango_samples": "extra",
    "root_mutations": "muts",
    #"root_type": "type",
    "root": "root",
    "closest_recombinant": "recomb",
    #"closest_recombinant_averted_mutations": "averted",
    "closest_recombinant_time": "t recomb",
    "closest_recombinant_path_len": "p recomb",
}

# Loop-invariant, so built once; to_latex is given real lists rather than
# dict views.
latex_columns = list(col_name_map.keys())
latex_headers = list(col_name_map.values())

out = []
for tbl in [recombs_tbl, close_recombs, non_recombs]:
    tbl = tbl[latex_columns].sort_values("root_pango")

    s = tbl.to_latex(
        escape=True,
        index=False,
        columns=latex_columns,
        header=latex_headers,
        float_format="%.0f",
    )
    # Compact the dict-style text of the "extra" column: drop quotes and the
    # space after each colon.
    s = s.replace("'", "").replace(": ", ":")
    # Abbreviate the one long "extra" entry. Raw strings keep the backslashes
    # literal without relying on invalid escape sequences (\} and \s), which
    # newer Python versions warn about.
    s = s.replace(
        r"{BA.2:37, XR:17, XAA:17, XU:1, XAM:21, XAG:6\}",
        r"{BA.2:37, $\star$\}",
    )
    splits = s.splitlines()
    # The last two lines close the tabular; hold them back so they are emitted
    # only once, after the final table.
    footer = splits[-2:]
    splits = splits[:-2]
    if len(out) > 0:
        # Not the first table: skip \begin{tabular}, \toprule and the header
        # row, keeping the \midrule that separates the sections.
        splits = splits[3:]
    out.extend(splits)

out.extend(footer)
print("\n".join(out))

\begin{tabular}{lrlrrrrr}
\toprule
pango & samples & extra & muts & root & recomb & t recomb & p recomb \\
\midrule
XA & 39 & \{\} & 1 & 122444 & 122444 & 0 & 0 \\
XBB & 6452 & \{BA.2:1\} & 14 & 1396207 & 1396207 & 0 & 0 \\
XBB.1 & 2 & \{\} & 1 & 1429711 & 1429711 & 0 & 0 \\
XBD & 30 & \{\} & 0 & 1378208 & 1378208 & 0 & 0 \\
XBF & 185 & \{\} & 3 & 1420385 & 1420385 & 0 & 0 \\
XBG & 25 & \{\} & 2 & 1291970 & 1291970 & 0 & 0 \\
XBR & 1 & \{\} & 2 & 1420166 & 1420166 & 0 & 0 \\
XC & 5 & \{\} & 1 & 414488 & 414488 & 0 & 0 \\
XF & 16 & \{\} & 1 & 946761 & 946761 & 0 & 0 \\
XG & 3 & \{\} & 3 & 1083412 & 1083412 & 0 & 0 \\
XL & 64 & \{\} & 1 & 1034619 & 1034619 & 0 & 0 \\
XM & 26 & \{BA.2:16, XAL:3\} & 0 & 1003220 & 1003220 & 0 & 0 \\
XQ & 55 & \{BA.2:37, $\star$\} & 3 & 1058654 & 1058654 & 0 & 0 \\
XS & 17 & \{\} & 1 & 1000242 & 1000242 & 0 & 0 \\
XW & 32 & \{\} & 1 & 1159411 & 1159411 & 0 & 0 \\
XY & 23 & \{\} & 2 & 1187989 & 1187989 & 0 & 0 \\
\midrule
XAA & 17 & \{\} & 3 & 1183815 & 10586