In [1]:
from collections import Counter
import json
import urllib.request
import pandas as pd

In [2]:
pangos = [
    "XE",
    "XB",
    "XBB",
    "XBF",
    "XN",
    "XAS",
    "XBE",
    "XL",
    "XJ",
    "XQ",
    "XP",
    "XZ",
    "XBG",
    "XBK",
    "XBD",
    "XW",
    "XA",
    "XM",
    "XAJ",
    "XY",
    "XAM",
    "XS",
    "XF",
    "XBQ",
    "XR",
    "XAP",
    "XAA",
    "XAF",
    "XBM",
    "XAV",
    "XAU",
    "XAE",
    "XAN",
    "XAG",
    "XBH",
    "XC",
    "XBR",
    "XH",
    "XAD",
    "XG",
    "XAL",
    "XU",
    "XAB",
    "XAH",
    "XAQ",
    "XAR",
    "XAT",
    "XBJ",
    "XK",
    "XT",
    "XV",
]
len(pangos)

51

In [3]:
alias_key_url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master/pango_designation/alias_key.json"
with urllib.request.urlopen(alias_key_url) as url:
    data = json.load(url)

In [4]:
num_total_x = 0
num_one_bkpt = 0
num_multiple_bkpts = 0
for x in data:
    if x.startswith("X"):
        num_total_x += 1
        if len(data[x]) == 2:
            num_one_bkpt +=1
        else:
            num_multiple_bkpts += 1
print(f"Pango X: {num_total_x}")
print(f"One breakpoint: {num_one_bkpt}")
print(f"More breakpoints: {num_multiple_bkpts}")

Pango X: 144
One breakpoint: 123
More breakpoints: 21


In [5]:
#covrecomb_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/supplementary%20tables/TableS4.csv"
covrecomb_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/putative_recombinants/putative%20recombinants.csv"
cr_df = pd.read_csv(covrecomb_url)
cr_df.columns

Index(['sample_id', 'collect_date', 'pango_lineage', 'lineage_X', 'lineage_Y',
       'mutation_pattern', 'raw_p_value', 'adjusted_p_value', 'X_mutations',
       'Y_mutations', 'shared_mutations', 'denovo_mutations', 'region',
       'country'],
      dtype='object')

In [6]:
for p in pangos:
    num_pango_lineage = len(cr_df[cr_df["pango_lineage"] == p])
    print(p, num_pango_lineage)

XE 2425
XB 3377
XBB 2388
XBF 8
XN 11
XAS 6
XBE 156
XL 0
XJ 250
XQ 0
XP 0
XZ 0
XBG 0
XBK 0
XBD 273
XW 0
XA 43
XM 470
XAJ 77
XY 118
XAM 12
XS 13
XF 0
XBQ 0
XR 0
XAP 0
XAA 0
XAF 292
XBM 0
XAV 0
XAU 0
XAE 3
XAN 19
XAG 38
XBH 0
XC 24
XBR 0
XH 159
XAD 0
XG 0
XAL 123
XU 0
XAB 3
XAH 0
XAQ 81
XAR 0
XAT 0
XBJ 134
XK 18
XT 0
XV 42


In [7]:
def _get_breakpoint(mut_patt, left):
    pos = [x.split("_")[0] for x in mut_patt.split("/")]
    if left:
        return pos[0]
    else:
        return pos[-1]

In [8]:
results_df = pd.DataFrame(
    columns=[
        "pango",
        "left_pango_parent",
        "right_pango_parent",
        "breakpoint_interval",
    ]
)

i = 0
for p in pangos:
    parent_counts = Counter()
    breakpoint_counts = Counter()
    for i, row in cr_df[cr_df["pango_lineage"] == p].iterrows():
        num_XY = row["mutation_pattern"].count("XY")
        num_YX = row["mutation_pattern"].count("YX")
        if num_XY + num_YX == 1:
            if num_XY == 1:
                pango_parents = (row["lineage_X"], row["lineage_Y"])
                left_breakpoint = _get_breakpoint(row["X_mutations"], left=False)
                right_breakpoint = _get_breakpoint(row["Y_mutations"], left=True)
            else:
                pango_parents = (row["lineage_Y"], row["lineage_X"])
                left_breakpoint = _get_breakpoint(row["Y_mutations"], left=False)
                right_breakpoint = _get_breakpoint(row["X_mutations"], left=True)
            parent_counts[pango_parents] += 1
            breakpoint_interval = left_breakpoint + "-" + right_breakpoint
            breakpoint_counts[breakpoint_interval] += 1
        else:
            # TODO: Do something with multiple-breakpoint recombinants.
            #print(f"Multiple breakpoints: {p}")
            pass
    if len(parent_counts) > 0:
        most_common_left_pango_parent, most_common_right_pango_parent = parent_counts.most_common()[0][0]
        most_common_breakpoint_interval = breakpoint_counts.most_common()[0][0]
        results = [
            p,
            most_common_left_pango_parent,
            most_common_right_pango_parent,
            most_common_breakpoint_interval,
        ]
    else:
        results = [p, "na", "na", "na"]
    results_df.loc[i] = results
    i += 1

In [9]:
# Note that this does not account for entries with multiple breakpoints.
results_df

Unnamed: 0,pango,left_pango_parent,right_pango_parent,breakpoint_interval
135382,XE,BA.1.9,BA.2.23,8393-11288
133442,XB,B.1.634,B.1.631,16466-22882
135479,XBB,BJ.1,BM.1.1.1,22200-22674
134829,XBF,BM.1.1*,BQ.2,23075-26529
106114,XN,BA.2.3.16,BA.2.69,22688-22792
85488,XAS,BA.5,BA.2.17,23075-25416
133593,XBE,BA.5.2.2,BE.4.1.1,21765-22599
133594,XL,na,na,na
134629,XJ,BA.1.9,BA.2.9*,13195-15714
134630,XQ,na,na,na
