In [1]:
from collections import Counter
import pandas as pd

In [2]:
overview_file = "../data/pango_x_overview.csv"
df_overview = pd.read_csv(overview_file)
pangos = df_overview["pango"].to_list()
len(pangos)

69

#### Independent recombination events from Table S4

In [3]:
# Get the 1451 independent recombination events from the CovRecomb paper, and
# subset to the ones corresponding to the Pango X labels ("X" series).
tab_s4_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/supplementary%20tables/TableS4.csv"
tab_s4_df = pd.read_csv(tab_s4_url)
# TODO: Deal with XBB sublineages.
tab_s4_df = tab_s4_df[tab_s4_df["X_series"].isin(pangos)].reset_index()
tab_s4_df.columns

Index(['index', 'sample_id', 'collect_date', 'pango_lineage', 'lineage_X',
       'lineage_Y', 'mutation_pattern', 'raw_p_value', 'adjusted_p_value',
       'X_mutations', 'Y_mutations', 'shared_mutations', 'denovo_mutations',
       'situation', 'region', 'country', 'Number_of_epidemic_recombinant',
       'Geographical_distribution', 'X_series', 'Confidence_mean',
       'breakpoints'],
      dtype='object')

In [4]:
# Some Pango Xs have multiple events associated with them.
# There is typically one event that has the highest number of epidemic recombinants.
tab_s4_df = tab_s4_df[
    [
        "X_series",
        "lineage_X",
        "lineage_Y",
        #"situation",
        "breakpoints",
        "Number_of_epidemic_recombinant",
    ]
].sort_values(
    by=["X_series", "Number_of_epidemic_recombinant"],
    ascending=[True, False],
).reset_index()
tab_s4_df

Unnamed: 0,index,X_series,lineage_X,lineage_Y,breakpoints,Number_of_epidemic_recombinant
0,12,XA,B.1.1.7*,B.1.177.18,21255-21765,43
1,29,XAB,BA.1.1*,BA.2.31,8393-11288,5
2,2,XAF,BA.1.1.7,BA.2.9*,8393-11288,427
3,40,XAF,BA.1.1.7,BA.2.9*,8393-11288,2
4,19,XAG,BA.1.1*,BA.2.9*,6513-9534,17
5,13,XAJ,BA.5,BG.3,"23018-23673,21721-21765",43
6,20,XAJ,BA.5,BG.3,"23018-23673,15009-21765",15
7,27,XAJ,BA.5,BG.3,"23018-23673,15009-21765",6
8,41,XAL,BA.1.1*,BA.2.23,15240-19955,2
9,23,XAM,BA.1.1*,BA.2.9*,6513-9344,12


#### Other putative recombination events

In [5]:
putative_recomb_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/putative_recombinants/putative%20recombinants.csv"
putative_recomb_df = pd.read_csv(putative_recomb_url)
putative_recomb_df.columns

Index(['sample_id', 'collect_date', 'pango_lineage', 'lineage_X', 'lineage_Y',
       'mutation_pattern', 'raw_p_value', 'adjusted_p_value', 'X_mutations',
       'Y_mutations', 'shared_mutations', 'denovo_mutations', 'region',
       'country'],
      dtype='object')

In [6]:
for p in pangos:
    num_pango_lineage = len(putative_recomb_df[putative_recomb_df["pango_lineage"] == p])
    print(p, num_pango_lineage)

XA 43
XAA 0
XAD 0
XAE 3
XAF 292
XAG 38
XAL 123
XAM 12
XAP 0
XBB 2388
XBD 273
XBF 8
XBG 0
XBH 0
XBM 0
XBR 0
XC 24
XE 2425
XF 0
XG 0
XH 159
XJ 250
XL 0
XM 470
XQ 0
XR 0
XS 13
XU 0
XW 0
XY 118
XZ 0
XAJ 77
XAN 19
XAS 6
XAU 0
XAV 0
XB 3377
XBE 156
XBK 0
XBQ 0
XN 11
XP 0
XAC 0
XAZ 0
XBJ 134
XBP 0
XBS 0
XBW 0
XCA 0
XAB 3
XAH 0
XBN 0
XBZ 0
XCG 0
XAK 28
XBL 0
XBC 7
XBU 0
XAY 0
XBT 0
XAQ 81
XAR 0
XAT 0
XK 18
XT 0
XV 42
XAW 44
XBA 0
XD 30


In [7]:
def get_breakpoints(mut_patt, left):
    pos = [x.split("_")[0] for x in mut_patt.split("/")]
    if left:
        return pos[0]
    else:
        return pos[-1]


results_df = pd.DataFrame(
    columns=[
        "pango",
        "left_parent",
        "right_parent",
        "left_breakpoint",
        "right_breakpoint",
        "max_count",    # Most common
        "total_count",
        "num_simple_cases",
        "num_complex_cases",
    ]
)

i = 0
for p in pangos:
    num_simple_cases = 0
    num_complex_cases = 0

    # Simple cases
    parent_counts = Counter()
    breakpoint_counts = Counter()
    recomb_counts = Counter()

    for i, row in putative_recomb_df[putative_recomb_df["pango_lineage"] == p].iterrows():
        num_XY = row["mutation_pattern"].count("XY")
        num_YX = row["mutation_pattern"].count("YX")

        if num_XY + num_YX == 1:
            num_simple_cases += 1
            if num_XY == 1:
                left_parent = row["lineage_X"]
                right_parent = row["lineage_Y"]
                left_bkpt = get_breakpoints(row["X_mutations"], left=False)
                right_bkpt = get_breakpoints(row["Y_mutations"], left=True)
            else:
                left_parent = row["lineage_Y"]
                right_parent = row["lineage_X"]
                left_bkpt = get_breakpoints(row["Y_mutations"], left=False)
                right_bkpt = get_breakpoints(row["X_mutations"], left=True)
            parent_counts[(left_parent, right_parent)] += 1
            breakpoint_counts[(left_bkpt, right_bkpt)] += 1
            recomb_counts[(left_parent, right_parent, left_bkpt, right_bkpt)] += 1
        else:
            num_complex_cases += 1

    if num_simple_cases > 1:
        (
            most_common_left_parent,
            most_common_right_parent,
            most_common_left_bkpt,
            most_common_right_bkpt,
        ) = recomb_counts.most_common()[0][0]
        results = [
            p,
            most_common_left_parent,
            most_common_right_parent,
            most_common_left_bkpt,
            most_common_right_bkpt,
            max(recomb_counts.values()),
            sum(recomb_counts.values()),
            num_simple_cases,
            num_complex_cases,
        ]
    else:
        results = [
            p,
            "na",
            "na",
            "na",
            "na",
            "na",
            "na",
            num_simple_cases,
            num_complex_cases,
        ]

    results_df.loc[i] = results
    i += 1

In [8]:
pd.set_option('display.max_rows', 500)
results_df

Unnamed: 0,pango,left_parent,right_parent,left_breakpoint,right_breakpoint,max_count,total_count,num_simple_cases,num_complex_cases
105258,XA,B.1.177.18,B.1.1.7*,21255,21765,43,43,43,0
105259,XAA,na,na,na,na,na,na,0,0
105260,XAD,na,na,na,na,na,na,0,0
105274,XAE,BA.2.3.16,B.1.1.322,27807,28877,3,3,3,0
134478,XAF,BA.1.1.7,BA.2.9*,8393,11288,277,292,292,0
135120,XAG,BA.1.1*,BA.2.9*,6513,9344,24,38,38,0
135281,XAL,BA.1.1*,BA.2.23,15240,19955,94,118,118,5
131073,XAM,BA.1.1*,BA.2.9*,6513,9344,11,12,12,0
131074,XAP,na,na,na,na,na,na,0,0
135479,XBB,BJ.1,BM.1.1.1,22200,22674,1966,2321,2321,67


#### Merging the results above

In [9]:
merged_df = pd.DataFrame(
    columns=[
        "pango",
        "left_pango_parent",
        "right_pango_parent",
        "left_breakpoint",
        "right_breakpoint",
    ]
)

# For each Pango X in Table S4, 
# pick the entry with the highest number of epidemic recombinants.
i = 0
for p in set(tab_s4_df["X_series"].to_list()):
    tmp_df = tab_s4_df[tab_s4_df["X_series"] == p].reset_index()
    if len(tmp_df) > 1:
        # Assume it's sorted by the number of epidemic recombinants.
        tmp_df = tmp_df[0:1]
    if "," in tmp_df["breakpoints"][0]:
        # Complex recombinant
        left_pango_parent = "-"
        right_pango_parent = "-"
        left_breakpoint = "-"
        right_breakpoint = "-"
    else:
        left_pango_parent = tmp_df["lineage_X"][0]
        right_pango_parent = tmp_df["lineage_Y"][0]
        (
            left_breakpoint,
            right_breakpoint,
        ) = tmp_df["breakpoints"][0].split("-")
    merged_df.loc[i] = [
        p,
        left_pango_parent,
        right_pango_parent,
        left_breakpoint,
        right_breakpoint,
    ]
    i += 1

merged_df

Unnamed: 0,pango,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint
0,XBB,BJ.1,BM.1.1.1,22109,22674
1,XB,-,-,-,-
2,XBE,BA.5.2.2,BE.4.1.1,21765,22599
3,XAJ,-,-,-,-
4,XC,AY.29.1,B.1.1.7*,26767,27972
5,XJ,BA.1.9,BA.2.9*,13195,15714
6,XE,BA.1.9,BA.2.23,8393,9866
7,XAM,BA.1.1*,BA.2.9*,6513,9344
8,XS,AY.112*,BA.1.1*,9053,10449
9,XAW,-,-,-,-


In [10]:
i = len(merged_df)
for p in set(results_df["pango"].to_list()):
    if p in tab_s4_df["X_series"].to_list():
        continue
    tmp_df = results_df[results_df["pango"] == p].reset_index()
    if tmp_df["num_complex_cases"][0] > tmp_df["num_simple_cases"][0]:
        merged_df.loc[i] = [p, "-", "-", "-", "-"]
    else:
        merged_df.loc[i] = tmp_df[
            [
                "pango",
                "left_parent",
                "right_parent",
                "left_breakpoint",
                "right_breakpoint",
            ]
        ].iloc[0].to_list()
    i += 1
merged_df = merged_df.sort_values(by="pango").reset_index(drop=True)
merged_df

Unnamed: 0,pango,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint
0,XA,B.1.1.7*,B.1.177.18,21255,21765
1,XAA,na,na,na,na
2,XAB,BA.1.1*,BA.2.31,8393,11288
3,XAC,na,na,na,na
4,XAD,na,na,na,na
5,XAE,BA.2.3.16,B.1.1.322,27807,28877
6,XAF,BA.1.1.7,BA.2.9*,8393,11288
7,XAG,BA.1.1*,BA.2.9*,6513,9534
8,XAH,na,na,na,na
9,XAJ,-,-,-,-


In [11]:
merged_df.to_csv("covrecomb.csv", sep=",", index=False)