In [1]:
from collections import Counter
from pathlib import Path
import pandas as pd

In [2]:
# Load the overview table and pull out its "pango" column; later cells treat
# these values as the Pango X labels to look up.
data_dir = Path("../data")
overview_file = data_dir.joinpath("bigtable_pangodesig.csv")
overview_df = pd.read_csv(overview_file)
pangos = overview_df["pango"]
# Display how many labels were loaded.
len(pangos)

60

#### Independent recombination events

In [3]:
# Get the 1451 independent recombination events from the CovRecomb paper, and
# subset to the ones corresponding to the Pango X labels ("X" series).
# NOTE: No XBB.* or other X?.*
table_s4_url = (
    "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/supplementary%20tables/TableS4.csv"
)
table_s4_df = pd.read_csv(table_s4_url)
# Keep only the events whose X_series label is one of the Pango X labels of
# interest, and renumber the surviving rows from zero.
is_pango_x = table_s4_df["X_series"].isin(pangos)
table_s4_df = table_s4_df[is_pango_x].reset_index(drop=True)
table_s4_df.head(5)

Unnamed: 0,sample_id,collect_date,pango_lineage,lineage_X,lineage_Y,mutation_pattern,raw_p_value,adjusted_p_value,X_mutations,Y_mutations,shared_mutations,denovo_mutations,situation,region,country,Number_of_epidemic_recombinant,Geographical_distribution,X_series,Confidence_mean,breakpoints
0,hCoV-19/USA/CA-CDC-FG-008007/2021,2021/3/10,XB,B.1.631,B.1.634,YYYYYYYXXXXXXXXXY,1.7800000000000002e-39,1.4399999999999999e-36,22882_A/23604_A/23765_G/24442_T/24642_T/26158_...,7142_G/9614_G/9693_T/9754_C/11288_---------/15...,3037_T/14408_T/21615_G/21846_T/22036_C/23403_G...,1255_G/3688_T/3884_T/6633_T/15026_T/21057_T/23...,B.1.631_B.1.634,North America,USA,3332,USA_2611/Mexico_509/Spain_23/Honduras_55/Guate...,XB,0.9659,"28330-28910,16466-22882"
1,hCoV-19/Denmark/DCGC-322560/2022,2022/1/7,BA.2,BA.1.9,BA.2.23,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY,5.41e-112,9.84e-109,2832_G/5386_G/6513_---/8393_A,9866_T/10198_T/10447_A/11288_---------/12880_T...,3037_T/10029_T/10449_A/14408_T/18163_G/22578_A...,686_---------/3241_T/5924_A/27571_------------...,BA.1.9_BA.2.23,Europe,Denmark,2522,Denmark_22/United Kingdom_1767/Hong Kong_6/Ind...,XE,0.8517,8393-9866
2,hCoV-19/Venezuela/Dtt3630/2022,2022/1/22,BA.2,BA.1.1.7,BA.2.9*,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY,3.02e-109,8.360000000000001e-106,2470_T/2832_G/5386_G/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,3037_T/10029_T/10449_A/14408_T/18163_G/22578_A...,2441_A/6513_---/7081_T/13124_A/14838_T/15451_A...,BA.1.1.7_BA.2.9*,South America,Venezuela,427,Venezuela_4/Norway_11/USA_61/Costa Rica_144/De...,XAF,0.8135,8393-11288
3,hCoV-19/Denmark/DCGC-301091/2021,2021/12/30,XH,BA.1.9,BA.2.9*,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY,7.3399999999999995e-109,1.62e-105,2832_G/5386_G/6513_---/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,3037_T/10029_T/10449_A/14408_T/18163_G/22578_A...,902_C/904_A/1244_A/28435_T,BA.1.9_BA.2.9*,Europe,Denmark,422,Denmark_187/Finland_151/Sweden_2/Belgium_6/Cze...,XH,0.8335,8393-11288
4,hCoV-19/USA/NJ-CDC-ASC210687738/2022,2022/1/29,XM,BA.1.1*,BA.2.23,XXXXXXXXXYYYYYYYYYYYYYYY,4.890000000000001e-106,1.0999999999999999e-102,2470_T/2832_G/5386_G/6513_---/8393_A/11285_---...,21618_T/21633_---------/21987_A/22200_G/22688_...,3037_T/10029_T/10449_A/14408_T/18163_G/22578_A...,20343_C,BA.1.1*_BA.2.23,North America,USA,390,USA_5/Germany_220/Bahrain_1/Netherlands_17/Bel...,XM,0.9238,15240-21618


In [4]:
# Some Pango Xs have multiple events associated with them.
# There is typically one event that has the highest number of epidemic recombinants.
# Keep only the columns needed downstream ("situation" is deliberately left out)
# and order the events by Pango X label, largest event first within each label.
event_columns = [
    "X_series",
    "lineage_X",
    "lineage_Y",
    "X_mutations",
    "Y_mutations",
    "mutation_pattern",
    "breakpoints",
    "Number_of_epidemic_recombinant",
]
table_s4_df = table_s4_df[event_columns]
table_s4_df = table_s4_df.sort_values(
    by=["X_series", "Number_of_epidemic_recombinant"],
    ascending=[True, False],
)
table_s4_df = table_s4_df.reset_index(drop=True)
table_s4_df.head(5)

Unnamed: 0,X_series,lineage_X,lineage_Y,X_mutations,Y_mutations,mutation_pattern,breakpoints,Number_of_epidemic_recombinant
0,XA,B.1.1.7*,B.1.177.18,21765_------/21992_---/23063_T/23271_A/23604_A...,445_C/6286_T/10323_G/21255_C,YYYYXXXXXXXXXXXXXXXXXX,21255-21765,43
1,XAB,BA.1.1*,BA.2.31,2470_T/2832_G/5386_G/6513_---/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,XXXXXYYYYYYYYYYYYYYY,8393-11288,5
2,XAF,BA.1.1.7,BA.2.9*,2470_T/2832_G/5386_G/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY,8393-11288,427
3,XAF,BA.1.1.7,BA.2.9*,2470_T/2832_G/5386_G/8393_A,11288_---------/12880_T/15714_T/17410_T/21618_...,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYY,8393-11288,2
4,XAG,BA.1.1*,BA.2.9*,2470_T/2832_G/5386_G/6513_---,9534_T/9866_T/10198_T/10447_A/11288_---------/...,XXXXYYYYYYYYYYYYYYYYYYYYYY,6513-9534,17


In [5]:
def _get_breakpoints(mut_patt, left):
    pos = [x.split("_")[0] for x in mut_patt.split("/")]
    if left:
        return int(pos[0])
    return int(pos[-1])


In [6]:
# Turn every Table S4 event into a uniform record.
# An event is "simple" when its mutation pattern switches between X and Y
# exactly once (a single "XY" or "YX"); anything else is recorded as "complex"
# with no parents or breakpoint interval.
records = []
for _, event in table_s4_df.iterrows():
    pattern = event["mutation_pattern"]
    num_XY = pattern.count("XY")
    num_YX = pattern.count("YX")
    if num_XY + num_YX != 1:
        event_type = "complex"
        parent_left_pango = "n/a"
        parent_right_pango = "n/a"
        interval_left_str = "n/a"
        interval_right_str = "n/a"
    else:
        # NOTE: X and Y do not mean left parent and right parent, respectively.
        # The order of the letters in the pattern decides which lineage is on
        # the left-hand side of the switch.
        if num_XY == 1:
            left_cols = ("lineage_X", "X_mutations")
            right_cols = ("lineage_Y", "Y_mutations")
        else:
            left_cols = ("lineage_Y", "Y_mutations")
            right_cols = ("lineage_X", "X_mutations")
        event_type = "simple"
        parent_left_pango = event[left_cols[0]]
        parent_right_pango = event[right_cols[0]]
        # The interval runs from the last mutation of the left parent to the
        # first mutation of the right parent.
        interval_left = _get_breakpoints(event[left_cols[1]], left=False)
        interval_right = _get_breakpoints(event[right_cols[1]], left=True)
        interval_left_str = str(interval_left)
        interval_right_str = str(interval_right + 1)   # NOTE: Make right-exclusive.
    records.append(
        {
            "pango": event["X_series"],
            "is_in_results": "TRUE",
            "type": event_type,
            "parent_left_pango": parent_left_pango,
            "parent_right_pango": parent_right_pango,
            "interval_left": interval_left_str,
            "interval_right": interval_right_str,
            "num_epidemic_recombs": event["Number_of_epidemic_recombinant"],
        }
    )
table_s4_df = pd.DataFrame(records)
table_s4_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,num_epidemic_recombs
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21766.0,43
1,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11289.0,5
2,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11289.0,427
3,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11289.0,2
4,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9535.0,17
5,XAJ,True,complex,,,,,43
6,XAJ,True,complex,,,,,15
7,XAJ,True,complex,,,,,6
8,XAL,True,simple,BA.1.1*,BA.2.23,15240.0,19956.0,2
9,XAM,True,simple,BA.1.1*,BA.2.9*,6513.0,9345.0,12


In [7]:
# When there are multiple events associated with the same Pango X,
# get the event with the highest number of epidemic recombinants.
# idxmax yields, for each Pango X, the row label of its largest event.
idx = (
    table_s4_df
    .groupby("pango")["num_epidemic_recombs"]
    .idxmax()
)
table_s4_df = table_s4_df.loc[idx]
table_s4_df = table_s4_df.reset_index(drop=True)
table_s4_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,num_epidemic_recombs
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21766.0,43
1,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11289.0,5
2,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11289.0,427
3,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9535.0,17
4,XAJ,True,complex,,,,,43
5,XAL,True,simple,BA.1.1*,BA.2.23,15240.0,19956.0,2
6,XAM,True,simple,BA.1.1*,BA.2.9*,6513.0,9345.0,12
7,XB,True,complex,,,,,3332
8,XBB,True,simple,BJ.1,BM.1.1.1,22109.0,22675.0,97
9,XBD,True,simple,BA.2.75.2*,BF.3.1*,23019.0,24621.0,268


#### Other putative recombination events

In [8]:
# Fetch the per-sample table of putative recombinants from the CovRecomb
# repository and list its columns.
recomb_url = (
    "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/putative_recombinants/putative%20recombinants.csv"
)
recomb_df = pd.read_csv(recomb_url)
recomb_df.columns

Index(['sample_id', 'collect_date', 'pango_lineage', 'lineage_X', 'lineage_Y',
       'mutation_pattern', 'raw_p_value', 'adjusted_p_value', 'X_mutations',
       'Y_mutations', 'shared_mutations', 'denovo_mutations', 'region',
       'country'],
      dtype='object')

In [9]:
# NOTE: pango_lineage is the Pango assigned to the sample (from GISAID).
# Count the samples assigned exactly to each Pango X label; the count is only
# reported when the print below is uncommented.
for p in pangos:
    is_assigned_p = recomb_df.pango_lineage == p
    num_pango_lineage = sum(is_assigned_p)
    #print(p, num_pango_lineage)

In [10]:
# Summarise the putative recombinants per Pango X label.
# For each label, every sample assigned to it (or to one of its sublineages)
# is classified as "simple" (exactly one X/Y switch in its mutation pattern)
# or "complex" (anything else). The label is then reported as:
#   - not in the results, when it has no samples at all;
#   - "simple", when simple samples are at least as numerous as complex ones,
#     using the most frequent (left parent, right parent, interval) combination;
#   - "complex" otherwise.
data = []
sample_pangos = recomb_df.pango_lineage
for p in pangos:
    num_simple = 0
    num_complex = 0
    recomb_counts = Counter()
    # NOTE: Lumps together XBB.* and other X?.*
    # na=False makes the prefix test strictly boolean even for samples that
    # have no Pango label, so the combined mask never carries missing values.
    is_p_or_sublineage = (sample_pangos == p) | sample_pangos.str.startswith(
        p + ".", na=False
    )
    for _, row in recomb_df[is_p_or_sublineage].iterrows():
        num_XY = row["mutation_pattern"].count("XY")
        num_YX = row["mutation_pattern"].count("YX")
        if num_XY + num_YX == 1:
            num_simple += 1
            # NOTE: X and Y do not mean left parent and right parent, respectively.
            if num_XY == 1:
                parent_left_pango = row["lineage_X"]
                parent_right_pango = row["lineage_Y"]
                interval_left = _get_breakpoints(row["X_mutations"], left=False)
                interval_right = _get_breakpoints(row["Y_mutations"], left=True)
            else:
                parent_left_pango = row["lineage_Y"]
                parent_right_pango = row["lineage_X"]
                interval_left = _get_breakpoints(row["Y_mutations"], left=False)
                interval_right = _get_breakpoints(row["X_mutations"], left=True)
            recomb_counts[(parent_left_pango, parent_right_pango, interval_left, interval_right)] += 1
        else:
            num_complex += 1
    # Summarise results per Pango X.
    if num_simple + num_complex == 0:
        data.append(
            {
                "pango": p,
                "is_in_results": "FALSE",
                "type": "n/a",
                "parent_left_pango": "n/a",
                "parent_right_pango": "n/a",
                "interval_left": "n/a",
                "interval_right": "n/a",
                "max_count": "n/a",
                "num_simple": num_simple,
                "num_complex": num_complex,
            }
        )
    elif num_simple >= num_complex:
        # At least one simple sample exists here, so recomb_counts is not
        # empty. A single most_common(1) lookup gives both the winning
        # combination and its count; ties go to the combination seen first.
        (
            (
                most_common_parent_left,
                most_common_parent_right,
                most_common_interval_left,
                most_common_interval_right,
            ),
            max_count,
        ) = recomb_counts.most_common(1)[0]
        data.append(
            {
                "pango": p,
                "is_in_results": "TRUE",
                "type": "simple",
                "parent_left_pango": most_common_parent_left,
                "parent_right_pango": most_common_parent_right,
                "interval_left": str(most_common_interval_left),
                "interval_right": str(most_common_interval_right + 1),   # NOTE: Make right-exclusive.
                "max_count": max_count,    # Most common
                "num_simple": num_simple,
                "num_complex": num_complex,
            }
        )
    else:
        data.append(
            {
                "pango": p,
                "is_in_results": "TRUE",
                "type": "complex",
                "parent_left_pango": "n/a",
                "parent_right_pango": "n/a",
                "interval_left": "n/a",
                "interval_right": "n/a",
                "max_count": "n/a",
                "num_simple": num_simple,
                "num_complex": num_complex,
            }
        )
# NOTE: recomb_df is rebound from the per-sample table to the per-Pango
# summary, so this cell cannot be re-run without re-loading the samples.
recomb_df = pd.DataFrame(data)
recomb_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,max_count,num_simple,num_complex
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21766.0,43.0,43,0
1,XAA,False,,,,,,,0,0
2,XAB,True,simple,BA.1.1*,BA.2.31,6513.0,9345.0,2.0,3,0
3,XAC,False,,,,,,,0,0
4,XAD,False,,,,,,,0,0
5,XAE,True,simple,BA.2.3.16,B.1.1.322,27807.0,28878.0,3.0,3,0
6,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11289.0,277.0,292,0
7,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9345.0,24.0,38,0
8,XAH,False,,,,,,,0,0
9,XAJ,True,complex,,,,,,1,76


#### Merging the results above

In [11]:
# NOTE: The breakpoint intervals have been adjusted to be right-exclusive.
# Build one record per Pango X label. The independent-events table takes
# precedence; the putative-recombinants summary is used only for labels that
# are absent from it.
merged_columns = [
    "pango",
    "is_in_results",
    "type",
    "parent_left_pango",
    "parent_right_pango",
    "interval_left",
    "interval_right",
]
merged_records = []
for p in pangos:
    in_table_s4 = table_s4_df.pango == p
    if in_table_s4.any():
        row = table_s4_df[in_table_s4]
    else:
        row = recomb_df[recomb_df.pango == p]
    # Take the first matching row's value for every output column.
    merged_records.append(
        {column: row[column].to_list()[0] for column in merged_columns}
    )
merged_df = pd.DataFrame(merged_records)
merged_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21766.0
1,XAA,False,,,,,
2,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11289.0
3,XAC,False,,,,,
4,XAD,False,,,,,
5,XAE,True,simple,BA.2.3.16,B.1.1.322,27807.0,28878.0
6,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11289.0
7,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9535.0
8,XAH,False,,,,,
9,XAJ,True,complex,,,,


In [12]:
# Write the merged table next to the input data, without the row index.
# The comma separator is pandas' default, so it is not spelled out.
csv_file = data_dir.joinpath("bigtable_covrecomb.csv")
merged_df.to_csv(csv_file, index=False)