In [1]:
from collections import Counter
from pathlib import Path
import pandas as pd

In [2]:
# Read the overview table and keep its `pango` column; the later cells use
# these labels as the set of Pango names to look up.
data_dir = Path("../data")
overview_file = data_dir.joinpath("bigtable_pangodesig.csv")
overview_df = pd.read_csv(overview_file)
pangos = overview_df["pango"]
# Number of labels read from the overview table.
len(pangos)

60

#### Independent recombination events

In [3]:
# Table S4 of the CovRecomb paper lists the 1451 independent recombination
# events. Keep only the events whose "X_series" label is one of the Pango X
# labels in `pangos`.
# NOTE: No XBB.* or other X?.*
table_s4_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/supplementary%20tables/TableS4.csv"
table_s4_df = (
    pd.read_csv(table_s4_url)
    .loc[lambda events: events["X_series"].isin(pangos)]
    # reset_index() without drop=True keeps the old row labels as an "index" column.
    .reset_index()
)
table_s4_df.columns

Index(['index', 'sample_id', 'collect_date', 'pango_lineage', 'lineage_X',
       'lineage_Y', 'mutation_pattern', 'raw_p_value', 'adjusted_p_value',
       'X_mutations', 'Y_mutations', 'shared_mutations', 'denovo_mutations',
       'situation', 'region', 'country', 'Number_of_epidemic_recombinant',
       'Geographical_distribution', 'X_series', 'Confidence_mean',
       'breakpoints'],
      dtype='object')

In [4]:
# A single Pango X can have several events associated with it; typically one
# of them has the highest number of epidemic recombinants. Keep the columns
# used by the following cells and order the events so that, within each
# X_series, the event with the most epidemic recombinants comes first.
columns_to_keep = [
    "X_series",
    "lineage_X",
    "lineage_Y",
    "X_mutations",
    "Y_mutations",
    "mutation_pattern",
    # "situation" is left out.
    "breakpoints",
    "Number_of_epidemic_recombinant",
]
table_s4_df = table_s4_df.loc[:, columns_to_keep].sort_values(
    by=["X_series", "Number_of_epidemic_recombinant"],
    ascending=[True, False],
    ignore_index=True,  # renumber rows 0..n-1 after sorting
)
table_s4_df.head()

Unnamed: 0,X_series,lineage_X,lineage_Y,X_mutations,Y_mutations,mutation_pattern,breakpoints,Number_of_epidemic_recombinant
0,XA,B.1.1.7*,B.1.177.18,21765_------/21992_---/23063_T/23271_A/23604_A...,445_C/6286_T/10323_G/21255_C,YYYYXXXXXXXXXXXXXXXXXX,21255-21765,43
1,XAB,BA.1.1*,BA.2.31,2470_T/2832_G/5386_G/6513_---/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,XXXXXYYYYYYYYYYYYYYY,8393-11288,5
2,XAF,BA.1.1.7,BA.2.9*,2470_T/2832_G/5386_G/8393_A,11288_---------/12880_T/15714_T/17410_T/19955_...,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY,8393-11288,427
3,XAF,BA.1.1.7,BA.2.9*,2470_T/2832_G/5386_G/8393_A,11288_---------/12880_T/15714_T/17410_T/21618_...,XXXXYYYYYYYYYYYYYYYYYYYYYYYYYY,8393-11288,2
4,XAG,BA.1.1*,BA.2.9*,2470_T/2832_G/5386_G/6513_---,9534_T/9866_T/10198_T/10447_A/11288_---------/...,XXXXYYYYYYYYYYYYYYYYYYYYYY,6513-9534,17


In [5]:
def _get_breakpoints(mut_patt, left):
    pos = [x.split("_")[0] for x in mut_patt.split("/")]
    if left:
        return pos[0]
    return pos[-1]


# Classify every Table S4 event by the number of parent switches in its
# mutation pattern. A pattern with exactly one "XY" or "YX" switch is "simple":
# the parent left of the switch, the parent right of it, and the interval
# between the last left-parent mutation and the first right-parent mutation
# are recorded. Every other pattern is "complex" and gets "n/a" placeholders.
data = []
for _, event in table_s4_df.iterrows():
    pattern = event["mutation_pattern"]
    x_to_y = pattern.count("XY")
    y_to_x = pattern.count("YX")
    if x_to_y + y_to_x != 1:
        event_type = "complex"
        left_parent = right_parent = "n/a"
        left_pos = right_pos = "n/a"
    else:
        event_type = "simple"
        # (lineage column, mutation column) of the parent on each side.
        if x_to_y == 1:
            left_keys = ("lineage_X", "X_mutations")
            right_keys = ("lineage_Y", "Y_mutations")
        else:
            left_keys = ("lineage_Y", "Y_mutations")
            right_keys = ("lineage_X", "X_mutations")
        left_parent = event[left_keys[0]]
        right_parent = event[right_keys[0]]
        left_pos = _get_breakpoints(event[left_keys[1]], left=False)
        right_pos = _get_breakpoints(event[right_keys[1]], left=True)
    # Key order determines the column order of the resulting DataFrame.
    data.append(
        {
            "pango": event["X_series"],
            "is_in_results": "TRUE",
            "type": event_type,
            "parent_left_pango": left_parent,
            "parent_right_pango": right_parent,
            "interval_left": left_pos,
            "interval_right": right_pos,
            "num_epidemic_recombs": event["Number_of_epidemic_recombinant"],
        }
    )
table_s4_df = pd.DataFrame(data)
table_s4_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,num_epidemic_recombs
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21765.0,43
1,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11288.0,5
2,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11288.0,427
3,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11288.0,2
4,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9534.0,17
5,XAJ,True,complex,,,,,43
6,XAJ,True,complex,,,,,15
7,XAJ,True,complex,,,,,6
8,XAL,True,simple,BA.1.1*,BA.2.23,15240.0,19955.0,2
9,XAM,True,simple,BA.1.1*,BA.2.9*,6513.0,9344.0,12


In [6]:
# For each Pango X keep only the event with the largest number of epidemic
# recombinants; idxmax gives the row label of that event within each group.
recombs_by_pango = table_s4_df.groupby("pango")["num_epidemic_recombs"]
idx = recombs_by_pango.idxmax()
table_s4_df = table_s4_df.loc[idx].reset_index(drop=True)
table_s4_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,num_epidemic_recombs
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21765.0,43
1,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11288.0,5
2,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11288.0,427
3,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9534.0,17
4,XAJ,True,complex,,,,,43
5,XAL,True,simple,BA.1.1*,BA.2.23,15240.0,19955.0,2
6,XAM,True,simple,BA.1.1*,BA.2.9*,6513.0,9344.0,12
7,XB,True,complex,,,,,3332
8,XBB,True,simple,BJ.1,BM.1.1.1,22109.0,22674.0,97
9,XBD,True,simple,BA.2.75.2*,BF.3.1*,23019.0,24620.0,268


#### Other putative recombination events

In [7]:
# Per-sample table of putative recombinants, read from the main branch of the
# CovRecomb repository.
recomb_url = "https://raw.githubusercontent.com/wuaipinglab/CovRecomb/refs/heads/main/CovRecomb-Global-Version/putative_recombinants/putative%20recombinants.csv"
recomb_df = pd.read_csv(filepath_or_buffer=recomb_url)
# Show the available columns.
recomb_df.columns

Index(['sample_id', 'collect_date', 'pango_lineage', 'lineage_X', 'lineage_Y',
       'mutation_pattern', 'raw_p_value', 'adjusted_p_value', 'X_mutations',
       'Y_mutations', 'shared_mutations', 'denovo_mutations', 'region',
       'country'],
      dtype='object')

In [8]:
# NOTE: pango_lineage is the Pango assigned to the sample (from GISAID).
# Count the samples labelled exactly with each Pango X. The count is only
# useful for inspection: uncomment the print below to see it.
for p in pangos:
    # Vectorised Series.sum() instead of the builtin sum(), which would walk
    # the boolean Series element by element in Python for every label.
    num_pango_lineage = (recomb_df.pango_lineage == p).sum()
    #print(p, num_pango_lineage)

In [9]:
# For every Pango X, gather the putative recombinants whose assigned lineage is
# that label or starts with "<label>.", split them into simple events (exactly
# one "XY"/"YX" switch in the mutation pattern) and complex events, and report
# the most frequent (left parent, right parent, interval) combination when
# simple events are at least as numerous as complex ones.
data = []
for p in pangos:
    num_simple = 0
    num_complex = 0
    recomb_counts = Counter()
    # NOTE: Lumps together XBB.* and other X?.*
    # na=False makes samples with a missing lineage an explicit non-match
    # instead of letting NaN leak into the boolean mask.
    is_p_or_sublineage = (recomb_df.pango_lineage == p) | (
        recomb_df.pango_lineage.str.startswith(p + ".", na=False)
    )
    for _, row in recomb_df[is_p_or_sublineage].iterrows():
        num_XY = row["mutation_pattern"].count("XY")
        num_YX = row["mutation_pattern"].count("YX")
        if num_XY + num_YX == 1:
            num_simple += 1
            if num_XY == 1:
                parent_left_pango = row["lineage_X"]
                parent_right_pango = row["lineage_Y"]
                interval_left = _get_breakpoints(row["X_mutations"], left=False)
                interval_right = _get_breakpoints(row["Y_mutations"], left=True)
            else:
                parent_left_pango = row["lineage_Y"]
                parent_right_pango = row["lineage_X"]
                interval_left = _get_breakpoints(row["Y_mutations"], left=False)
                interval_right = _get_breakpoints(row["X_mutations"], left=True)
            recomb_counts[(parent_left_pango, parent_right_pango, interval_left, interval_right)] += 1
        else:
            num_complex += 1
    # Summarise results per Pango X.
    # Start from the placeholder record (the "complex" outcome) and override
    # fields for the other outcomes; the key order fixes the column order.
    summary = {
        "pango": p,
        "is_in_results": "TRUE",
        "type": "complex",
        "parent_left_pango": "n/a",
        "parent_right_pango": "n/a",
        "interval_left": "n/a",
        "interval_right": "n/a",
        "max_count": "n/a",
        "num_simple": num_simple,
        "num_complex": num_complex,
    }
    if num_simple + num_complex == 0:
        # No sample carries this label at all.
        summary["is_in_results"] = "FALSE"
        summary["type"] = "n/a"
    elif num_simple >= num_complex:
        # num_simple >= 1 here, so recomb_counts is non-empty. most_common(1)
        # yields the top combination and its count in a single pass.
        (
            (
                most_common_parent_left,
                most_common_parent_right,
                most_common_interval_left,
                most_common_interval_right,
            ),
            max_count,
        ) = recomb_counts.most_common(1)[0]
        summary.update(
            {
                "type": "simple",
                "parent_left_pango": most_common_parent_left,
                "parent_right_pango": most_common_parent_right,
                "interval_left": most_common_interval_left,
                "interval_right": most_common_interval_right,
                "max_count": max_count,    # Most common
            }
        )
    data.append(summary)
recomb_df = pd.DataFrame(data)
recomb_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right,max_count,num_simple,num_complex
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21765.0,43.0,43,0
1,XAA,False,,,,,,,0,0
2,XAB,True,simple,BA.1.1*,BA.2.31,6513.0,9344.0,2.0,3,0
3,XAC,False,,,,,,,0,0
4,XAD,False,,,,,,,0,0
5,XAE,True,simple,BA.2.3.16,B.1.1.322,27807.0,28877.0,3.0,3,0
6,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11288.0,277.0,292,0
7,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9344.0,24.0,38,0
8,XAH,False,,,,,,,0,0
9,XAJ,True,complex,,,,,,1,76


#### Merging the results above

In [10]:
# Build one record per Pango X: use the Table S4 summary when it has an entry
# for the label, otherwise fall back to the putative-recombinants summary.
merged_columns = (
    "pango",
    "is_in_results",
    "type",
    "parent_left_pango",
    "parent_right_pango",
    "interval_left",
    "interval_right",
)
data = []
for p in pangos:
    in_table_s4 = table_s4_df.pango == p
    if in_table_s4.any():
        row = table_s4_df[in_table_s4]
    else:
        row = recomb_df[recomb_df.pango == p]
    # Take the value of the first matching row for every output column.
    data.append({column: row[column].to_list()[0] for column in merged_columns})
merged_df = pd.DataFrame(data)
merged_df

Unnamed: 0,pango,is_in_results,type,parent_left_pango,parent_right_pango,interval_left,interval_right
0,XA,True,simple,B.1.177.18,B.1.1.7*,21255.0,21765.0
1,XAA,False,,,,,
2,XAB,True,simple,BA.1.1*,BA.2.31,8393.0,11288.0
3,XAC,False,,,,,
4,XAD,False,,,,,
5,XAE,True,simple,BA.2.3.16,B.1.1.322,27807.0,28877.0
6,XAF,True,simple,BA.1.1.7,BA.2.9*,8393.0,11288.0
7,XAG,True,simple,BA.1.1*,BA.2.9*,6513.0,9534.0
8,XAH,False,,,,,
9,XAJ,True,complex,,,,


In [11]:
# Write the merged per-Pango table into the data directory as a
# comma-separated file, without the row index.
csv_file = data_dir.joinpath("bigtable_covrecomb.csv")
merged_df.to_csv(path_or_buf=csv_file, index=False, sep=",")