In [1]:
import json
import urllib.request
import pandas as pd

In [2]:
# Pango lineage names (all "X"-prefixed) that compare_methods iterates over.
# The order is kept as originally listed; ten names per row for compactness.
pangos = [
    "XE", "XB", "XBB", "XBF", "XN", "XAS", "XBE", "XL", "XJ", "XQ",
    "XP", "XZ", "XBG", "XBK", "XBD", "XW", "XA", "XM", "XAJ", "XY",
    "XAM", "XS", "XF", "XBQ", "XR", "XAP", "XAA", "XAF", "XBM", "XAV",
    "XAU", "XAE", "XAN", "XAG", "XBH", "XC", "XBR", "XH", "XAD", "XG",
    "XAL", "XU", "XAB", "XAH", "XAQ", "XAR", "XAT", "XBJ", "XK", "XT",
    "XV",
]
len(pangos)

51

In [9]:
# https://github.com/jeromekelleher/sc2ts-paper/issues/325#issuecomment-2659019906
# Lineages excluded from the comparison; each group is preceded by its reason.
skip_pangos = [
    # Failed sc2ts missing site QC
    "XAB", "XAH",
    # Not in Viridian v04
    "XAQ", "XAR", "XAT",
    # SKIP FOR NOW but see comment
    "XBJ",
    # Not in Viridian v04
    "XK", "XT", "XV",
    # https://github.com/jeromekelleher/sc2ts-paper/issues/195#issuecomment-2665567347
    "XBQ", "XBR",
]
len(skip_pangos)

11

In [10]:
# Download the Pango alias key (a JSON object keyed by alias prefix) from the
# pango-designation repository and parse it into `alias_key`.
# NOTE(review): this tracks the `master` branch, so the parsed content (and any
# counts derived from it) can change between runs - consider pinning a commit.
alias_key_url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master/pango_designation/alias_key.json"
# An explicit timeout keeps a stalled connection from hanging the notebook
# forever; urlopen raises an error instead once the timeout elapses.
with urllib.request.urlopen(alias_key_url, timeout=60) as response:
    alias_key = json.load(response)

In [11]:
# Tally the alias-key entries whose name starts with "X", split by whether the
# entry holds exactly two elements (reported as "one breakpoint") or any other
# number (reported as "more breakpoints").
x_entries = [entry for name, entry in alias_key.items() if name.startswith("X")]
num_total_x = len(x_entries)
num_one_bkpt = sum(1 for entry in x_entries if len(entry) == 2)
num_multiple_bkpts = num_total_x - num_one_bkpt

print(f"Pango X: {num_total_x}")
print(f"One breakpoint: {num_one_bkpt}")
print(f"More breakpoints: {num_multiple_bkpts}")

Pango X: 144
One breakpoint: 123
More breakpoints: 21


In [12]:
def is_concordant(*, ground_truth, query, aliases=None):
    """Return True if `query` equals `ground_truth` or is one of its descendants.

    Both names are first normalised: a trailing "*" is dropped and the leading
    dot-separated component is replaced by its expansion from the alias table
    when that expansion is a non-empty string. Entries that are empty or not
    strings (e.g. list-valued entries) are left unexpanded; previously a
    list-valued entry made ``".".join`` raise ``TypeError``.

    Parameters
    ----------
    ground_truth : str
        Reference Pango name, optionally ending in "*".
    query : str
        Pango name to test against the reference.
    aliases : mapping, optional
        Alias prefix -> expansion. Defaults to the notebook-level `alias_key`.

    Returns
    -------
    bool
        True when the normalised query is identical to the normalised ground
        truth, or starts with the normalised ground truth followed by ".".
    """
    if aliases is None:
        aliases = alias_key

    def _remap(name):
        if name.endswith("*"):
            name = name[:-1]
        prefix, *rest = name.split(".")
        expansion = aliases.get(prefix, "")
        # Only string expansions can be spliced into a dotted name.
        if isinstance(expansion, str) and expansion != "":
            name = ".".join([expansion, *rest])
        return name

    pango_gt = _remap(ground_truth)
    pango_cp = _remap(query)
    # The trailing "." stops e.g. "B.1.2" from matching "B.1.21".
    return pango_cp == pango_gt or pango_cp.startswith(pango_gt + ".")

In [13]:
def compare_methods(df, comparator):
    """Compare a method's recombinant parent calls against the ground truth.

    For every name in `pangos` that is not in `skip_pangos`, the first row of
    `df` for the "groundtruth" method and the first row for `comparator` are
    looked up, and their left/right parents are checked with `is_concordant`.
    Discrepant lineages are printed as they are found.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "method", "pango", "left_pango_parent" and
        "right_pango_parent".
    comparator : str
        Value of the "method" column to compare against "groundtruth".

    Returns
    -------
    tuple of (list, list, list)
        Lineages that are concordant, discrepant, and those for which the
        comparator reports "na" for either parent, in that order.

    Raises
    ------
    IndexError
        If a lineage has no row for the ground truth or for `comparator`.
    """
    df_gt = df[df["method"] == "groundtruth"]
    df_cp = df[df["method"] == comparator]
    # Hoisted so the membership test does not rescan the list per lineage.
    skipped = set(skip_pangos)

    list_concordant = []
    list_discrepant = []
    list_nonrecomb = []

    for p in pangos:
        if p in skipped:
            continue

        tmp_df_gt = df_gt[df_gt["pango"] == p]
        tmp_df_cp = df_cp[df_cp["pango"] == p]

        # `.iloc[0]` on an empty selection raises a bare out-of-bounds
        # IndexError; keep the exception type but say what is missing.
        for method_name, rows in (("groundtruth", tmp_df_gt), (comparator, tmp_df_cp)):
            if rows.empty:
                raise IndexError(
                    f"No row for method {method_name!r} and Pango lineage {p!r}"
                )

        row_gt = tmp_df_gt.iloc[0]
        row_cp = tmp_df_cp.iloc[0]
        left_pango_parent_gt = row_gt["left_pango_parent"]
        left_pango_parent_cp = row_cp["left_pango_parent"]
        right_pango_parent_gt = row_gt["right_pango_parent"]
        right_pango_parent_cp = row_cp["right_pango_parent"]

        # "na" for either parent means the comparator made no recombinant call.
        if left_pango_parent_cp == "na" or right_pango_parent_cp == "na":
            list_nonrecomb.append(p)
            continue

        is_left_concordant = is_concordant(
            ground_truth=left_pango_parent_gt,
            query=left_pango_parent_cp,
        )
        is_right_concordant = is_concordant(
            ground_truth=right_pango_parent_gt,
            query=right_pango_parent_cp,
        )
        if is_left_concordant and is_right_concordant:
            list_concordant.append(p)
        else:
            list_discrepant.append(p)
            print(
                p,
                "left", [left_pango_parent_gt, left_pango_parent_cp],
                "right", [right_pango_parent_gt, right_pango_parent_cp],
            )

    return (list_concordant, list_discrepant, list_nonrecomb)

In [14]:
# Load the per-method parent assignments, run both comparisons (each prints
# its discrepant lineages as it goes), then print the summary counts.
methods_df = pd.read_csv("../data/methods_comparison.csv")

print("Discrepant recombinants: GT vs sc2ts")
concordant_sc2ts, discrepant_sc2ts, nonrecomb_sc2ts = compare_methods(
    methods_df, comparator="sc2ts"
)
print("\n")

print("Discrepant recombinants: GT vs RH-GISAID")
concordant_rh, discrepant_rh, nonrecomb_rh = compare_methods(
    methods_df, comparator="recombinhunt_gisaid"
)
print("\n")

# One (heading, sc2ts result, RH-GISAID result) triple per summary section;
# sections are separated by the same blank-line print as before.
summary_sections = (
    ("Concordant recombinants", concordant_sc2ts, concordant_rh),
    ("Non-recombinants", nonrecomb_sc2ts, nonrecomb_rh),
)
for section_idx, (heading, from_sc2ts, from_rh) in enumerate(summary_sections):
    if section_idx > 0:
        print("\n")
    print(heading)
    print(f"GT vs sc2ts: {len(from_sc2ts)}")
    print(f"GT vs RH-GISAID: {len(from_rh)}")

Discrepant recombinants: GT vs sc2ts
XBB left ['BJ.1', 'BA.2.10'] right ['BM.1.1.1', 'BM.1.1.1']
XBF left ['BA.5.2', 'BA.5.2.1'] right ['CJ.1', 'BM.1.1.1']
XBH left ['BA.2.3.17', 'BA.2.1'] right ['BA.2.75.2', 'BA.2.75.2']


Discrepant recombinants: GT vs RH-GISAID
XAS left ['BA.5*', 'BA.4.8'] right ['BA.2*', 'BA.2.65']
XBE left ['BA.5.2*', 'BA.5.2.6'] right ['BE.4.1', 'BE.4']
XBM left ['BA.2.76', 'BF.3'] right ['BF.3', 'BA.2.76;BF.3']
XBH left ['BA.2.3.17', 'BA.2.75.2'] right ['BA.2.75.2', 'BA.2.3.17;BA.2.75.2']


Concordant recombinants
GT vs sc2ts: 26
GT vs RH-GISAID: 30


Non-recombinants
GT vs sc2ts: 11
GT vs RH-GISAID: 6
