In [None]:
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np
import sc2ts
import nb_utils

In [None]:
# Directory (relative to this notebook) holding the input data files.
data_dir = Path("..") / "data"

In [None]:
# Optional one-time setup, kept commented out: clone the pangonet repository,
# pip-install it from the checkout, and print the start of its CLI help.
# Remove the leading "#" on each line to run the commands from the notebook.
#! git clone https://github.com/phac-nml/pangonet.git
#! cd pangonet && pip install . && cd -
#! pangonet --help | head

In [None]:
# Locate the two rebar dataset files and hand them to the project helper that
# builds the pangonet object used for lineage-distance lookups.
dataset_dir = Path("dataset")
alias_key_file = dataset_dir.joinpath("rebar", "alias_key.json")
lineage_notes_file = dataset_dir.joinpath("rebar", "lineage_notes.txt")
pangonet = nb_utils.initialise_pangonet(
    alias_key_file,
    lineage_notes_file,
)

In [None]:
# Load the main data object through the project helper and wrap it in
# sc2ts.TreeInfo.
# NOTE(review): `ts` is presumably a tskit tree sequence (its `nodes_time`
# array is read by the plotting code); nb_utils.load() takes no arguments
# here, so the input location must be configured inside nb_utils — confirm.
ts = nb_utils.load()
ti = sc2ts.TreeInfo(ts)

In [None]:
# Read the recombinants table (parsing `date_added` as datetimes) and show
# how many rows it has.
reconmb_file = data_dir.joinpath("recombinants.csv")
recomb_df = pd.read_csv(
    reconmb_file,
    parse_dates=["date_added"],
)
recomb_df.shape[0]

In [None]:
# Keep only the rows whose `net_min_supporting_loci_lft_rgt_ge_4` flag is set,
# renumber them from zero, and list the available columns.
is_high_quality = recomb_df["net_min_supporting_loci_lft_rgt_ge_4"]
df_hq = recomb_df.loc[is_high_quality].reset_index(drop=True)
df_hq.columns

In [None]:
# Row counts before and after the high-quality filter, one per line.
print(
    f"All: {len(recomb_df)}",
    f"High-quality: {len(df_hq)}",
    sep="\n",
)

Compute pangonet distance between pangolin-assigned Pango labels of sc2ts parents

In [None]:
# Pangonet distance between the left and right parent Pango labels of every
# row in df_hq. When either label starts with "X" no lookup is made and the
# entry is set to the sentinel -1 so that the row can be filtered out later.
parent_label_pairs = zip(
    df_hq.parent_left_pango.to_list(),
    df_hq.parent_right_pango.to_list(),
)
nd = np.zeros(len(df_hq), dtype=np.float32)
for pos, (left_label, right_label) in enumerate(parent_label_pairs):
    if left_label.startswith("X") or right_label.startswith("X"):
        nd[pos] = -1
        continue
    nd[pos] = nb_utils.get_pangonet_distance(
        pangonet=pangonet,
        label_1=left_label,
        label_2=right_label,
    )

In [None]:
# Number of rows carrying the -1 sentinel; these cases are excluded.
int(np.count_nonzero(nd == -1))

In [None]:
# Attach the distances as `node_distance`, drop the rows flagged with the
# negative sentinel, renumber the index, and preview the first row.
df_hq = (
    df_hq.assign(node_distance=nd)
    .loc[lambda frame: frame.node_distance >= 0]
    .reset_index(drop=True)
)
df_hq.head(1)

Explore parent pairs, unfolded

In [None]:
# How often each Scorpio label occurs as a parent, on either side.
scorpio_counts = Counter(df_hq.parent_left_scorpio.to_list())
scorpio_counts.update(df_hq.parent_right_scorpio.to_list())
scorpio_counts

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches


def plot_parent_mrca_vs_pango_node_distance(label_color_map, seed=0):
    """
    Scatter-plot, for every row of the notebook-level ``df_hq``, the node-time
    difference between its ``mrca`` and ``recombinant`` nodes (y axis) against
    its ``node_distance`` (x axis, with horizontal jitter).

    Reads the globals ``df_hq`` (columns ``mrca``, ``recombinant``,
    ``node_distance``, ``parent_left_scorpio``, ``parent_right_scorpio``) and
    ``ts`` (``nodes_time``). Draws on a new figure and returns nothing.

    :param dict label_color_map: Maps ``(parent_left_scorpio,
        parent_right_scorpio)`` tuples to a Matplotlib colour. Rows whose pair
        is not a key are drawn in a fainter gray. The keys are also used as
        the legend entries, in the map's order, each written in its colour.
    :param seed: Seed for the jitter generator, so that re-running the cell
        reproduces the same figure. Pass ``None`` for fresh randomness.
    """
    # TODO: Add to recombinants CSV file.
    re_t_mrca = ts.nodes_time[df_hq.mrca] - ts.nodes_time[df_hq.recombinant]
    assert len(df_hq) == len(re_t_mrca)

    # Spread points sharing the same integer distance so they do not overlap.
    # A local, seeded generator keeps the figure reproducible without touching
    # NumPy's global random state.
    j_width = 0.3
    rng = np.random.default_rng(seed)
    jitter = rng.uniform(-j_width, j_width, len(df_hq))
    # Plain arrays: everything below is indexed by position, not index label.
    nd_j = df_hq.node_distance.to_numpy() + jitter
    assert len(df_hq) == len(nd_j)

    _, ax = plt.subplots(1, 1, figsize=(10, 4))

    scorpio_pairs = zip(df_hq.parent_left_scorpio, df_hq.parent_right_scorpio)
    for x, y, scorpio_pair in zip(nd_j, re_t_mrca, scorpio_pairs):
        color = label_color_map.get(scorpio_pair, "gray")
        # Pairs that are not highlighted are drawn fainter.
        alpha = 0.6 if color != "gray" else 0.3
        ax.scatter(y=y, x=x, color=color, alpha=alpha)

    if label_color_map:
        # Text-only legend: invisible handles, with each label coloured like
        # the points it stands for.
        empty_patch = mpatches.Patch(color='none', label='')
        ax.legend(
            labels=list(label_color_map.keys()),
            labelcolor=list(label_color_map.values()),
            handles=[empty_patch] * len(label_color_map),
            fontsize=8,
            frameon=False,
        )

    # NOTE(review): ticks are fixed at 0..20; this assumes no node distance
    # exceeds 20 — confirm against the data.
    max_nd = 21
    ax.set_xticks(ticks=np.arange(max_nd), labels=np.arange(max_nd))
    ax.set_title(f"High-quality recombinants: {len(df_hq)}")
    ax.set_ylabel("Time to MRCA from recombination node (days)")
    ax.set_xlabel("Pango distance between sc2ts parents")

In [None]:
# Highlight every ordered (left, right) combination of the two Delta labels.
focal_label_1 = "Delta (B.1.617.2-like)"
focal_label_2 = "Delta (AY.4-like)"
delta_pairs = [
    (left, right)
    for left in (focal_label_1, focal_label_2)
    for right in (focal_label_1, focal_label_2)
]
delta_color_names = ["tab:blue", "tab:orange", "tab:green", "tab:red"]
plot_parent_mrca_vs_pango_node_distance(
    label_color_map={
        pair: mcolors.TABLEAU_COLORS[name]
        for pair, name in zip(delta_pairs, delta_color_names)
    }
)

In [None]:
# Highlight selected ordered (left, right) combinations of Omicron labels.
focal_label_1 = "Omicron (BA.1-like)"
focal_label_2 = "Omicron (BA.2-like)"
focal_label_4 = "Omicron (BA.4-like)"
focal_label_5 = "Omicron (BA.5-like)"
omicron_pair_colors = [
    ((focal_label_1, focal_label_1), "tab:blue"),
    ((focal_label_1, focal_label_2), "tab:orange"),
    ((focal_label_2, focal_label_1), "tab:green"),
    ((focal_label_2, focal_label_2), "tab:red"),
    # BA.5 involved
    ((focal_label_2, focal_label_5), "tab:purple"),
    ((focal_label_4, focal_label_5), "tab:brown"),
    ((focal_label_5, focal_label_5), "tab:pink"),
    ((focal_label_5, focal_label_4), "tab:olive"),
    ((focal_label_5, focal_label_2), "tab:cyan"),
]
plot_parent_mrca_vs_pango_node_distance(
    label_color_map={
        pair: mcolors.TABLEAU_COLORS[name]
        for pair, name in omicron_pair_colors
    }
)

Fold the parent pairs

In [None]:
# Ten most common parent pairs once (left, right) and (right, left) are
# treated as the same pair, by sorting the two labels within each pair.
folded_pairs = (
    tuple(sorted(pair))
    for pair in zip(
        df_hq.parent_left_scorpio.to_list(),
        df_hq.parent_right_scorpio.to_list(),
    )
)
Counter(folded_pairs).most_common(10)

In [None]:
# One row per unordered parent pair: (first label, second label, colour name).
# Both orientations of a pair are given the same colour.
folded_pair_colors = [
    # BA.5
    ("Omicron (BA.5-like)", "Omicron (BA.5-like)", "tab:blue"),
    # B.1.617.2
    ("Delta (B.1.617.2-like)", "Delta (B.1.617.2-like)", "tab:orange"),
    # BA.2 and BA.5
    ("Omicron (BA.2-like)", "Omicron (BA.5-like)", "tab:green"),
    # BA.1 and BA.2
    ("Omicron (BA.1-like)", "Omicron (BA.2-like)", "tab:red"),
    # BA.4 and BA.5
    ("Omicron (BA.4-like)", "Omicron (BA.5-like)", "tab:purple"),
    # AY.4 and B.1.617.2
    ("Delta (AY.4-like)", "Delta (B.1.617.2-like)", "tab:brown"),
    # BA.1
    ("Omicron (BA.1-like)", "Omicron (BA.1-like)", "tab:pink"),
    # BA.2
    ("Omicron (BA.2-like)", "Omicron (BA.2-like)", "tab:olive"),
    # AY.4
    ("Delta (AY.4-like)", "Delta (AY.4-like)", "tab:cyan"),
]
folded_color_map = {}
for first_label, second_label, color_name in folded_pair_colors:
    pair_color = mcolors.TABLEAU_COLORS[color_name]
    # Insert the listed orientation first, then its mirror; for identical
    # labels the second assignment just rewrites the same key.
    folded_color_map[(first_label, second_label)] = pair_color
    folded_color_map[(second_label, first_label)] = pair_color
plot_parent_mrca_vs_pango_node_distance(label_color_map=folded_color_map)