In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import tszip

In [None]:
# Directory holding every input file read by this notebook, given relative to
# the notebook's working directory.
data_dir = Path("..") / "data"

In [None]:
# Locate the combined lineage report inside the data directory, read it into a
# DataFrame, and preview its first five rows.
report_file = data_dir.joinpath("combined.lineage_report.csv.xz")
report_df = pd.read_csv(filepath_or_buffer=report_file)
report_df.head(n=5)

In [None]:
# Locate the tszip-compressed file inside the data directory and decompress it.
# The bare name on the last line makes the notebook display the loaded object.
ts_file = data_dir.joinpath("v1-beta1_2023-02-21.pp.md.bpshift.ts.dated.il.tsz")
ts = tszip.decompress(ts_file)
ts

In [None]:
# Sanity check: the report should have exactly one row per node, because the
# comparison cells below pair report rows with nodes purely by position.
report_df.shape[0] == ts.num_nodes

In [None]:
import os
import urllib.request

# NOTE: Both reference files are pinned to pango-designation release v1.33.
# The tag is defined once so the two URLs cannot drift apart.
pango_designation_tag = "v1.33"
pango_designation_url = (
    "https://raw.githubusercontent.com/cov-lineages/pango-designation/"
    f"refs/tags/{pango_designation_tag}"
)

alias_key_file = data_dir / "alias_key.json"
lineage_notes_file = data_dir / "lineage_notes.txt"

# Local destination -> remote source for each file needed to build the pangonet.
pango_downloads = {
    alias_key_file: f"{pango_designation_url}/pango_designation/alias_key.json",
    lineage_notes_file: f"{pango_designation_url}/lineage_notes.txt",
}

for dest, url in pango_downloads.items():
    if dest.exists():
        # Already downloaded on a previous run; do not hit the network again.
        continue
    # Download to a temporary sibling first and move it into place only once
    # the transfer has finished, so an interrupted download cannot leave a
    # truncated file that would satisfy the `exists` check on the next run.
    partial = dest.parent / (dest.name + ".part")
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, dest)

In [None]:
import nb_utils

# Build the pangonet object from the two reference files; it is passed to
# nb_utils.get_pangonet_distance in the comparison cells further down.
pangonet_inputs = {
    "alias_key_file": alias_key_file,
    "lineage_notes_file": lineage_notes_file,
}
pangonet = nb_utils.initialise_pangonet(**pangonet_inputs)

Examine Pango labels for sample nodes

In [None]:
# One entry per node, in node order: the "Viridian_pangolin" metadata value,
# or None for nodes whose metadata has no such key.
pango_viridian = [
    node.metadata.get("Viridian_pangolin", None)
    for node in ts.nodes()
]
len(pango_viridian)

In [None]:
# Compare, position by position, the lineage in the report against the
# "Viridian_pangolin" label stored on each node.
#   num_none      - nodes with no Viridian label (skipped)
#   num_identical - nodes where the two labels match exactly
#   pangonet_diff - one pangonet distance per discrepant pair
num_none = 0    # Check tally of non-sample nodes
num_identical = 0
pangonet_diff = []

for report_label, viridian_label in zip(report_df.lineage, pango_viridian):
    if viridian_label is None:
        num_none += 1
        continue
    if report_label == viridian_label:
        num_identical += 1
        continue
    try:
        pd_dist = nb_utils.get_pangonet_distance(
            pangonet, label_1=report_label, label_2=viridian_label
        )
    except Exception:
        # Record -1 as a sentinel when the distance cannot be computed.
        # Catch Exception rather than BaseException so that interrupting the
        # kernel (KeyboardInterrupt) still stops this loop instead of being
        # silently recorded as a failed lookup.
        pd_dist = -1
    pangonet_diff.append(pd_dist)

In [None]:
# Summarise the comparison for nodes that carry a Viridian label.
print(f"All nodes: {ts.num_nodes}")
print(f"Sample nodes: {ts.num_samples}")
print(f"Non-sample nodes: {ts.num_nodes - ts.num_samples}")
print(f"None: {num_none}")  # Check that it's identical to non-sample nodes
# Scale to a percentage first and round last: rounding the ratio and then
# multiplying by 100 can print float artefacts such as 7.000000000000001%.
perc_identical_samples = round(100 * num_identical / ts.num_samples, ndigits=2)
print(f"Identical labels: {num_identical} ({perc_identical_samples}%)")

In [None]:
# Histogram of the pangonet distances for discrepant pairs, with a log-scaled
# count axis. Pairs whose distance lookup failed appear at -1.
fig, ax = plt.subplots()
ax.set_xlabel("Pango distance")
ax.set_ylabel("Count of discrepant cases")
ax.hist(pangonet_diff, bins=30, log=True);

Examine Pango labels for non-sample nodes

In [None]:
# One entry per node, in node order: the "Imputed_Viridian_pangolin" metadata
# value. Direct indexing is deliberate: a node lacking the key raises KeyError.
pango_imputed = [
    node.metadata["Imputed_Viridian_pangolin"]
    for node in ts.nodes()
]
len(pango_imputed)

In [None]:
# Compare, position by position, the lineage in the report against the
# "Imputed_Viridian_pangolin" label, restricted to nodes that have no
# "Viridian_pangolin" label of their own.
#   num_not_none  - nodes that do have a Viridian label (skipped here)
#   num_identical - nodes where report and imputed labels match exactly
#   pangonet_diff - one pangonet distance per discrepant pair
num_not_none = 0    # Check tally of sample nodes
num_identical = 0
pangonet_diff = []

for report_label, imputed_label, viridian_label in zip(
    report_df.lineage, pango_imputed, pango_viridian
):
    if viridian_label is not None:
        num_not_none += 1
        continue
    if report_label == imputed_label:
        num_identical += 1
        continue
    try:
        pd_dist = nb_utils.get_pangonet_distance(
            pangonet, label_1=report_label, label_2=imputed_label
        )
    except Exception:
        # Record -1 as a sentinel when the distance cannot be computed.
        # Catch Exception rather than BaseException so that interrupting the
        # kernel (KeyboardInterrupt) still stops this loop instead of being
        # silently recorded as a failed lookup.
        pd_dist = -1
    pangonet_diff.append(pd_dist)

In [None]:
# Summarise the comparison for nodes without a Viridian label.
print(f"All nodes: {ts.num_nodes}")
print(f"Sample nodes: {ts.num_samples}")
num_nonsample_nodes = ts.num_nodes - ts.num_samples
print(f"Non-sample nodes: {num_nonsample_nodes}")
print(f"Not None: {num_not_none}")
# Scale to a percentage first and round last: rounding the ratio and then
# multiplying by 100 can print float artefacts such as 7.000000000000001%.
perc_identical = round(100 * num_identical / num_nonsample_nodes, ndigits=2)
print(f"Identical labels: {num_identical} ({perc_identical}%)")

In [None]:
# Histogram of the pangonet distances for discrepant pairs, with a log-scaled
# count axis. Pairs whose distance lookup failed appear at -1.
fig, ax = plt.subplots()
ax.set_xlabel("Pango distance")
ax.set_ylabel("Count of discrepant cases")
ax.hist(pangonet_diff, bins=30, log=True);