## Accuracy of pango sample assignments in sc2ts ARG

In this notebook we examine how well the pango assignments computed in the sc2ts ARG agree with those from the source Viridian consensus sequences.

In [21]:
import sc2ts
import tszip
import pathlib
import numpy as np

# Location of the input files loaded below; a relative path, so it is resolved
# against the kernel's current working directory.
datadir = pathlib.Path("..") / "data"

In [8]:
# Open the Viridian dataset; its metadata is joined onto the ARG samples in a later cell.
ds = sc2ts.Dataset(datadir / "viridian_mafft_2024-10-14_v1.vcz.zip")
ds

<sc2ts.dataset.Dataset at 0x7f76da3be6e0>

In [10]:
# Load the sc2ts ARG examined in this notebook from its .trees.tsz file.
ts = tszip.load(datadir / "sc2ts_v1_2023-02-21_pp_dated_remapped_bps_pango_mmps.trees.tsz")

Get the per-node data computed by sc2ts and join it with the source Viridian metadata from the dataset.

In [15]:
# Per-node table from sc2ts, restricted to sample nodes and keyed by sample ID
# so that it can be joined against per-sample metadata.
df = (
    sc2ts.node_data(ts)
    .loc[lambda nodes: nodes.is_sample]
    .set_index("sample_id")
)
df

Unnamed: 0_level_0,pango,scorpio,node_id,is_sample,is_recombinant,num_mutations,max_descendant_samples,date
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SRR11772659,A,.,2,True,False,1,255,2020-01-19
SRR11397727,B,.,3,True,False,0,1,2020-01-24
SRR11397730,B,.,4,True,False,0,1,2020-01-24
SRR11597198,A,.,6,True,False,0,1,2020-01-25
SRR11597221,A,.,7,True,False,0,1,2020-01-25
...,...,...,...,...,...,...,...,...
ERR10937847,XBB.1.5,Omicron (XBB.1.5-like),2688999,True,False,0,1,2023-02-20
ERR10937891,XBB.1.5.62,Omicron (XBB.1.5-like),2689000,True,False,0,1,2023-02-20
ERR10937893,FD.1,Omicron (XBB.1.5-like),2689001,True,False,0,1,2023-02-20
ERR10937945,CH.1.1.3,Omicron (BA.2-like),2689002,True,False,0,1,2023-02-20


In [18]:
# Viridian metadata fields to compare against the sc2ts assignments.
viridian_columns = [
    "Viridian_pangolin",
    "Viridian_scorpio",
    "Viridian_pangolin_1.29",
    "Viridian_scorpio_1.29",
]
# DataFrame.join raises on overlapping column names, so drop any copies left
# behind by a previous run of this cell first; this keeps the cell safe to
# re-run without restarting from the node-data cell.
df = df.drop(columns=viridian_columns, errors="ignore").join(
    ds.metadata.as_dataframe(viridian_columns)
)
df

Unnamed: 0_level_0,pango,scorpio,node_id,is_sample,is_recombinant,num_mutations,max_descendant_samples,date,Viridian_pangolin,Viridian_scorpio,Viridian_pangolin_1.29,Viridian_scorpio_1.29
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SRR11772659,A,.,2,True,False,1,255,2020-01-19,A,.,A,.
SRR11397727,B,.,3,True,False,0,1,2020-01-24,B,.,B,.
SRR11397730,B,.,4,True,False,0,1,2020-01-24,B,.,B,.
SRR11597198,A,.,6,True,False,0,1,2020-01-25,A,.,A,.
SRR11597221,A,.,7,True,False,0,1,2020-01-25,A,.,A,.
...,...,...,...,...,...,...,...,...,...,...,...,...
ERR10937847,XBB.1.5,Omicron (XBB.1.5-like),2688999,True,False,0,1,2023-02-20,XBB.1.5,Omicron (XBB.1.5-like),XBB.1.5,Omicron (XBB.1.5-like)
ERR10937891,XBB.1.5.62,Omicron (XBB.1.5-like),2689000,True,False,0,1,2023-02-20,XBB.1.5.62,Omicron (XBB.1.5-like),XBB.1.5.62,Omicron (XBB.1.5-like)
ERR10937893,FD.1,Omicron (XBB.1.5-like),2689001,True,False,0,1,2023-02-20,FD.1,Omicron (XBB.1.5-like),FD.1,Omicron (XBB.1.5-like)
ERR10937945,CH.1.1.3,Omicron (BA.2-like),2689002,True,False,0,1,2023-02-20,CH.1.1.3,Omicron (BA.2-like),CH.1.1.3,Omicron (BA.2-like)


In [31]:
# Number of samples whose sc2ts pango assignment differs from the
# `Viridian_pangolin_1.29` column.
diff = (df["pango"] != df["Viridian_pangolin_1.29"]).sum()
diff

np.int64(8293)

In [36]:
# Express the current `diff` count as a percentage of all rows in df (2 d.p.).
f"{diff / df.shape[0] * 100:.2f}"

'0.33'

In [37]:
# Number of samples where the two Viridian pangolin columns disagree with each other.
# NOTE: this rebinds `diff`, which the percentage cells also read, so those
# cells report whichever count was computed most recently; run cells in order.
diff = (df["Viridian_pangolin"] != df["Viridian_pangolin_1.29"]).sum()
diff

np.int64(6588)

In [38]:
# Express the current `diff` count as a percentage of all rows in df (2 d.p.).
f"{diff / df.shape[0] * 100:.2f}"

'0.27'

In [27]:
# Number of samples whose sc2ts scorpio call differs from the `Viridian_scorpio` column.
(df["scorpio"] != df["Viridian_scorpio"]).sum()

np.int64(727)

In [28]:
# Number of samples where the two Viridian scorpio columns disagree with each other.
(df["Viridian_scorpio"] != df["Viridian_scorpio_1.29"]).sum()

np.int64(0)

In [30]:
# Show the two scorpio calls side by side for every sample where they differ.
df.loc[
    df["scorpio"] != df["Viridian_scorpio_1.29"],
    ["scorpio", "Viridian_scorpio_1.29"],
]

Unnamed: 0_level_0,scorpio,Viridian_scorpio_1.29
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
SRR14389522,Zeta (P.2-like),.
SRR16617944,Zeta (P.2-like),.
ERR5946784,Delta (B.1.617.2-like) +K417N,Delta (AY.4-like)
ERR8516089,Delta (B.1.617.2-like) +K417N,Delta (B.1.617.2-like)
ERR6165703,Delta (AY.4-like),Delta (B.1.617.2-like)
...,...,...
SRR23601085,Omicron (XBB-like),Omicron (XBB.1.5-like)
SRR23601164,Omicron (XBB-like),Omicron (XBB.1.5-like)
SRR23601292,Omicron (XBB-like),Omicron (XBB.1.5-like)
SRR23601180,Omicron (XBB-like),Omicron (XBB.1.5-like)
