# Get a set of fixed-date samples to seed VOCs

Getting saltational variants started off in the presence of time travellers is a significant challenge. One way to get some sequences that we are confident are important to these outbreaks, and that have reasonably accurate dates, is to look at the Pango designation data.

We then merge data and figure out which of these are in the Viridian dataset by looking at the ENA data.

See https://github.com/jeromekelleher/sc2ts-paper/issues/268


#### Download files

In [1]:
%%bash
# Pango designation table, pinned to a specific commit so the input is stable.
# Skip the download when lineages.csv already exists: re-running a plain wget
# would fetch the file again and save it as a numbered duplicate
# (lineages.csv.1) rather than reusing the existing copy.
if [ ! -f lineages.csv ]; then
    wget --quiet https://raw.githubusercontent.com/cov-lineages/pango-designation/16205e716c6a68ff1c3d0f26f0c77478682368ac/lineages.csv
fi


In [2]:
%%bash
# ENA file report for project PRJEB37886 (read_run records with the
# sample_accession and sample_alias fields), saved as TSV.
# --fail makes curl exit non-zero on an HTTP error instead of quietly saving
# the error response as if it were the report; --show-error keeps the error
# message visible even though --silent hides the progress meter.
# GET is curl's default method, so no explicit -X is needed.
curl --silent --show-error --fail \
  'https://www.ebi.ac.uk/ena/portal/api/filereport?result=read_run&accession=PRJEB37886&fields=sample_accession%2Csample_alias&limit=0&format=tsv&download=true' \
  -H 'accept: */*' > filereport_read_run_PRJEB37886_tsv.txt


In [3]:
%%bash
# Metadata file from figshare, saved under the server-supplied file name
# (--content-disposition).
# NOTE(review): presumably this is run_metadata.v05.tsv.gz, the file read in
# the "Parse files" section -- confirm. The guard skips the download when that
# file already exists, so re-running the notebook does not fetch it again.
if [ ! -f run_metadata.v05.tsv.gz ]; then
    wget --quiet --content-disposition https://figshare.com/ndownloader/files/49694808
fi


#### Parse files


In [4]:
import pandas as pd

# Pango designations: one row per designated sequence (taxon, lineage).
pango = pd.read_csv("lineages.csv")
# The sample name is the second "/"-separated field of the taxon,
# e.g. "England/PHEP-YYG8X8X/2023" -> "PHEP-YYG8X8X".
pango["sample_name"] = pango["taxon"].map(lambda taxon: taxon.split("/")[1])
pango

Unnamed: 0,taxon,lineage,sample_name
0,Belgium/UZA-UA-48355442/2023,XBZ,UZA-UA-48355442
1,Norway/Ahus-4881/2023,XBZ,Ahus-4881
2,England/PHEP-YYG8X8X/2023,XBZ,PHEP-YYG8X8X
3,Germany/RP-RKI-I-1082931/2022,XBZ,RP-RKI-I-1082931
4,Germany/BY-RKI-I-1083943/2022,XBZ,BY-RKI-I-1083943
...,...,...,...
2587341,SouthAfrica/NICD-N58702/2024,LB.1.3.4,NICD-N58702
2587342,SouthAfrica/NICD-N58691/2024,LB.1.3.4,NICD-N58691
2587343,SouthAfrica/NICD-N58721/2024,LB.1.3.4,NICD-N58721
2587344,SouthAfrica/NICD-N58715/2024,LB.1.3.4,NICD-N58715


In [5]:
# Key the designations by sample name so they can be joined on the index.
pango = pango.set_index(keys="sample_name")

In [6]:
# ENA file report: run accession, sample alias and sample accession per row.
ena = pd.read_table("filereport_read_run_PRJEB37886_tsv.txt")
# The sample name is the second "/"-separated field of the alias,
# e.g. "COG-UK/QEUH-3F6D670" -> "QEUH-3F6D670".
ena["sample_name"] = ena["sample_alias"].map(lambda alias: alias.split("/")[1])
ena


Unnamed: 0,run_accession,sample_alias,sample_accession,sample_name
0,ERR10120258,COG-UK/QEUH-3F6D670,SAMEA110754665,QEUH-3F6D670
1,ERR10120260,COG-UK/QEUH-3F6D69E,SAMEA110754667,QEUH-3F6D69E
2,ERR10120263,COG-UK/QEUH-3F6D6DA,SAMEA110754670,QEUH-3F6D6DA
3,ERR10120264,COG-UK/LSPA-3F69DBB,SAMEA110754671,LSPA-3F69DBB
4,ERR10120276,COG-UK/QEUH-3F6D731,SAMEA110754683,QEUH-3F6D731
...,...,...,...,...
2700696,ERR9654574,COG-UK/QEUH-3D8C798,SAMEA14260946,QEUH-3D8C798
2700697,ERR9654575,COG-UK/QEUH-3D8AC8C,SAMEA14260947,QEUH-3D8AC8C
2700698,ERR9654587,COG-UK/QEUH-3D8C7A7,SAMEA14260959,QEUH-3D8C7A7
2700699,ERR9654607,COG-UK/QEUH-3D8D65E,SAMEA14260979,QEUH-3D8D65E


In [7]:
# Key the ENA records by sample name so they can be joined on the index.
ena = ena.set_index(keys="sample_name")

In [8]:
# Inner join on the shared sample_name index: keeps only sample names that
# appear in both the Pango designations and the ENA report.
pango_ena = pango.join(other=ena, how="inner")
pango_ena

Unnamed: 0_level_0,taxon,lineage,run_accession,sample_alias,sample_accession
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LSPA-325F505B,England/LSPA-325F505B/2022,XBZ,ERR10513123,COG-UK/LSPA-325F505B,SAMEA112160435
QEUH-326B5575,England/QEUH-326B5575/2023,XBZ,ERR10863933,COG-UK/QEUH-326B5575,SAMEA112622902
QEUH-326B81BA,England/QEUH-326B81BA/2023,XBZ,ERR10887233,COG-UK/QEUH-326B81BA,SAMEA112641438
ALDP-94A6E1,England/ALDP-94A6E1/2020,A,ERR4639199,COG-UK/ALDP-94A6E1,SAMEA7359442
ALDP-94A6C3,England/ALDP-94A6C3/2020,A,ERR4639208,COG-UK/ALDP-94A6C3,SAMEA7359451
...,...,...,...,...,...
WSI-3316A5BD,England/WSI-3316A5BD/2023,JN.1.64,ERR12383654,COG-UK/WSI-3316A5BD,SAMEA115054198
WSI-3319FCF6,England/WSI-3319FCF6/2024,JN.1.64,ERR12658112,COG-UK/WSI-3319FCF6,SAMEA115291849
WSI-331EF116,England/WSI-331EF116/2024,JN.1.65,ERR12742963,COG-UK/WSI-331EF116,SAMEA115406129
WSI-331C96F5,England/WSI-331C96F5/2024,JN.1.65,ERR12658286,COG-UK/WSI-331C96F5,SAMEA115292022


In [9]:
# Drop the two input tables; only the joined pango_ena frame is used below.
del pango
del ena

In [10]:
# Viridian run metadata, keyed by run accession (the "Run" column).
# NOTE(review): the saved output shows a warning raised from this read;
# presumably a mixed-dtype warning -- confirm, and consider passing explicit
# dtypes for the affected columns.
viridian = pd.read_table("run_metadata.v05.tsv.gz").set_index(keys="Run")
viridian


  viridian = pd.read_csv("run_metadata.v05.tsv.gz", sep="\t").set_index("Run")


Unnamed: 0_level_0,In_may_2024_preprint,Study,Sample,Experiment,Run_count,Platform,Country,Region,Collection_date,First_created,...,Genbank_N,Viridian_pangolin,Viridian_scorpio,Genbank_pangolin,Genbank_scorpio,Genbank_tree_name,Viridian_cons_len,Viridian_cons_het,Viridian_pangolin_1.29,Viridian_scorpio_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR6546375,T,PRJEB47121,SAMEA9781395,ERX6172603,1,ILLUMINA,Estonia,none,2021-08-02,2021-08-22,...,.,AY.100,Delta (B.1.617.2-like),.,.,.,29810,0,AY.100,Delta (B.1.617.2-like)
ERR6546376,T,PRJEB47121,SAMEA9781396,ERX6172604,1,ILLUMINA,Estonia,none,2021-08-02,2021-08-22,...,.,AY.122,Delta (B.1.617.2-like),.,.,.,29807,3,AY.122,Delta (B.1.617.2-like)
ERR6546377,T,PRJEB47121,SAMEA9781397,ERX6172605,1,ILLUMINA,Estonia,none,2021-08-02,2021-08-22,...,.,B.1.617.2,Delta (B.1.617.2-like),.,.,.,29808,16,B.1.617.2,Delta (B.1.617.2-like)
ERR6546378,T,PRJEB47121,SAMEA9781398,ERX6172606,1,ILLUMINA,Estonia,none,2021-08-02,2021-08-22,...,.,B.1.617.2,Delta (B.1.617.2-like),.,.,.,29808,13,B.1.617.2,Delta (B.1.617.2-like)
ERR6546379,T,PRJEB47121,SAMEA9781399,ERX6172607,1,ILLUMINA,Estonia,none,2021-08-02,2021-08-22,...,.,B.1.617.2,Delta (B.1.617.2-like),.,.,.,29800,18,B.1.617.2,Delta (B.1.617.2-like)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR13177478,F,PRJEB46220,SAMEA114210144,ERX12548855,1,ILLUMINA,Argentina,none,2023-04-20,2024-05-30,...,.,.,.,.,.,.,29757,3,XBB.1.5,Omicron (XBB-like)
ERR13177479,F,PRJEB46220,SAMEA114210145,ERX12548856,1,ILLUMINA,Argentina,none,2023-04-21,2024-05-30,...,.,.,.,.,.,.,29759,0,XBB.1.5.107,Omicron (XBB-like)
ERR13177480,F,PRJEB46220,SAMEA114210146,ERX12548857,1,ILLUMINA,Argentina,none,2023-04-24,2024-05-30,...,.,.,.,.,.,.,29756,2,XBB.1.5,Omicron (XBB-like)
ERR13177481,F,PRJEB46220,SAMEA114210147,ERX12548858,1,ILLUMINA,Argentina,none,2023-04-26,2024-05-30,...,.,.,.,.,.,.,29759,1,XBB.1.5.107,Omicron (XBB-like)


In [11]:
# Re-key by run accession so the rows line up with the Viridian "Run" index.
pango_ena = pango_ena.set_index(keys="run_accession")
pango_ena

Unnamed: 0_level_0,taxon,lineage,sample_alias,sample_accession
run_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERR10513123,England/LSPA-325F505B/2022,XBZ,COG-UK/LSPA-325F505B,SAMEA112160435
ERR10863933,England/QEUH-326B5575/2023,XBZ,COG-UK/QEUH-326B5575,SAMEA112622902
ERR10887233,England/QEUH-326B81BA/2023,XBZ,COG-UK/QEUH-326B81BA,SAMEA112641438
ERR4639199,England/ALDP-94A6E1/2020,A,COG-UK/ALDP-94A6E1,SAMEA7359442
ERR4639208,England/ALDP-94A6C3/2020,A,COG-UK/ALDP-94A6C3,SAMEA7359451
...,...,...,...,...
ERR12383654,England/WSI-3316A5BD/2023,JN.1.64,COG-UK/WSI-3316A5BD,SAMEA115054198
ERR12658112,England/WSI-3319FCF6/2024,JN.1.64,COG-UK/WSI-3319FCF6,SAMEA115291849
ERR12742963,England/WSI-331EF116/2024,JN.1.65,COG-UK/WSI-331EF116,SAMEA115406129
ERR12658286,England/WSI-331C96F5/2024,JN.1.65,COG-UK/WSI-331C96F5,SAMEA115292022


In [12]:
# Inner join on run accession: keeps only the Viridian runs that also have a
# Pango designation via the ENA report.
joined = viridian.join(other=pango_ena, how="inner")
# The inputs are no longer needed once the joined frame exists.
del viridian
del pango_ena

In [13]:
# Display the joined table: Viridian metadata columns followed by the
# taxon, lineage, sample_alias and sample_accession columns from pango_ena.
joined

Unnamed: 0_level_0,In_may_2024_preprint,Study,Sample,Experiment,Run_count,Platform,Country,Region,Collection_date,First_created,...,Genbank_scorpio,Genbank_tree_name,Viridian_cons_len,Viridian_cons_het,Viridian_pangolin_1.29,Viridian_scorpio_1.29,taxon,lineage,sample_alias,sample_accession
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR10000002,T,PRJEB37886,SAMEA110427045,ERX9541018,1,ILLUMINA,United Kingdom,none,2022-07-05,2022-07-28,...,Omicron (BA.5-like),ERR10000002.genbank.OX252821.1,29770,2,BF.26,Omicron (BA.5-like),Scotland/LSPA-3EBE8D5/2022,BF.26,COG-UK/LSPA-3EBE8D5,SAMEA110427045
ERR10000004,T,PRJEB37886,SAMEA110427047,ERX9541020,1,ILLUMINA,United Kingdom,none,2022-07-04,2022-07-28,...,Omicron (BA.5-like),ERR10000004.genbank.OX254093.1,29728,2,BA.5.2,Omicron (BA.5-like),Scotland/LSPA-3EBBC63/2022,BA.5.2,COG-UK/LSPA-3EBBC63,SAMEA110427047
ERR10000005,T,PRJEB37886,SAMEA110427048,ERX9541021,1,ILLUMINA,United Kingdom,none,2022-07-05,2022-07-28,...,Omicron (BA.5-like),ERR10000005.genbank.OX253820.1,29770,4,BF.4,Omicron (BA.5-like),Scotland/LSPA-3EBE8E4/2022,BF.4,COG-UK/LSPA-3EBE8E4,SAMEA110427048
ERR10000006,T,PRJEB37886,SAMEA110427049,ERX9541022,1,ILLUMINA,United Kingdom,none,2022-07-02,2022-07-28,...,Omicron (BA.5-like),ERR10000006.genbank.OX253426.1,29770,2,BA.5.2.21,Omicron (BA.5-like),Scotland/LSPA-3EBBDD9/2022,BA.5.2,COG-UK/LSPA-3EBBDD9,SAMEA110427049
ERR10000017,T,PRJEB37886,SAMEA110427060,ERX9541033,1,ILLUMINA,United Kingdom,none,2022-07-04,2022-07-28,...,Omicron (BA.5-like),ERR10000017.genbank.OX254399.1,29471,3,BA.5.1.22,Omicron (BA.5-like),Scotland/LSPA-3EBC0DF/2022,BA.5.1.22,COG-UK/LSPA-3EBC0DF,SAMEA110427060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR12658247,F,PRJEB37886,SAMEA115291983,ERX12032539,1,ILLUMINA,United Kingdom,none,2024-01-30,2024-02-13,...,.,.,29773,0,JN.1.39,Omicron (BA.2-like),England/WSI-331C9C6C/2024,JN.1.39,COG-UK/WSI-331C9C6C,SAMEA115291983
ERR12658261,F,PRJEB37886,SAMEA115291997,ERX12032553,1,ILLUMINA,United Kingdom,none,2024-01-28,2024-02-13,...,.,.,29474,0,JN.1.39.3,Omicron (BA.2-like),England/WSI-331C9AF9/2024,JN.1.39.3,COG-UK/WSI-331C9AF9,SAMEA115291997
ERR12658286,F,PRJEB37886,SAMEA115292022,ERX12032578,1,ILLUMINA,United Kingdom,none,2024-01-29,2024-02-13,...,.,.,29773,1,JN.1.65,Omicron (BA.2-like),England/WSI-331C96F5/2024,JN.1.65,COG-UK/WSI-331C96F5,SAMEA115292022
ERR12658298,F,PRJEB37886,SAMEA115292034,ERX12032590,1,ILLUMINA,United Kingdom,none,2024-01-24,2024-02-13,...,.,.,29773,2,JN.1.49,Omicron (BA.2-like),England/WSI-331C9E84/2024,JN.1.49,COG-UK/WSI-331C9E84,SAMEA115292034


These should all be COG-UK samples now. Check the country column as a sanity check.

In [14]:
# Distinct values of the Country column across the joined runs.
joined["Country"].unique()

array(['United Kingdom', 'UNKNOWN'], dtype=object)

In [15]:
# Keep only the columns used below.
seed_columns = ["Date_tree", "lineage", "Viridian_pangolin_1.29"]
joined = joined[seed_columns]

# Drop rows dated exactly 2020-12-31 and rows whose date string is not a full
# 10-character YYYY-MM-DD value.
# NOTE(review): 2020-12-31 is presumably a placeholder/default date -- confirm.
not_excluded_date = joined["Date_tree"] != "2020-12-31"
full_precision = joined["Date_tree"].str.len() == 10
joined = joined[not_excluded_date & full_precision]

#### Search for seed samples 

Use this joined dataframe now to extract some early sequences for each lineage of interest.

In [16]:
def extract_lineage(lineage, max_rows=10, frame=None):
    """
    Return the earliest-dated runs whose designated lineage is ``lineage``.

    Rows are selected by exact equality on the ``lineage`` column (so
    sub-lineages are not included), sorted by ``Date_tree`` in ascending
    order, and truncated to ``max_rows``. The total number of matching rows
    is printed before truncation.

    Parameters
    ----------
    lineage : str
        Lineage name to match exactly, e.g. "BA.2".
    max_rows : int, optional
        Maximum number of rows to return (default 10).
    frame : pandas.DataFrame, optional
        Table to search; must have ``lineage`` and ``Date_tree`` columns.
        Defaults to the notebook-level ``joined`` frame, so existing calls
        behave as before; passing a frame explicitly removes the dependency
        on notebook state.

    Returns
    -------
    pandas.DataFrame
        At most ``max_rows`` matching rows, earliest ``Date_tree`` first.
    """
    if frame is None:
        # Resolved at call time so the function always sees the current
        # notebook-level table.
        frame = joined
    df = frame[frame.lineage == lineage].sort_values("Date_tree")
    print("Got", df.shape[0], " runs")
    return df.head(max_rows)

# Earliest runs designated exactly B.1.617.
extract_lineage(lineage="B.1.617")

Got 0  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [17]:
# Earliest runs designated exactly B.1.617.1.
extract_lineage(lineage="B.1.617.1")

Got 103  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR5461550,2021-02-22,B.1.617.1,B.1.617.1
ERR5461562,2021-02-22,B.1.617.1,B.1.617
ERR5469699,2021-03-02,B.1.617.1,B.1.617.1
ERR5469807,2021-03-04,B.1.617.1,B.1.617.1
ERR5486121,2021-03-04,B.1.617.1,B.1.617.1
ERR5539924,2021-03-04,B.1.617.1,B.1.617
ERR5521603,2021-03-06,B.1.617.1,B.1.617
ERR5531143,2021-03-07,B.1.617.1,B.1.617.1
ERR5532096,2021-03-07,B.1.617.1,B.1.617.1
ERR5532118,2021-03-07,B.1.617.1,B.1.617.1


In [18]:
# Earliest runs designated exactly B.1.617.2.
extract_lineage(lineage="B.1.617.2")

Got 3030  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR5653377,2021-03-18,B.1.617.2,B.1.617.2
ERR5656218,2021-03-18,B.1.617.2,B.1.617.2
ERR5676810,2021-03-23,B.1.617.2,B.1.617.2
ERR5690893,2021-03-28,B.1.617.2,B.1.617.2
ERR5690055,2021-03-30,B.1.617.2,B.1.617.2
ERR5690921,2021-03-30,B.1.617.2,B.1.617.2
ERR5701881,2021-03-30,B.1.617.2,B.1.617.2
ERR5695631,2021-03-30,B.1.617.2,B.1.617.2
ERR5690052,2021-03-31,B.1.617.2,B.1.617.2
ERR5690920,2021-03-31,B.1.617.2,B.1.617.2


In [19]:
# Earliest runs designated exactly BA.1.
extract_lineage(lineage="BA.1")

Got 17873  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR7443564,2021-11-22,BA.1,BA.1
ERR7552222,2021-11-23,BA.1,BA.1
ERR7600669,2021-11-25,BA.1,BA.1
ERR7612412,2021-11-25,BA.1,BA.1
ERR7601682,2021-11-26,BA.1,BA.1
ERR7601847,2021-11-26,BA.1,BA.1
ERR7611335,2021-11-27,BA.1,BA.1
ERR7615361,2021-11-27,BA.1,BA.1
ERR7713581,2021-11-27,BA.1,BA.1
ERR7650807,2021-11-27,BA.1,BA.1


In [20]:
# Earliest runs designated exactly BA.2.
extract_lineage(lineage="BA.2")

Got 34310  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR7965207,2022-01-03,BA.2,BA.2
ERR7966705,2022-01-03,BA.2,BA.2
ERR7970740,2022-01-04,BA.2,BA.2
ERR7972740,2022-01-04,BA.2,BA.2
ERR8000637,2022-01-04,BA.2,BA.2
ERR8004998,2022-01-05,BA.2,BA.2.10
ERR8031838,2022-01-07,BA.2,BA.2
ERR8035855,2022-01-08,BA.2,BA.2
ERR8035243,2022-01-08,BA.2,BA.2
ERR8068119,2022-01-08,BA.2,BA.2


In [21]:
# Earliest runs designated exactly BA.4.
extract_lineage(lineage="BA.4")

Got 780  runs


Unnamed: 0_level_0,Date_tree,lineage,Viridian_pangolin_1.29
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR9460737,2022-03-22,BA.4,BA.4
ERR9478457,2022-03-22,BA.4,BA.4
ERR9618865,2022-04-07,BA.4,BA.4
ERR9824759,2022-04-10,BA.4,BA.4
ERR9624616,2022-04-11,BA.4,BA.4
ERR9623118,2022-04-11,BA.4,BA.4
ERR9701287,2022-04-12,BA.4,BA.4
ERR9645527,2022-04-14,BA.4,BA.4
ERR9840429,2022-04-16,BA.4,BA.4
ERR9654878,2022-04-19,BA.4,BA.4
