In [None]:
# See associated issue
# https://github.com/jeromekelleher/sc2ts-paper/issues/268


#### Download files

In [9]:
%%bash
# Fetch the Pango designation table (read later as lineages.csv; the notebook
# uses its "taxon" and "lineage" columns). The URL is pinned to a specific
# pango-designation commit hash, so the content does not change between runs.
#
# -O names the output explicitly so that re-running this cell overwrites
# lineages.csv. Without it, wget keeps the existing file and saves the new
# download as lineages.csv.1, leaving duplicates behind on every re-run.
wget --quiet -O lineages.csv https://raw.githubusercontent.com/cov-lineages/pango-designation/16205e716c6a68ff1c3d0f26f0c77478682368ac/lineages.csv


In [1]:
%%bash
# Download the ENA Portal API file report (read_run results) for project
# PRJEB37886 as TSV, requesting the sample_accession and sample_alias fields.
# NOTE(review): limit=0 presumably means "no row limit" - confirm against the
# ENA Portal API documentation.
#
# --fail makes curl exit non-zero on an HTTP error response instead of quietly
# saving the server's error body as the report; --show-error keeps the reason
# visible even though --silent suppresses the progress meter.
curl --silent --show-error --fail -X 'GET' \
  'https://www.ebi.ac.uk/ena/portal/api/filereport?result=read_run&accession=PRJEB37886&fields=sample_accession%2Csample_alias&limit=0&format=tsv&download=true' \
  -H 'accept: */*' > filereport_read_run_PRJEB37886_tsv.txt


In [2]:
%%bash
# Download Figshare file 49694808. --content-disposition tells wget to name the
# local file from the server's Content-Disposition header rather than from the
# URL (which would otherwise give a file called "49694808").
# NOTE(review): presumably this produces run_metadata.v05.tsv.gz, the file the
# parsing section reads - confirm, since the name is chosen by the server.
wget --quiet --content-disposition https://figshare.com/ndownloader/files/49694808


#### Parse files


In [None]:
import pandas as pd

# Pango designations. Columns used by this notebook: "taxon" and "lineage".
pango = pd.read_csv("lineages.csv")

# Derive the sample name as the second "/"-separated field of each taxon string,
# so it can be matched against the ENA sample names.
pango["sample_name"] = [taxon.split("/")[1] for taxon in pango["taxon"]]

# Preview a single row to check the parsed columns.
pango.head(1)


In [None]:
# ENA file report (tab-separated). Columns used by this notebook:
# "run_accession" and "sample_alias".
ena = pd.read_csv("filereport_read_run_PRJEB37886_tsv.txt", sep="\t")

# Derive the sample name as the second "/"-separated field of each sample alias,
# mirroring how the Pango taxon strings are parsed.
ena["sample_name"] = [alias.split("/")[1] for alias in ena["sample_alias"]]

# Preview a single row to check the parsed columns.
ena.head(1)


In [None]:
# Viridian run metadata (tab-separated). Columns used by this notebook:
# "Run", "Date_tree", "Viridian_pangolin" and "Country".
#
# Built as a single chain so the new column is added to a fresh frame rather
# than assigned onto a boolean-filtered slice (which triggers pandas'
# SettingWithCopyWarning).
viridian = (
    pd.read_csv("run_metadata.v05.tsv.gz", sep="\t")
    # Drop runs whose tree date is the literal string "none".
    .loc[lambda runs: runs["Date_tree"] != "none"]
    # Parse full YYYY-MM-DD dates; anything else is coerced to NaT ...
    .assign(
        parsed_datetime=lambda runs: pd.to_datetime(
            runs["Date_tree"],
            format="%Y-%m-%d",
            errors="coerce",
        )
    )
    # ... and rows that failed to parse are discarded.
    .dropna(subset=["parsed_datetime"])
)

# Preview a single row to check the parsed columns.
viridian.head(1)


#### Search among the COG-UK samples

In [20]:
# Cut-off date (YYYY-MM-DD string) per focal Pango lineage for the COG-UK
# search: the loop that consumes this dict keeps only runs whose parsed date is
# strictly earlier than the lineage's cut-off, and puts both the lineage and the
# cut-off into the output file name.
# The values were chosen by trial-and-error.
threshold_dates_dict = {
    "B.1.617.1": "2021-04-01",
    "B.1.617.2": "2021-04-01",
    "BA.1": "2021-12-01",
    "BA.2": "2022-01-08",
    "BA.4": "2022-04-01",
}


In [23]:
# For every focal lineage, write the COG-UK runs dated before the lineage's
# cut-off to candidate_seeds_<lineage>_pre-<cut-off>.txt (CSV with a header row
# and the columns Run, Date_tree).
for focal_pango, threshold_date in threshold_dates_dict.items():
    out_file = f"candidate_seeds_{focal_pango}_pre-{threshold_date}.txt"

    # Link the three tables: Pango-designated sample names -> ENA run
    # accessions -> Viridian metadata rows.
    designated_samples = pango.loc[pango["lineage"] == focal_pango, "sample_name"]
    coguk_runs = ena.loc[ena["sample_name"].isin(designated_samples), "run_accession"]
    viridian_samples = viridian[viridian["Run"].isin(coguk_runs)]

    # Keep runs dated strictly before the cut-off.
    # NOTE(review): 2020-12-31 is excluded explicitly; presumably it is a
    # placeholder/default date in the metadata - confirm, the reason is not
    # recorded here.
    sample_dates = viridian_samples["parsed_datetime"]
    is_candidate = (sample_dates < pd.to_datetime(threshold_date)) & (
        sample_dates != pd.to_datetime("2020-12-31")
    )

    viridian_samples.loc[is_candidate, ["Run", "Date_tree"]].to_csv(
        out_file, index=False
    )


#### Search among the South Africa samples for Omicron seeds

In [24]:
# Cut-off date (YYYY-MM-DD string) per focal Pango lineage for the South Africa
# search; values chosen by trial-and-error.
# Caution: this rebinds threshold_dates_dict, replacing the COG-UK cut-offs
# defined earlier in the notebook. Re-running the COG-UK loop after this cell
# would therefore use these South Africa cut-offs instead.
threshold_dates_dict = {
    "BA.1": "2021-10-01",
    "BA.2": "2021-12-01",
    "BA.4": "2022-01-01",
}


In [25]:
# For every focal lineage, write the South African Viridian runs assigned to
# that lineage and dated before its cut-off to
# candidate_seeds_<lineage>_SouthAfrica_pre-<cut-off>.txt (CSV with a header row
# and the columns Run, Date_tree).
for focal_pango, threshold_date in threshold_dates_dict.items():
    out_file = f"candidate_seeds_{focal_pango}_SouthAfrica_pre-{threshold_date}.txt"

    # A run qualifies when all three conditions hold: matching lineage call,
    # sampled in South Africa, and dated strictly before the cut-off.
    is_candidate = (
        (viridian["Viridian_pangolin"] == focal_pango)
        & (viridian["Country"] == "South Africa")
        & (viridian["parsed_datetime"] < pd.to_datetime(threshold_date))
    )

    viridian.loc[is_candidate, ["Run", "Date_tree"]].to_csv(out_file, index=False)
