In [4]:
import pandas as pd
import numpy as np
from screed import ScreedDB

#### Steps
* Log in to gisaid.org and download the metadata and the full MSA, as described [on Nextstrain's site](https://docs.nextstrain.org/projects/ncov/en/latest/analysis/data-prep.html#curate-data-from-gisaid-search-and-downloads) (except download the "MSA full" file rather than the FASTA file)
* Unpack the metadata and MSA tar files
* Run the processing script on the MSA file (`../scripts/process_msa_headers.sh`); remove its "USA" filter if you want to use all sequences, which produces a much larger file
* Use this logic to generate custom MSA based on metadata queries

Alternative [here](https://docs.nextstrain.org/projects/ncov/en/latest/)
* Nextstrain and augur provide methods to subsample from MSA for phylogenetics
* Can use intermediate output as raw MSA
 * Using augur tools

In [5]:
# Load the tab-separated GISAID metadata table, parsing both date columns
# into datetimes at read time.
metadata = pd.read_csv(
    "/scratch/jho5ze/bionets/covid/variant_data/metadata.tsv",
    sep="\t",
    parse_dates=["Collection date", "Submission date"],
)

def get_nth_slash(row, n):
    """Return the n-th "/"-separated field of ``row``, stripped of whitespace.

    Parameters
    ----------
    row : str
        A "/"-delimited string. Non-string values (e.g. a NaN produced by a
        missing cell) are tolerated.
    n : int
        Index of the field to return; negative indices count from the end.

    Returns
    -------
    str or float
        The stripped field, or ``np.nan`` when ``row`` has no ``split``
        method or has fewer fields than ``n`` requires.
    """
    try:
        return row.split("/")[n].strip()
    # Only the two expected "missing data" failures are mapped to NaN:
    #   AttributeError -> row is not a string (e.g. float NaN)
    #   IndexError     -> row has too few "/"-separated fields
    # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit and
    # hide genuine programming errors.
    except (AttributeError, IndexError):
        return np.nan
    
# Split the "/"-separated "Location" string into its first four components.
# The columns are written under their final names (index 1 -> "country",
# index 2 -> "state") instead of being created and then renamed, so re-running
# this cell overwrites them rather than producing duplicate column labels.
location_columns = ["Location_0", "country", "state", "Location_3"]
for i, column in enumerate(location_columns):
    # Pass the index through ``args`` rather than closing over the loop
    # variable in a lambda.
    metadata[column] = metadata["Location"].apply(get_nth_slash, args=(i,))

# Remove this filter to include all sequences, not just USA.
# ``.copy()`` makes the filtered frame independent of the original so the
# column assignment below does not trigger SettingWithCopyWarning.
metadata = metadata[metadata["country"] == "USA"].copy()

# The alias is everything after the second "/" of the virus name. The ``.str``
# methods propagate missing names as NaN instead of raising.
metadata["alias"] = metadata["Virus name"].str.split("/").str[2:].str.join("/")


In [10]:
def msa_from_screed_ids(ids, db=None):
    """Yield FASTA-formatted lines for the requested sequence IDs.

    For every ID found in the database two strings are yielded in order: the
    header line (">" followed by the record's ``"name"``) and the record's
    ``"sequence"`` converted to ``str``. IDs that are not present in the
    database are skipped silently.

    Parameters
    ----------
    ids : iterable
        Sequence identifiers to look up, in the desired output order.
    db : mapping, optional
        Any object supporting ``in`` and ``[]`` lookups whose records expose
        ``"name"`` and ``"sequence"`` keys. Defaults to the notebook-level
        ``msadb``, which must already be defined when the generator is first
        iterated.

    Yields
    ------
    str
        Alternating header and sequence lines.
    """
    if db is None:
        # Resolved lazily (on first iteration), so the function can be
        # defined before the cell that opens ``msadb`` has been run.
        db = msadb
    for seq_id in ids:
        if seq_id not in db:
            continue
        record = db[seq_id]
        yield ">" + record["name"]
        yield str(record["sequence"])

In [6]:
# Open the MSA as a ScreedDB for lookup by sequence ID. `msa_from_screed_ids`
# reads this module-level `msadb`, so this cell must be run before that
# generator is iterated.
# NOTE(review): presumably the FASTA was indexed with screed beforehand — confirm.
msadb = ScreedDB("/scratch/jho5ze/bionets/coevolution/data/msa_0927/usa_msa_0927.fasta")

In [19]:
# Example metadata query: accession IDs of the first few Virginia records.
accessions = (
    metadata.loc[metadata["state"] == "Virginia", "Accession ID"]
    .head()
    .tolist()
)
print(accessions)

['EPI_ISL_429970', 'EPI_ISL_429972', 'EPI_ISL_429971', 'EPI_ISL_429974', 'EPI_ISL_429973']


In [20]:
# Preview the custom MSA: print only the first 100 characters of each
# header/sequence line so the full-length alignment rows do not flood the output.
for line in msa_from_screed_ids(accessions):
    print(line[:100])

>EPI_ISL_429970
-----------------------------------------------------------------------------------------------CAACT
>EPI_ISL_429972
---------------------------------------------------------------AAGGTTTATACCTTCCCAGGTAACAAACCAACCAACT
>EPI_ISL_429971
----------------------------------------------------------------AGGTTTATACCTTCCCAGGTAACAAACCAACCAACT
>EPI_ISL_429974
-----------------------------------------------------------------GGTTTATACCTTCCCAGGTAACAAACCAACCAACT
>EPI_ISL_429973
--------------------------------------------------------------AAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACT
