In [134]:
import ncbi.datasets
import json
import jsonlines
import os
import csv
import zipfile
import pandas as pd
from pyfaidx import Fasta
from google.protobuf.json_format import ParseDict
import ncbi.datasets.v1.reports.virus_pb2 as virus_report_pb2
from collections import Counter
from datetime import datetime, timezone, timedelta

from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

In [2]:
virus_api = ncbi.datasets.VirusApi(ncbi.datasets.ApiClient())

In [87]:
%%time
viral_genomes = virus_api.virus_genome_download(
    "SARS2",
    annotated_only=True,
    complete_only=True,
    host="human",
    geo_location='USA',
    exclude_sequence=False,
    _preload_content=False
)

zipfn = 'sars_cov2_dataset.zip'
with open(zipfn, 'wb') as f:
    f.write(viral_genomes.data)

print(f'Download complete')
!unzip -v {zipfn}

Download complete
Archive:  sars_cov2_dataset.zip
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
    1604  Defl:N      769  52% 2022-04-13 16:59 3de26d82  README.md
6879577429  Defl:N 581520080  92% 2022-04-13 16:59 be20b71e  ncbi_dataset/data/data_report.jsonl
7211083224  Defl:N 2463556574  66% 2022-04-13 17:02 3f167833  ncbi_dataset/data/genomic.fna
    2431  Defl:N     1057  57% 2022-04-13 17:09 7054c588  ncbi_dataset/data/virus_dataset.md
     448  Defl:N      239  47% 2022-04-13 17:09 33223b7a  ncbi_dataset/data/dataset_catalog.json
--------          -------  ---                            -------
14090665136         3045078719  78%                            5 files
Wall time: 10min 43s


In [120]:
%%time
viral_genomes_ref = virus_api.virus_genome_download(
    "SARS2",
    annotated_only=True,
    complete_only=True,
    refseq_only=True,
    host="human",
    exclude_sequence=False,
    _preload_content=False
)

zipfn_ref = 'sars_cov2_refseq.zip'
with open(zipfn_ref, 'wb') as f:
    f.write(viral_genomes_ref.data)

Wall time: 764 ms


In [115]:
%%time
def get_data_reports(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as zip_download:
        with zip_download.open('ncbi_dataset/data/data_report.jsonl') as report_file_handle:
            with jsonlines.Reader(report_file_handle) as json_reader:
                for g in json_reader:
                    yield g

alpha_variant = ['B.1.1.7']
beta_variant = ['B.1.351']
gamma_variant = ['B.1.1.28.1','P.1']
delta_variant = ['B.1.617.2','XD','XF','XS']
lambda_variant = ['B.1.1.1.37','C.37']
omicron_variant = ['B.1.1.529','B.1.1.529.1','BA.1','B.1.1.529.2','BA.2',
                   'B.1.1.529.3','BA.3','B.1.1.529.4','BA.4','B.1.1.529.5','BA.5']
all_variant = alpha_variant+beta_variant+gamma_variant+delta_variant+lambda_variant+omicron_variant
                    
genome_data = []
for g in tqdm(get_data_reports(zipfn)):
    annot = g['annotation']
    for gene in annot.get('genes', []):
        for c in gene.get('cds', []):
            cds_len = 0
            begins, ends = [], []
            for r in c['nucleotide']['range']:
                begin, end = int(r['begin']), int(r['end'])
                cds_len += end-begin+1
                begins.append(begin)
                ends.append(end)
            genome_data.append({
                'Accession': g['accession'],
                'ReleaseDate': g['releaseDate'],
                'PangoClass': g['virus']['pangolinClassification'],
                'Location': g.get('location', {}).get('geographicLocation'),
                'Length': g.get('length', 0),
                'Gene': gene['name'],
                'Protein': c['name'],
                'Begin': min(begins),
                'End': max(ends),
                'CDS_Length': cds_len,
            })

237445it [03:13, 1227.73it/s]

Wall time: 3min 13s





In [117]:
df1 = pd.DataFrame(genome_data)
df1 = df1[(df1.Length>=29000)&(df1.PangoClass.isin(all_variant))&(df1.Gene=='S')]
df1.head()

Unnamed: 0,Accession,ReleaseDate,PangoClass,Location,Length,Gene,Protein,Begin,End,CDS_Length
216287,MW422255.1,2020-12-30,B.1.1.7,"USA: San Diego, California",29763,S,surface glycoprotein,21500,25312,3813
216299,MW422256.1,2020-12-30,B.1.1.7,USA,29817,S,surface glycoprotein,21524,25336,3813
217511,MW430966.1,2021-01-04,B.1.1.7,USA: California,29835,S,surface glycoprotein,21523,25335,3813
217571,MW430974.1,2021-01-04,B.1.1.7,USA: Florida,29861,S,surface glycoprotein,21551,25363,3813
218027,MW440433.1,2021-01-05,B.1.1.7,"USA: New York, Saratoga County",29792,S,surface glycoprotein,21514,25326,3813


In [118]:
df1.shape

(110625, 10)

In [121]:
ref_data = []
for g in tqdm(get_data_reports(zipfn_ref)):
    annot = g['annotation']
    for gene in annot.get('genes', []):
        for c in gene.get('cds', []):
            cds_len = 0
            begins, ends = [], []
            for r in c['nucleotide']['range']:
                begin, end = int(r['begin']), int(r['end'])
                cds_len += end-begin+1
                begins.append(begin)
                ends.append(end)
            ref_data.append({
                'Accession': g['accession'],
                'ReleaseDate': g['releaseDate'],
                'PangoClass': g['virus']['pangolinClassification'],
                'Location': g.get('location', {}).get('geographicLocation'),
                'Length': g.get('length', 0),
                'Gene': gene['name'],
                'Protein': c['name'],
                'Begin': min(begins),
                'End': max(ends),
                'CDS_Length': cds_len,
            })

1it [00:00, 249.87it/s]


In [124]:
df2 = pd.DataFrame(ref_data)
df2 = df2[df2.Gene=='S']
df2

Unnamed: 0,Accession,ReleaseDate,PangoClass,Location,Length,Gene,Protein,Begin,End,CDS_Length
2,NC_045512.2,2020-01-13,B,China,29903,S,surface glycoprotein,21563,25384,3822


In [127]:
%%time
def get_sequences(fastafn):
    sequences = {}
    for seq_record in SeqIO.parse(fastafn,'fasta'):
        sequences[seq_record.id] = seq_record.seq
    return sequences

sequences = get_sequences('ncbi_dataset/data/genomic.fna')
refseq = get_sequences('sars_cov2_refseq/ncbi_dataset/data/genomic.fna')
len(sequences)

Wall time: 1min 42s


237445

In [130]:
refseq_s_begin, refseq_s_end = df2.Begin.item(), df2.End.item()
refseq_id = df2.Accession.item()
refseq_s = refseq[refseq_id][refseq_s_begin:refseq_s_end+1]
len(refseq_s)

3822

In [136]:
s_sequences = [
    SeqRecord(
        refseq_s,
        id=refseq_id,
        name='S',
        description='surface glycoprotein, refseq'
    )
]
for seq_id in tqdm(df1.Accession.tolist()):
    data = df1[df1.Accession==seq_id]
    s_sequences.append(
        SeqRecord(
            sequences[seq_id][data.Begin.item():data.End.item()+1],
            id=seq_id,
            name='S',
            description='surface glycoprotein'
        )
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 110625/110625 [39:35<00:00, 46.56it/s]


In [137]:
SeqIO.write(s_sequences, 'sars_cov2_s_genomic.fasta', 'fasta')

110626

In [138]:
df_s = pd.concat([df2, df1], ignore_index=True)
df_s.head()

Unnamed: 0,Accession,ReleaseDate,PangoClass,Location,Length,Gene,Protein,Begin,End,CDS_Length
0,NC_045512.2,2020-01-13,B,China,29903,S,surface glycoprotein,21563,25384,3822
1,MW422255.1,2020-12-30,B.1.1.7,"USA: San Diego, California",29763,S,surface glycoprotein,21500,25312,3813
2,MW422256.1,2020-12-30,B.1.1.7,USA,29817,S,surface glycoprotein,21524,25336,3813
3,MW430966.1,2021-01-04,B.1.1.7,USA: California,29835,S,surface glycoprotein,21523,25335,3813
4,MW430974.1,2021-01-04,B.1.1.7,USA: Florida,29861,S,surface glycoprotein,21551,25363,3813


In [139]:
df_s.to_csv('sars_cov2_s_report.csv',index=False)