In [1]:
import json
import os
from pathlib import Path

In [2]:
from tqdm import tqdm
from Bio import Entrez

## Setup

In [3]:
# Notebook identifier: a date string followed by a short task name.
# NBNAME is used to name this notebook's output directory.
DATESTR = '211111'
NBNAME = f'{DATESTR}-find-genomes'

In [4]:
# Entrez credentials are read from the environment so that no personal e-mail
# address or API key is stored in (or committed with) the notebook.
# Set NCBI_EMAIL and NCBI_API_KEY before starting the kernel. If a variable is
# unset the attribute is left as None, which is Biopython's own default.
Entrez.email = os.environ.get('NCBI_EMAIL')
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [5]:
# Scratch directory for files created while the notebook runs.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

# Per-assembly summary JSON files are written here (one file per UID).
summaries_dir = tmpdir / 'assembly-summaries'
summaries_dir.mkdir(exist_ok=True)

# Output directory specific to this notebook. parents=True is required because
# the 'data-intermediate' parent may not exist yet (e.g. on a fresh checkout),
# in which case a plain mkdir() raises FileNotFoundError.
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

In [6]:
# Files produced by this notebook, keyed by a short label.
outfiles = {
    'summaries': intermediate_out.joinpath('assembly-summaries.tar.gz'),
}

## Code

In [7]:
def chunk(l, chunksize: int):
    """Yield consecutive slices of ``l``, each holding at most ``chunksize`` items.

    ``l`` must support ``len()`` and slicing. The final slice is shorter when
    ``len(l)`` is not a multiple of ``chunksize``.
    """
    total = len(l)
    yield from (l[lo:lo + chunksize] for lo in range(0, total, chunksize))

## Search

In [8]:
# Entrez search expression: organism "bacteria", restricted to assemblies whose
# RefSeq category is "representative genome" or "reference genome".
# The adjacent literals are concatenated into one line of text; the trailing
# space is part of the value.
TERM = (
    '"bacteria"[Organism] '
    'AND ("representative genome"[RefSeq Category] OR "reference genome"[RefSeq Category]) '
)
# Extra filters left disabled (marked "Not needed?" by the author):
#   AND "all"[Filter] NOT "anomalous"[Filter]
#   AND "latest"[Filter]

In [9]:
# Run the search and parse the result. The `with` block closes the network
# handle once the response has been read (it was previously left open).
# NOTE(review): NCBI documents an upper bound on `retmax`; if the result is
# truncated, the RetMax/Count assertions that follow will fail and the search
# must be paginated with `retstart` instead.
with Entrez.esearch(db='assembly', term=TERM, retmax='99999') as handle:
    response = Entrez.read(handle)

In [10]:
# Check that the single request returned the complete result set: it must
# start at offset 0 and contain as many IDs as the total match count.
count = int(response['Count'])
assert response['RetStart'] == '0', (
    f"expected results to start at offset 0, got RetStart={response['RetStart']!r}"
)
assert response['RetMax'] == str(count), (
    f"search returned {response['RetMax']} of {count} matching UIDs; "
    "the result is truncated and must be fetched in pages using retstart"
)
count

14388

In [11]:
# Copy the returned IDs into a plain list and cross-check its length against
# the total reported by the search.
uids = [*response['IdList']]
assert count == len(uids)

## Download assembly summaries

In [12]:
# Map every assembly UID to the JSON file that holds (or will hold) its summary.
summary_files = {}
for uid in uids:
    summary_files[uid] = summaries_dir.joinpath(f'{uid}.json')

In [13]:
# Only UIDs without a summary file on disk are fetched, so re-running this cell
# after a partial or complete download skips what is already there.
to_download = {uid for uid, f in summary_files.items() if not f.is_file()}

# Request summaries in batches of 100 UIDs.
for chunk_uids in tqdm(list(chunk(list(to_download), 100))):
    # The handle is bound to `handle` rather than `response` so the esearch
    # result stored in `response` is not overwritten; the cells that read
    # `response` therefore remain re-runnable after this one.
    with Entrez.esummary(db='assembly', id=','.join(chunk_uids), retmode='json') as handle:
        data = json.load(handle)

    # The reply must cover exactly the UIDs that were requested.
    assert set(data['result']['uids']) == set(chunk_uids)

    for uid in chunk_uids:
        summary = data['result'][uid]
        target = summary_files[uid]
        # Write to a temporary sibling and rename it into place, so an
        # interrupted run never leaves a truncated '<uid>.json' that the
        # is_file() check above would treat as already downloaded. The
        # temporary name is dot-prefixed so a shell `*` glob over this
        # directory does not pick up leftovers.
        partial = target.with_name(f'.{target.name}.part')
        with open(partial, 'w') as f:
            json.dump(summary, f)
        partial.replace(target)

0it [00:00, ?it/s]


## Save as archive

In [14]:
! cd {str(summaries_dir)}; tar -czf "{str(outfiles['summaries'].absolute())}" *