# 201201 download updated assembly summaries

In [1]:
from pathlib import Path
import json

from tqdm import tqdm

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [2]:
from Bio import Entrez

# Set the contact e-mail on the Bio.Entrez module before any Entrez request is made further down.
Entrez.email = 'mjlumpe@gmail.com'

In [3]:
from midas.db.models import Genome

In [4]:
# Names used to build this notebook's output directory:
# the experiment folder and the date-prefixed notebook name.
datestr = '201201'
exptname = '201031-database-v1.1-software-version-migration'
nbname = f'{datestr}-download-updated-assembly-summaries'

In [5]:
# Directory receiving this notebook's intermediate results; it is created
# together with any missing parent directories, and reused if already present.
intermediate_out = Path('../../data/intermediate/', exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Database connection

In [6]:
# SQLite database file 'db.sqlite', given as a relative path in the URL.
engine = create_engine('sqlite:///db.sqlite')
# Session factory bound to that engine.
Session = sessionmaker(bind=engine)

In [7]:
# Single session used for the Genome query below.
session = Session()

In [8]:
# Map each Genome row's key to its Entrez ID.
assembly_ids = {
    genome.key: genome.entrez_id
    for genome in session.query(Genome)
}

## Download assembly summary data

In [9]:
def chunkit(items, chunksize):
    """Yield consecutive slices of ``items`` with at most ``chunksize`` elements each.

    ``items`` must support ``len()`` and slicing. The last slice is shorter
    when the length of ``items`` is not a multiple of ``chunksize``, and
    nothing is yielded for an empty ``items``.
    """
    offsets = range(0, len(items), chunksize)
    yield from (items[lo:lo + chunksize] for lo in offsets)

In [10]:
# Local cache directory holding one JSON summary file per assembly.
summaries_dir = Path('tmp/assembly-summaries')
# parents=True so the cell also works on a fresh checkout where 'tmp/' does not exist yet.
summaries_dir.mkdir(parents=True, exist_ok=True)

In [11]:
# Map each assembly's Entrez ID to the JSON file that caches its summary.
summary_files = {aid: summaries_dir / ('%d.json' % aid) for aid in assembly_ids.values()}

# Only IDs without a cached file are requested, so re-running this cell
# resumes where an earlier run stopped.
to_download = [aid for aid, fp in summary_files.items() if not fp.is_file()]


for ids in tqdm(list(chunkit(to_download, 100))):
    # One esummary request per batch of up to 100 IDs; close the response
    # handle even if parsing the JSON fails.
    handle = Entrez.esummary(db='assembly', id=','.join(map(str, ids)), retmode='json')
    try:
        rdata = json.load(handle)
    finally:
        handle.close()

    for aid in ids:
        summary = rdata['result'][str(aid)]

        # Write to a sibling temp file and rename it into place, so an
        # interrupted run never leaves a truncated file that the
        # `is_file()` check above would later treat as already downloaded.
        fp = summary_files[aid]
        tmp_fp = fp.with_name(fp.name + '.tmp')
        with tmp_fp.open('w') as f:
            json.dump(summary, f)
        tmp_fp.replace(fp)

0it [00:00, ?it/s]


## Extract updated taxonomy IDs

In [12]:
# Collect the 'taxid' field of every cached assembly summary, keyed by the
# assembly's Entrez ID.
taxids = {}

for aid, fp in tqdm(summary_files.items()):
    summary = json.loads(fp.read_text())
    # int() normalizes the value to an integer however the summary stores it.
    taxids[aid] = int(summary['taxid'])

100%|██████████| 50752/50752 [00:02<00:00, 17164.59it/s]


In [13]:
# Save the Entrez ID -> taxonomy ID mapping as JSON in the intermediate
# output directory. JSON object keys are strings, so the integer IDs come
# back as strings when this file is loaded again.
taxids_path = intermediate_out / 'updated-assembly-taxids.json'
taxids_path.write_text(json.dumps(taxids))