# 220831 Download genomes

In [2]:
import json
from pathlib import Path
from urllib.request import urlretrieve

In [3]:
import pandas as pd
from Bio import Entrez
from tqdm import tqdm

In [4]:
from entrez_tools.esummary import get_esummary_json
from entrez_tools.db.assembly import seq_url_from_esummary

## Setup

In [5]:
# Contact address set on Bio.Entrez before any Entrez requests are made below.
# NOTE(review): hard-coded personal address — consider reading it from an
# environment variable before sharing this notebook.
Entrez.email = 'mjlumpe@gmail.com'

In [6]:
# Root directory for this genome set; everything downloaded is stored beneath it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
genomes_dir = Path('/home/jared/gambit/data/genomes/220831-candida/')

# Destination for the downloaded '.fna.gz' sequence files.
fasta_dir = genomes_dir / 'fasta'
# parents=True so a missing genomes_dir is created instead of raising
# FileNotFoundError on a fresh machine.
fasta_dir.mkdir(parents=True, exist_ok=True)

# Destination for the cached per-UID esummary records.
esummary_dir = genomes_dir / 'esummary'
esummary_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Date stamp that prefixes this notebook's name.
DATESTR = '220831'
# Notebook identifier, built from the date stamp and a fixed suffix.
NBNAME = '_'.join([DATESTR, '1-download-genomes'])

In [8]:
# Directory for this notebook's intermediate outputs, named after the notebook.
intermediate_data = Path('data-intermediate') / NBNAME
# parents=True: the top-level 'data-intermediate' directory may not exist yet
# on a fresh checkout, which would otherwise raise FileNotFoundError.
intermediate_data.mkdir(parents=True, exist_ok=True)

# Paths of the files this notebook writes.
outfiles = dict(
    urls=intermediate_data / 'genome-urls.json',
)

## Load data

In [9]:
# Table of genomes to download; later cells read its `ncbi_uid` and
# `genbank_acc` columns.
genomes = pd.read_csv('data-src/genomes.csv')

## Fetch esummary data

In [10]:
# Fetch the Entrez esummary record for each assembly UID and cache it on disk.
# UIDs whose cache file already exists are skipped, so re-running the cell only
# fetches what is missing.
# NOTE: the cache files hold JSON despite the '.xml' suffix; the name is kept so
# that previously cached files are still found.
for uid in tqdm(genomes['ncbi_uid']):
    file = esummary_dir / f'{uid}.xml'
    if file.exists():
        continue

    # Always close the response handle, even if JSON parsing fails.
    handle = Entrez.esummary(db='assembly', id=uid, retmode='json')
    try:
        result = json.load(handle)
    finally:
        handle.close()
    data = get_esummary_json(result)[str(uid)]

    # Write under a temporary name and rename once complete, so an interrupted
    # run cannot leave a truncated file that the exists() check would then skip.
    tmp_file = file.with_name(file.name + '.tmp')
    with open(tmp_file, 'w') as f:
        json.dump(data, f)
    tmp_file.replace(file)

100%|██████████| 71/71 [00:00<00:00, 44033.06it/s]


## Get FTP URLs

In [33]:
# Build (or reload) the mapping from assembly UID to sequence URL.
# Keys are stored as strings in both branches: JSON object keys always load
# back as strings, and the download cell looks entries up by str(uid).
if outfiles['urls'].exists():
    with open(outfiles['urls']) as f:
        ftp_urls = json.load(f)

else:
    ftp_urls = dict()

    for uid in genomes['ncbi_uid']:
        with open(esummary_dir / f'{uid}.xml') as f:
            es = json.load(f)

        # The meaning of the second positional argument is defined by
        # entrez_tools and is not visible in this notebook.
        ftp_urls[str(uid)] = seq_url_from_esummary(es, False)

    with open(outfiles['urls'], 'w') as f:
        # json.dump returns None, so its result must not be assigned back to
        # ftp_urls (doing so left the mapping unusable on the first run).
        json.dump(ftp_urls, f)

## Downloads

In [41]:
# Download the sequence file for every genome, skipping files already present.
# itertuples() is a generator, so tqdm needs an explicit total to show progress.
for row in tqdm(genomes.itertuples(), total=len(genomes)):
    file = fasta_dir / (row.genbank_acc + '.fna.gz')
    if file.exists():
        continue

    url = ftp_urls[str(row.ncbi_uid)]

    # Download under a temporary name and rename when finished, so an
    # interrupted transfer is not mistaken for a complete file on re-run.
    part_file = file.with_name(file.name + '.part')
    urlretrieve(url, part_file)
    part_file.replace(file)

71it [04:04,  3.45s/it]
