# 210109 Patch 160906 installer FTP URLs

In [1]:
from pathlib import Path
import json
from gzip import GzipFile
from zipfile import ZipFile
import random
from ftplib import FTP

In [2]:
from tqdm import tqdm

In [3]:
from Bio import Entrez
# Contact address set on Bio.Entrez before any Entrez requests are made below.
Entrez.email = 'mjlumpe@gmail.com'

## File paths

In [4]:
# Input: the original 160906 archive (gzip-compressed; opened with GzipFile below).
infiles = {
    'archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_0.9_160906.midas-archive.gz',
}

In [5]:
# Output: the patched archive written at the end of this notebook.
outfiles = {
    'archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_0.9_160906_patched_210109.midas-archive',
}

In [6]:
# Scratch directory (relative to the current working directory) for
# intermediate results; created if missing, reused if it already exists.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

## Open archive file

In [7]:
# The input archive is a zip file inside a gzip stream: GzipFile decompresses
# the outer layer and ZipFile reads members from the resulting file object.
# Neither handle is closed here because later cells keep reading from `archive`.
gz = GzipFile(infiles['archive'])
archive = ZipFile(gz)

In [8]:
# Genome key -> parsed JSON record, for every member stored under 'genomes/'.
genomes = {}

genome_members = [member for member in archive.namelist() if member.startswith('genomes/')]

for member in genome_members:
    with archive.open(member) as stream:
        data = json.loads(stream.read().decode())

    # Blank out the summary payloads that are not needed in the patched archive.
    for field in ('gb_summary', 'gb_tax_summary'):
        data[field] = None

    meta = data['meta']
    meta['gb_summary_meta'] = None

    # Remove the old FTP URL entries entirely (KeyError if one is absent).
    for stale_field in ('refseq_ftp_dir_url', 'refseq_sequence_ftp_url'):
        del meta[stale_field]

    # Keys must be unique across the archive.
    key = data['key']
    assert key not in genomes
    genomes[key] = data

## Functions

In [11]:
def fetch_esummaries(ids):
    """Fetch document summaries from the NCBI 'assembly' database.

    Args:
        ids: Iterable of record IDs to look up. Each is converted with
            ``str()`` for the request and for the lookup in the response.

    Returns:
        Dict mapping each element of ``ids`` (as given) to its entry in the
        ``'result'`` section of the JSON response. Empty if ``ids`` is empty.

    Raises:
        KeyError: If the response has no ``'result'`` section or lacks an
            entry for one of the requested IDs.
    """
    # Materialize once: the IDs are needed both for the request and the result.
    ids = list(ids)
    if not ids:
        # Nothing to look up; do not send a request with an empty ID list.
        return {}

    idstr = ','.join(map(str, ids))
    handle = Entrez.esummary(db='assembly', id=idstr, retmode='json')
    try:
        response = json.load(handle)
    finally:
        # Always release the response handle, even if parsing fails.
        handle.close()

    results = response['result']
    return {uid: results[str(uid)] for uid in ids}

In [12]:
def chunk(items, chunksize):
    """Split an iterable into consecutive lists of at most ``chunksize`` items.

    Args:
        items: Any iterable; it is consumed once, in order.
        chunksize: Maximum number of items per chunk. Must be at least 1.

    Returns:
        List of non-empty lists. Every chunk except possibly the last holds
        exactly ``chunksize`` items. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunksize`` is less than 1.
    """
    if chunksize < 1:
        raise ValueError(f'chunksize must be at least 1, got {chunksize!r}')

    chunks = []
    current = []

    for item in items:
        current.append(item)
        if len(current) >= chunksize:
            chunks.append(current)
            current = []

    # Only keep the remainder if there is one, so callers never see an empty
    # chunk (empty input, or a length that is an exact multiple of chunksize).
    if current:
        chunks.append(current)

    return chunks

In [13]:
def get_genome_url(summary):
    """Derive the genomic FASTA URL from an assembly summary.

    Args:
        summary: Dict for a single assembly. Its ``'ftppath_refseq'`` entry,
            when present, is used as the URL of the assembly's directory.

    Returns:
        ``'<dir_url>/<dir_name>_genomic.fna.gz'``, where ``dir_name`` is the
        last path component of the directory URL. Returns None when the
        entry is missing, empty, or has no usable final path component.
    """
    dir_url = summary.get('ftppath_refseq')
    if not dir_url:
        # Missing key or empty value: there is no directory to build on.
        return None

    # A trailing slash would otherwise produce an empty directory name.
    dir_url = dir_url.rstrip('/')
    _, sep, dir_name = dir_url.rpartition('/')
    if not sep or not dir_name:
        return None

    return f'{dir_url}/{dir_name}_genomic.fna.gz'

## Get new URLs

In [14]:
urls_file = tmpdir / 'urls.json'

# Resume from the URLs saved by an earlier run if the file is there;
# otherwise start with an empty mapping.
new_urls = json.loads(urls_file.read_text()) if urls_file.is_file() else {}

In [15]:
# Genomes that do not have a URL yet (anything already in new_urls is skipped).
_to_download = set(genomes) - new_urls.keys()
# Genome key -> returned summary, for records no URL could be derived from.
failed = dict()

for keys in tqdm(chunk(_to_download, 100)):
    if not keys:
        # Guard against an empty batch so no request is sent without IDs.
        continue

    # Summaries are requested by each genome's 'gb_id'; keep the reverse
    # mapping so results can be filed under the genome key.
    id_to_key = {genomes[key]['gb_id']: key for key in keys}
    summaries = fetch_esummaries(id_to_key)

    for id_, summary in summaries.items():
        key = id_to_key[id_]
        url = get_genome_url(summary)

        if url is None:
            failed[key] = summary
        else:
            new_urls[key] = url

100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


In [16]:
# Save work
# Write the collected URLs to disk as JSON so a later run can reload them.
urls_file.write_text(json.dumps(new_urls))

### Show results

In [17]:
failed

{'refseq/assembly/GCF_000487935.1': {'uid': '75571',
  'error': 'cannot get document summary'},
 'refseq/assembly/GCF_000220025.2': {'uid': '604291',
  'error': 'cannot get document summary'},
 'refseq/assembly/GCF_000026325.1': {'uid': '45288',
  'error': 'cannot get document summary'},
 'refseq/assembly/GCF_000542635.1': {'uid': '103841',
  'error': 'cannot get document summary'}}

In [18]:
# Every genome must be accounted for: it either has a new URL or is in `failed`.
assert set(new_urls).union(failed) == set(genomes)

### Validity check on subset of URLs

Check that a random subset of the URLs is actually present on the NCBI FTP server.

In [19]:
_URL_PREFIX = 'ftp://ftp.ncbi.nlm.nih.gov/'

# Sample up to 500 URLs. The size is capped at the number available because
# random.sample raises ValueError when asked for more items than exist.
_all_urls = list(new_urls.values())
_test_urls = random.sample(_all_urls, min(500, len(_all_urls)))

with FTP('ftp.ncbi.nlm.nih.gov', user='anonymous', passwd='mjlumpe@gmail.com', timeout=999) as ftp:
    for url in tqdm(_test_urls):
        # Strip the scheme and host to get the path relative to the FTP root.
        assert url.startswith(_URL_PREFIX)
        path = url[len(_URL_PREFIX):]

        # Each sampled file must report a size, and that size must exceed
        # 10000 bytes.
        size = ftp.size(path)
        assert size is not None
        assert size > 10000

100%|██████████| 500/500 [00:58<00:00,  8.51it/s]


## Create new archive file

### Update genome data with new URLs

In [20]:
# Point each genome at its new URL, or clear the sequence source entirely
# when no URL was obtained; every record gets the new key version.
for genome_key, genome in genomes.items():
    replacement = new_urls.get(genome_key)

    if replacement is None:
        genome['meta']['sequence_source'] = None
    else:
        genome['meta']['sequence_source']['url'] = replacement

    genome['key_version'] = '1.0.1'

### Write archive

In [21]:
# Build the patched archive as a plain zip file at the output path.
with ZipFile(outfiles['archive'], 'w') as out_archive:
    # Copy non-genome files
    for path in ['info', 'genome_sets/midas/assembly/curated']:
        with out_archive.open(path, 'w') as f:
            # ZipFile.read() returns the member's bytes and closes the member
            # handle itself, so nothing is left open on the input archive.
            f.write(archive.read(path))

    # Write genomes
    for key, data in genomes.items():
        # Each genome record is stored as UTF-8 JSON under 'genomes/<key>'.
        with out_archive.open('genomes/' + key, 'w') as f:
            f.write(json.dumps(data).encode('utf8'))

In [22]:
# Sanity check: the patched archive lists the same member names, in the same
# order, as the input archive.
assert out_archive.namelist() == archive.namelist()