# 201205 Download additional taxa

In [1]:
# Experiment / notebook identifiers; they are combined below to build the output directory.
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201205'
nbname = f'{datestr}-download-additional-taxa'

In [2]:
import json
from pathlib import Path
import xml.etree.ElementTree as ET
from datetime import datetime

from tqdm import tqdm
from Bio import Entrez

In [3]:
# Contact address set on Bio.Entrez before any request is issued (see efetch_taxa below).
Entrez.email = 'mjlumpe@gmail.com'

## Paths

In [4]:
# Input paths read by this notebook (disabled entries are kept for reference).
infiles = {
    # 'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    # 'taxonomy_original': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201113-original-genome-taxa/'),
    # 'taxonomy_original_extra': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201124-original-taxa-extra-info/'),
    # 'matches': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201122-taxon-name-matching/'),
    'updated_taxids': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201201-download-updated-assembly-summaries/updated-assembly-taxids.json'),
}

In [5]:
# Output directory for this notebook: <intermediate root>/<exptname>/<nbname>, created if missing.
intermediate_out = Path('../../data/intermediate/', exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

### Already downloaded taxids

In [6]:
# Collect the 'taxid' field of every record stored in the existing taxa.json.
with (infiles['taxa'] / 'taxa.json').open() as f:
    records = json.load(f)
existing_taxids = {record['taxid'] for record in records}

In [7]:
# Treat every alias ID as already downloaded. JSON object keys are strings, so each alias is
# converted to int; the ID it points to must already be in the set.
with (infiles['taxa'] / 'aka_taxids.json').open() as f:
    alias_map = json.load(f)

for alias, primary in alias_map.items():
    alias_id = int(alias)
    assert primary in existing_taxids
    existing_taxids.add(alias_id)

### Updated assembly taxids

In [8]:
# Distinct values of the JSON object stored at infiles['updated_taxids'].
with infiles['updated_taxids'].open() as f:
    updated_assembly_taxids = {*json.load(f).values()}

## Func defs

In [9]:
def parse_date(datestr):
    """Parse a 'YYYY/MM/DD HH:MM:SS' string into a datetime; ``None`` is passed through as ``None``."""
    if datestr is None:
        return None
    return datetime.strptime(datestr, '%Y/%m/%d %H:%M:%S')

def taxon_xml_to_json(txml):
    """Flatten a parsed ``Taxon`` XML element into a plain dict.

    Integer fields are converted with ``int``, the three date fields go through
    ``parse_date`` (``None`` when the tag is absent), and ``aka_taxids`` lists the
    integer content of every ``AkaTaxIds/TaxId`` child.
    """
    assert txml.tag == 'Taxon'

    def _int_field(tag):
        return int(txml.findtext(tag))

    def _date_field(tag):
        return parse_date(txml.findtext(tag))

    record = {}
    record['taxid'] = _int_field('TaxId')
    record['parent_taxid'] = _int_field('ParentTaxId')
    record['scientific_name'] = txml.findtext('ScientificName')
    record['rank'] = txml.findtext('Rank')
    record['division'] = txml.findtext('Division')
    record['create_date'] = _date_field('CreateDate')
    record['update_date'] = _date_field('UpdateDate')
    record['pub_date'] = _date_field('PubDate')
    record['aka_taxids'] = [int(alias.text) for alias in txml.findall('./AkaTaxIds/TaxId')]
    return record

In [10]:
def get_taxon_othernames(txml):
    """Return the children of the ``OtherNames`` tag as a list of ``{'type', 'name'}`` dicts.

    A ``Name`` child contributes its ``ClassCDE`` text as the type and its ``DispName``
    text as the name; any other child contributes its own tag and text. Both values
    are asserted to be non-empty.
    """
    def _to_entry(el):
        if el.tag != 'Name':
            assert el.text
            return dict(type=el.tag, name=el.text)

        kind = el.findtext('./ClassCDE')
        label = el.findtext('./DispName')
        assert kind and label
        return dict(type=kind, name=label)

    return [_to_entry(child) for child in txml.findall('./OtherNames/*')]

In [11]:
def efetch_taxa(taxids):
    """Fetch taxonomy records for a batch of taxon IDs with ``Entrez.efetch``.

    Parameters
    ----------
    taxids : iterable of int
        IDs to request. An ID may be a record's primary ``TaxId`` or one of the
        alternate IDs listed under its ``AkaTaxIds``.

    Returns
    -------
    dict
        Maps the *requested* ID of each returned record to a ``(txml, tdata)`` pair:
        the parsed ``Taxon`` element and the dict from ``taxon_xml_to_json``.
        IDs for which the response contains no record are simply absent.
        An empty ``taxids`` yields an empty dict without issuing a request.

    Raises
    ------
    RuntimeError
        If a returned record matches none of the requested IDs.
    """
    taxids = list(taxids)
    if not taxids:
        return dict()

    # Set copy for constant-time membership tests; the list keeps the request order.
    requested = set(taxids)

    resp = Entrez.efetch(db='taxonomy', id=taxids)
    try:
        root = ET.parse(resp).getroot()
    finally:
        # ET.parse does not close file objects it did not open itself.
        resp.close()
    assert root.tag == 'TaxaSet'

    taxa = dict()

    for txml in root.findall('./Taxon'):
        tdata = taxon_xml_to_json(txml)

        # Get primary or alternate taxon ID that was passed to the function
        taxid = tdata['taxid']
        if taxid not in requested:
            for id2 in tdata['aka_taxids']:
                if id2 in requested:
                    taxid = id2
                    break
            else:
                raise RuntimeError('Could not determine requested taxid')

        taxa[taxid] = (txml, tdata)

    return taxa

# efetch_taxa_throttled = throttle(1/3)(efetch_taxa)

## Download taxonomy data

### Setup

In [12]:
# Local cache holding one '<taxid>.xml' file per downloaded taxon. The download loop
# writes into this directory, so create it up front to keep a fresh run from failing
# on the first write.
tax_dir = Path('tmp/taxa')
tax_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# Work queue (IDs still to fetch) and the accumulators that record_taxon() fills in.
taxa_to_download = {tid for tid in updated_assembly_taxids if tid not in existing_taxids}
new_taxon_data = {}
new_aka_taxids = {}
new_taxon_othernames = {}

In [14]:
len(taxa_to_download)

44

In [15]:
def record_taxon(taxid, txml, tdata=None):
    """Add downloaded taxonomy data to our list, add parent to download list if needed.

    Parameters
    ----------
    taxid : int
        The ID that was requested; must currently be in ``taxa_to_download``.
    txml
        Parsed ``Taxon`` element for the record.
    tdata : dict, optional
        Result of ``taxon_xml_to_json(txml)``; computed here when omitted.

    Updates the module-level ``new_taxon_data``, ``new_aka_taxids``,
    ``new_taxon_othernames`` and ``taxa_to_download`` collections in place.
    """
    if tdata is None:
        tdata = taxon_xml_to_json(txml)
    true_taxid = tdata['taxid']

    # Check we have the expected ID
    assert true_taxid == taxid or taxid in tdata['aka_taxids']

    new_taxon_data[true_taxid] = tdata
    taxa_to_download.remove(taxid)

    # Record alternate ids
    for id2 in tdata['aka_taxids']:
        assert id2 not in new_aka_taxids
        new_aka_taxids[id2] = true_taxid

    # Record OtherNames tag
    # NOTE(review): this is keyed by the requested ID, whereas new_taxon_data is keyed by
    # the record's primary ID; the two differ when an alias was requested - confirm intended.
    othernames = get_taxon_othernames(txml)
    if othernames:
        new_taxon_othernames[taxid] = othernames

    # Add parent to download list if we don't have it yet
    parentid = tdata['parent_taxid']
    if parentid not in existing_taxids and parentid not in new_taxon_data and parentid not in new_aka_taxids:
        taxa_to_download.add(parentid)

### Download

In [16]:
# Drain taxa_to_download: load records already cached under tax_dir, fetch the rest in
# chunks of at most chunk_size IDs. record_taxon() may add parent IDs to the queue, so the
# loop keeps going until the queue is empty.
chunk_size = 100
_initial = len(new_taxon_data)

with tqdm(total=len(taxa_to_download) + _initial, initial=_initial) as pbar:
    while taxa_to_download:

        # Find next chunk of IDs to download
        next_chunk = []

        # Iterate over a snapshot: record_taxon() mutates taxa_to_download.
        for taxid in list(taxa_to_download):
            dst = tax_dir / ('%d.xml' % taxid)

            if dst.is_file():
                # Already cached on disk, no request needed
                with dst.open() as f:
                    txml = ET.parse(f).getroot()
                record_taxon(taxid, txml)

            else:
                next_chunk.append(taxid)
                if len(next_chunk) >= chunk_size:
                    break

        if next_chunk:

            # Fetch
            chunk_taxa = efetch_taxa(next_chunk)

            # Add results
            for taxid, (txml, tdata) in chunk_taxa.items():
                record_taxon(taxid, txml, tdata)

                # Write to file
                dst = tax_dir / ('%d.xml' % taxid)
                assert not dst.is_file()
                with dst.open('wb') as f:
                    f.write(ET.tostring(txml))

            # An ID the server returned nothing for would stay queued and be requested
            # again on every pass, so stop instead of looping forever.
            missing = sorted(set(next_chunk).difference(chunk_taxa))
            if missing:
                raise RuntimeError('No taxonomy record returned for taxids: %s' % missing)

        # Update progress bar
        total = len(new_taxon_data) + len(taxa_to_download)
        if pbar.total != total:
            pbar.total = total
        pbar.n = len(new_taxon_data)
        pbar.refresh()

100%|██████████| 49/49 [00:00<00:00, 1892.09it/s]


### Consistency checking

In [17]:
# One entry per set of aliases: each alias points at a recorded taxon and is not itself
# stored as a separate entry.
for alias_id, primary_id in new_aka_taxids.items():
    assert primary_id in new_taxon_data
    assert alias_id not in new_taxon_data

## Output to JSON format

In [18]:
class TaxonEncoder(json.JSONEncoder):
    """JSON encoder that writes ``datetime`` values as ISO-8601 strings."""

    def default(self, obj):
        """Encode datetimes via ``isoformat()``; defer everything else to the base class."""
        if not isinstance(obj, datetime):
            return super().default(obj)
        return obj.isoformat()

In [19]:
# Write every collected taxon record to taxa.json; TaxonEncoder serialises the datetime fields.
taxa_out_path = intermediate_out / 'taxa.json'
with taxa_out_path.open('w') as f:
    json.dump([*new_taxon_data.values()], f, cls=TaxonEncoder)

In [20]:
# Write the alias -> primary taxid mapping when any aliases were collected. When there are
# none, skip the file rather than write an empty dict. Integer keys become strings in the
# JSON output, which matches how the existing aka_taxids.json is read back above (int(k)).
if new_aka_taxids:
    with open(intermediate_out / 'aka_taxids.json', 'w') as f:
        json.dump(new_aka_taxids, f)

In [21]:
# Save the OtherNames entries collected per taxid.
othernames_out_path = intermediate_out / 'taxon-othernames.json'
with othernames_out_path.open('w') as f:
    json.dump(new_taxon_othernames, f)