# 201113 Original genome taxa

In [1]:
# Identifiers for the parent experiment and for this notebook; they are used
# further down to build the intermediate output directory.
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201113'
nbname = f'{datestr}-original-genome-taxa'

In [2]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
from datetime import date
import re
from collections import Counter

## File paths

In [3]:
# Input locations, keyed by a short name.
# NOTE(review): the two archive paths are absolute and specific to one machine.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'v09_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_0.9_160906.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
}

In [4]:
# Directory for files written by this notebook; created (with parents) if missing.
intermediate_out = Path('../../data/intermediate/', exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load archive data

In [5]:
# The archive is read as a zip file through a gzip stream. The handle is kept
# open because later cells read individual members from it.
_gzip_v09 = GzipFile(filename=infiles['v09_archive'])
archive_v09 = ZipFile(_gzip_v09)
archive_v09.read('info').decode()

'{"archive_version": "1.0"}'

In [6]:
# Same layout as the v0.9 archive: a zip file read through a gzip stream.
_gzip_v11 = GzipFile(filename=infiles['v11_archive'])
archive_v11 = ZipFile(_gzip_v11)
archive_v11.read('info').decode()

'{"archive_version": "1.0"}'

In [7]:
# Parse the curated genome set (a JSON member of the v1.1 archive).
_gset_raw = archive_v11.read('genome_sets/midas/assembly/curated')
gset_data = json.loads(_gset_raw)

In [8]:
# Distinct genus names and (genus, species) pairs over all genome annotations,
# each as a sorted list.
_annotations = gset_data['annotations'].values()

genus_names = sorted({ann['tax_genus'] for ann in _annotations})
species_names = sorted({(ann['tax_genus'], ann['tax_species']) for ann in _annotations})

len(genus_names), len(species_names)

(419, 1438)

In [9]:
# Map each curated (genus, species) pair to the set of genome keys annotated with it.
# Keys are pre-populated from species_names so the dict follows that sorted order.
genomes_by_db_species = dict((sp, set()) for sp in species_names)

for genome_key, annotation in gset_data['annotations'].items():
    species_key = (annotation['tax_genus'], annotation['tax_species'])
    genomes_by_db_species[species_key].add(genome_key)

## Load taxonomy data

In [10]:
# Read the list of taxon records, then index them by their 'taxid' field.
_taxa_path = infiles['taxa'] / 'taxa.json'

with _taxa_path.open() as f:
    _taxon_records = json.load(f)

taxon_data = {record['taxid']: record for record in _taxon_records}

In [11]:
# Alias mapping between taxonomy IDs. JSON object keys are always strings, so
# they are converted back to int here; values are kept as loaded.
_aka_path = infiles['taxa'] / 'aka_taxids.json'

with _aka_path.open() as f:
    _aka_raw = json.load(f)

aka_taxids = dict((int(alias_id), target_id) for alias_id, target_id in _aka_raw.items())

In [12]:
# Sanity checks on the loaded taxonomy:
#  * every record is stored under its own ID,
#  * no key of taxon_data is an alias ID,
#  * every parent ID is either 0 or another key of taxon_data.
for tid, record in taxon_data.items():
    assert record['taxid'] == tid
    assert tid not in aka_taxids
    parent_tid = record['parent_taxid']
    assert parent_tid == 0 or parent_tid in taxon_data

Parent ID mapping:

In [13]:
# taxid -> parent taxid, taken from every loaded taxon record.
parent_rels = dict(
    (record['taxid'], record['parent_taxid'])
    for record in taxon_data.values()
)

## Func defs

In [14]:
def resolve_alias(tid):
    """Return the ID that ``aka_taxids`` maps ``tid`` to, or ``tid`` itself if it has no entry."""
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

In [15]:
def gettaxon(tid):
    """Look up a taxon record by ID.

    The ID is first passed through ``resolve_alias``. Raises ``KeyError`` if the
    resolved ID is not present in ``taxon_data``.
    """
    canonical_id = resolve_alias(tid)
    return taxon_data[canonical_id]

In [16]:
def getparent(taxon):
    """Return the parent record of a taxon, or ``None`` if it cannot be looked up.

    ``taxon`` may be a taxon record or an integer ID (resolved with ``gettaxon``).
    The parent ID is also looked up through ``gettaxon``, so alias parent IDs
    are handled. A ``KeyError`` during the parent lookup yields ``None``.
    """
    record = gettaxon(taxon) if isinstance(taxon, int) else taxon

    try:
        parent = gettaxon(record['parent_taxid'])
    except KeyError:
        parent = None
    return parent

In [17]:
def iter_ancestors(taxon, incself=False):
    """Yield the ancestors of a taxon, nearest first.

    ``taxon`` may be a taxon record or an integer ID. With ``incself=True`` the
    taxon itself is yielded before its ancestors. Iteration stops when
    ``getparent`` returns ``None``.
    """
    current = gettaxon(taxon) if isinstance(taxon, int) else taxon
    if not incself:
        current = getparent(current)

    while current is not None:
        yield current
        current = getparent(current)

In [18]:
def find_lca(taxids, parents=None):
    """Find lowest common ancestor of a set of taxa.

    Parameters
    ----------
    taxids
        Iterable of taxonomy IDs. Duplicates are ignored.
    parents
        Optional mapping from taxonomy ID to parent taxonomy ID. Defaults to
        the notebook-level ``parent_rels``. An ID with no entry in the mapping
        is treated as the root of its tree.

    Returns
    -------
    The ID of the deepest taxon that is equal to, or an ancestor of, every
    input taxon. If only one distinct ID is given it is returned unchanged
    without consulting the mapping.

    Raises
    ------
    ValueError
        If ``taxids`` is empty.
    RuntimeError
        If the taxa lead up to more than one root, i.e. have no common ancestor.
    """
    if parents is None:
        parents = parent_rels

    taxids = set(taxids)
    if not taxids:
        raise ValueError('Must supply at least one taxon')
    if len(taxids) == 1:
        return taxids.pop()

    # Build the tree spanned by these taxa and all of their ancestors, as a
    # mapping from each ancestor to the set of its children within the tree.
    children_of = dict()
    roots = set()

    # IDs whose parent link still has to be followed
    pending = set(taxids)

    while pending:
        taxid = pending.pop()

        try:
            parent = parents[taxid]
        except KeyError:
            # No parent entry: this is the top of its lineage
            roots.add(taxid)
            continue

        siblings = children_of.get(parent)
        if siblings is None:
            # First time this parent is seen, so its own lineage must be followed too
            siblings = children_of[parent] = set()
            pending.add(parent)

        siblings.add(taxid)

    # Shouldn't happen when all taxa belong to the same tree
    if len(roots) > 1:
        raise RuntimeError('No common ancestor found')

    # Descend from the root. The LCA is the first node that is either one of
    # the input taxa or the point where the tree branches.
    node = roots.pop()
    while True:
        if node in taxids:
            return node
        children = children_of[node]
        if len(children) > 1:
            return node
        (node,) = children

## Extract 2016 taxonomy summary data from v0.9 archive

In [19]:
# orig_tax_summaries: taxid -> 'gb_tax_summary' dict stored with the v0.9 genomes
# genome_orig_taxids: genome key -> taxid of that genome's summary
orig_tax_summaries = {}
genome_orig_taxids = {}

_GENOME_PREFIX = 'genomes/'

# It's much faster to read files in order here, otherwise we'll be seeking around inside the outer GZip file
for name in archive_v09.namelist():
    if name.startswith(_GENOME_PREFIX):
        # The genome key is everything after the leading 'genomes/'
        key = name[len(_GENOME_PREFIX):]

        genome_data = json.loads(archive_v09.read(name))

        summary = genome_data['gb_tax_summary']
        taxid = int(summary['uid'])

        if taxid not in orig_tax_summaries:
            orig_tax_summaries[taxid] = summary
        else:
            # Every genome sharing a taxid must carry an identical summary
            assert summary == orig_tax_summaries[taxid]

        genome_orig_taxids[key] = taxid

## Process original taxonomy data

For each species in the curated database, find the set of all taxa assigned to its genomes and filter out those that do not match the species name (these genomes were probably reassigned to this species from another one during the curation process).

In [20]:
# For each curated species, collect the (alias-resolved) original taxids of its
# genomes, keeping only genomes whose original genus/species names match the
# curated ones.
genome_taxids_by_species = {}

for db_species, keys in genomes_by_db_species.items():
    matching_taxids = set()

    for orig_taxid in (genome_orig_taxids[genome_key] for genome_key in keys):
        orig_summary = orig_tax_summaries[orig_taxid]
        orig_names = (orig_summary['genus'], orig_summary['species'])
        if orig_names == db_species:
            matching_taxids.add(resolve_alias(orig_taxid))

    # Should have at least one
    assert matching_taxids

    genome_taxids_by_species[db_species] = matching_taxids

### Find lowest common ancestor of all genomes per species

In [21]:
# (genus, species) -> lowest common ancestor of that species' matching taxids
genome_lca_by_species = {}
for sp, sp_taxids in genome_taxids_by_species.items():
    genome_lca_by_species[sp] = find_lca(sp_taxids)

In [22]:
# (test find_lca implementation)
# Each LCA must appear in the lineage (taxon itself included) of every taxid
# it was computed from.
for sp, lca in genome_lca_by_species.items():
    for taxid in genome_taxids_by_species[sp]:
        lineage_ids = (ancestor['taxid'] for ancestor in iter_ancestors(taxid, incself=True))
        assert lca in lineage_ids

In [23]:
# Tally the taxonomic rank of each species' LCA taxon
_lca_ranks = [gettaxon(lca_tid)['rank'] for lca_tid in genome_lca_by_species.values()]
Counter(_lca_ranks)

Counter({'species': 1209,
         'strain': 205,
         'genus': 10,
         'subspecies': 6,
         'no rank': 4,
         'biotype': 2,
         'species group': 2})

## Write output

In [24]:
# Dump the original taxonomy summaries. The dict is keyed by integer taxid;
# JSON stores object keys as strings.
_summaries_path = intermediate_out / 'original-tax-summaries.json'

with _summaries_path.open('w') as f:
    json.dump(orig_tax_summaries, f)

In [25]:
# Serialise as a list of records, one per curated species. The taxids are
# written sorted: they are held in a set, whose iteration order is not
# guaranteed, and sorting keeps the output file identical between runs.
_genome_taxids_by_species_json = [
    dict(curated_genus=g, curated_species=s, taxids=sorted(taxids))
    for (g, s), taxids in genome_taxids_by_species.items()
]

with open(intermediate_out / 'genome-matching-taxids-by-species.json', 'w') as f:
    json.dump(_genome_taxids_by_species_json, f)

In [26]:
# Serialise the per-species LCA as a list of records, one per curated species.
_genome_lca_by_species_json = []
for (genus, species), lca_taxid in genome_lca_by_species.items():
    _genome_lca_by_species_json.append({
        'curated_genus': genus,
        'curated_species': species,
        'taxid': lca_taxid,
    })

_lca_path = intermediate_out / 'species-genome-lcas.json'

with _lca_path.open('w') as f:
    json.dump(_genome_lca_by_species_json, f)