# 201125 unmatched species taxonomy trees

In [1]:
# Identifiers for this notebook; they are combined below into the report
# title and the output directory.
exptname = '201031-database-v1.1-software-version-migration'  # parent experiment
datestr = '201125'  # date stamp prefixed to the notebook name
nbname = ''.join((datestr, '-unmatched-species-taxonomy-trees'))

In [2]:
import html
import json
import re
from collections import Counter
from gzip import GzipFile
from pathlib import Path
from zipfile import ZipFile

## File paths

In [3]:
# Locations of every input read by this notebook, keyed by a short name.
infiles = {
    # NOTE(review): absolute, machine-specific path (and the only entry that is
    # a plain string rather than a Path) - consider making it configurable.
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    # Outputs of earlier notebooks in the same experiment
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    'taxonomy_original': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201113-original-genome-taxa/'),
    'taxonomy_original_extra': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201124-original-taxa-extra-info/'),
    'matches': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201122-taxon-name-matching/'),
}

In [4]:
# Report output directory: <reports>/<experiment name>/<notebook name>.
# Created up front (including missing parents) so later cells can write into it.
outdir = Path('../../reports/', exptname, nbname)
outdir.mkdir(parents=True, exist_ok=True)

## Load data

### Archive files

In [5]:
# The archive file is read through a gzip decompression stream and then
# opened as a zip archive on top of that stream.
_v11_gzip_stream = GzipFile(infiles['v11_archive'])
archive_v11 = ZipFile(_v11_gzip_stream)

# Display the archive's 'info' member as text
_v11_info_bytes = archive_v11.read('info')
_v11_info_bytes.decode()

'{"archive_version": "1.0"}'

In [6]:
# Parse the curated genome set (a JSON member of the archive) directly from
# the member's bytes.
gset_data = json.loads(archive_v11.read('genome_sets/midas/assembly/curated'))

In [7]:
# Collect the distinct genus names and (genus, species) pairs that appear in
# the genome set's annotations, each as a sorted list.
_annotations = gset_data['annotations'].values()

genus_names = sorted({annotation['tax_genus'] for annotation in _annotations})
species_names = sorted({
    (annotation['tax_genus'], annotation['tax_species'])
    for annotation in _annotations
})

len(genus_names), len(species_names)

(419, 1438)

### Taxonomy

#### Current data

In [8]:
# Load the current taxon records (each record carries its own 'taxid')
_taxon_records = json.loads((infiles['taxa'] / 'taxa.json').read_text())

# Index the records by taxonomy ID for direct lookup
taxon_data = {record['taxid']: record for record in _taxon_records}

In [9]:
# Mapping from alias taxonomy ID to the ID it should be resolved to.
# JSON object keys are always strings, so they are converted back to ints.
_aka_taxids_json = json.loads((infiles['taxa'] / 'aka_taxids.json').read_text())

aka_taxids = {int(alias_id): target_id for alias_id, target_id in _aka_taxids_json.items()}

In [10]:
# Sanity checks on the loaded taxonomy. For every entry:
#   * the key matches the record's own ID,
#   * the ID is not one of the alias IDs,
#   * the parent is either 0 (no parent) or another entry of taxon_data.
for key, record in taxon_data.items():
    assert record['taxid'] == key
    assert key not in aka_taxids
    parent_id = record['parent_taxid']
    assert parent_id == 0 or parent_id in taxon_data

In [11]:
# Child ID -> parent ID for every taxon that has a parent.
# A parent ID of 0 marks a taxon without a parent, so those are left out.
parent_rels = {
    child_id: record['parent_taxid']
    for child_id, record in taxon_data.items()
    if record['parent_taxid'] != 0
}

#### Original 2016 data

In [12]:
# Original taxon summaries keyed by taxonomy ID (JSON keys are strings, so
# convert them to ints).
_orig_tax_summaries_json = json.loads(
    (infiles['taxonomy_original'] / 'original-tax-summaries.json').read_text()
)
orig_tax_summaries = {int(tid): summary for tid, summary in _orig_tax_summaries_json.items()}

In [13]:
# Taxonomy ID recorded for each curated species, keyed by (genus, species).
# Presumably the lowest common ancestor of the species' genomes, going by the
# file name - TODO confirm against the notebook that writes it.
_species_genome_lcas_json = json.loads(
    (infiles['taxonomy_original'] / 'species-genome-lcas.json').read_text()
)

species_genome_lcas = dict()
for entry in _species_genome_lcas_json:
    species_genome_lcas[(entry['curated_genus'], entry['curated_species'])] = entry['taxid']

In [14]:
_orig_genome_taxid_counts_json = json.loads(
    (infiles['taxonomy_original_extra'] / 'original-genome-taxid-counts.json').read_text()
)

# (genus, species) -> {taxonomy ID: count}, taken from each entry's
# 'filtered_counts' (whose JSON string keys are converted to ints).
orig_genome_taxid_counts_filtered = dict()
for entry in _orig_genome_taxid_counts_json:
    species_key = (entry['curated_genus'], entry['curated_species'])
    orig_genome_taxid_counts_filtered[species_key] = {
        int(taxid): cnt for taxid, cnt in entry['filtered_counts'].items()
    }

#### Name matches

In [15]:
# (genus, species) -> match record, or None when the species has no matched
# taxon ('matched_taxid' is null in the input).
species_name_matches = dict()

_species_match_records = json.loads((infiles['matches'] / 'species-name-matches.json').read_text())

for record in _species_match_records:
    # The two name fields become the key and are removed from the stored record
    genus = record.pop('curated_genus')
    species = record.pop('curated_species')
    is_matched = record['matched_taxid'] is not None
    species_name_matches[(genus, species)] = record if is_matched else None

## Function definitions

In [16]:
def only(it):
    """Return the single item of an iterable.

    Raises ``ValueError`` if the iterable yields no items or more than one.
    """
    [sole_item] = it
    return sole_item

In [17]:
def resolve_alias(tid):
    """Map a taxonomy ID through ``aka_taxids``.

    Returns the ID registered for ``tid`` in ``aka_taxids``, or ``tid`` itself
    when it is not an alias.
    """
    try:
        return aka_taxids[tid]
    except KeyError:
        return tid

In [18]:
def gettaxon(tid):
    """Look up a taxon record by ID, following an alias ID if one is given.

    Raises ``KeyError`` if the resolved ID is not in ``taxon_data``.
    """
    canonical_id = resolve_alias(tid)
    return taxon_data[canonical_id]

In [19]:
def getparent(taxon):
    """Return the parent record of a taxon, or ``None`` if it has none.

    ``taxon`` may be a taxon record or an integer ID (looked up with
    ``gettaxon``). The parent ID is also looked up with ``gettaxon``, so an
    alias parent ID is resolved. A ``KeyError`` during the parent lookup
    (e.g. a parent ID of 0, which has no record) yields ``None``.
    """
    record = gettaxon(taxon) if isinstance(taxon, int) else taxon

    try:
        parent = gettaxon(record['parent_taxid'])
    except KeyError:
        parent = None
    return parent

In [20]:
def iter_ancestors(taxon, incself=False):
    """Yield a taxon's ancestors, nearest first, up to the top of the tree.

    ``taxon`` may be a taxon record or an integer ID. When ``incself`` is true
    the taxon itself is yielded before its ancestors. Iteration stops when
    ``getparent`` returns ``None``.
    """
    current = gettaxon(taxon) if isinstance(taxon, int) else taxon
    if not incself:
        current = getparent(current)

    while current is not None:
        yield current
        current = getparent(current)

In [21]:
def taxon_url(taxid):
    """Return the NCBI Taxonomy Browser info-page URL for a taxonomy ID."""
    url_template = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=%d'
    return url_template % taxid

In [22]:
def make_tree_child_map(leaf_taxids, root_taxid=None):
    """Build the parent -> children map of the tree spanned by a set of taxa.

    Starting from every ID in ``leaf_taxids``, parent links in ``parent_rels``
    are followed upward until reaching ``root_taxid`` or a taxon that has no
    entry in ``parent_rels``. Every taxon visited on the way is included in
    the tree.

    Parameters
    ----------
    leaf_taxids
        Iterable of taxonomy IDs to start from. They should already be
        canonical (non-alias) IDs, since ``parent_rels`` is keyed directly.
    root_taxid
        Optional ID at which to stop walking upward. If ``None``, each walk
        continues until a taxon without a recorded parent.

    Returns
    -------
    (children, root)
        ``children`` maps a parent ID to the set of its child IDs within the
        tree (taxa without children have no entry); ``root`` is the ID of the
        single root reached.

    Raises
    ------
    ValueError
        If ``leaf_taxids`` is empty, or if the walks end at more than one root
        (e.g. a starting ID is not a descendant of ``root_taxid``).
    """
    children = dict()
    roots = set()

    # IDs whose parent link still has to be followed
    pending = set(leaf_taxids)

    while pending:
        taxid = pending.pop()

        # Reached the top of this walk
        if taxid == root_taxid or taxid not in parent_rels:
            roots.add(taxid)
            continue

        ptaxid = parent_rels[taxid]

        # First time this parent is seen: register it and keep walking upward
        # from it. Otherwise its own ancestry is already handled or queued.
        if ptaxid not in children:
            children[ptaxid] = set()
            pending.add(ptaxid)

        children[ptaxid].add(taxid)

    if not roots:
        # Only possible when there was nothing to start from
        raise ValueError('No root found: leaf_taxids is empty')

    if len(roots) > 1:
        raise ValueError('More than one root found')

    (root,) = roots
    return children, root

In [23]:
def sum_counts_recursive(child_map, root, counts):
    """Compute the total count of every subtree below (and including) ``root``.

    ``child_map`` maps a node to its children, and ``counts`` maps a node to
    its own count (missing nodes count as 0). Returns a dict mapping each node
    reachable from ``root`` to its own count plus the totals of all its
    descendants.
    """
    totals = dict()

    def _subtree_total(node):
        # Start from the node's own count and add each child's subtree total
        total = sum(
            (_subtree_total(kid) for kid in child_map.get(node, [])),
            counts.get(node, 0),
        )
        totals[node] = total
        return total

    _subtree_total(root)

    return totals

## Tree report generation

### Config

In [24]:
# Species to report on: those whose name-match entry is None
report_species = [sp for sp in species_names if species_name_matches[sp] is None]

# HTML anchor ID for each reported species: the lower-cased "genus species"
# string with whitespace runs turned into '-' and every character other than
# a-z and '-' removed.
# NOTE(review): digits are removed too, so names that differ only in digits or
# punctuation end up with the same ID - confirm the IDs are unique.
report_species_ids = dict()
for _sp in report_species:
    _dashed_name = re.sub(r'\s+', '-', ' '.join(_sp).lower())
    report_species_ids[_sp] = re.sub(r'[^a-z-]', '', _dashed_name)

In [25]:
# Settings used when rendering the report
report_attrs = {
    # Page title, also shown as the top-level heading
    'title': datestr + ' unmatched species taxonomy trees',
    # Horizontal indent applied per level of depth in the taxonomy tree tables
    'tree_indent_px': 15,
}

### Page template

In [26]:
# Stylesheet embedded in the report page's <style> element (in addition to the
# externally linked stylesheet). The `.section-header` rule is intentionally
# left as written: it is empty and currently has no effect.
REPORT_CSS = '''
body {
    margin: 24px;
    font-size: 1.4em;
}

h1 {
    font-size: 4rem !important;
}

h2 {
    font-size: 2.5rem !important;
    margin-top: 6rem;
}

.section-header {
}

table {
    border-collapse: collapse;
}

table > tbody > tr:nth-child(even) {
    background: #eeeeee;
}

td, th {
    padding: 6px 15px !important;
}

.text-gray {
    color: #999;
}
'''

In [27]:
# Opening part of the report page, up to and including the main heading.
# Only the title and the stylesheet are substituted into the template.
REPORT_PRE = '''
<!DOCTYPE HTML>
<html lang="en">
    <head>
        <meta name="author" content="Jared Lumpe">
        <title>{title}</title>
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/skeleton/2.0.4/skeleton.min.css"/>
        <style>{css}</style>
    </head>
    <body>
        <h1>{title}</h1>
'''.format(css=REPORT_CSS, title=report_attrs['title'])

# Closing part of the report page, written after all sections
REPORT_POST = '''
    </body>
</html>
'''

### Contents

In [28]:
def write_contents_table(f):
    """Write the "Contents" section of the report to ``f``.

    Emits an ordered list with one entry per species in ``report_species``,
    each linking to the anchor given by ``report_species_ids``.

    ``f`` is a writable text file object. Genus and species names are
    HTML-escaped so that characters such as ``&`` or ``<`` in a name cannot
    break the markup.
    """
    f.write('''
    <h2>Contents</h2>
    
    <ol>
    ''')

    for sp in report_species:
        genus, species = (html.escape(str(part)) for part in sp)
        f.write('<li><a href="#%s">%s %s</a></li>\n' % (report_species_ids[sp], genus, species))

    f.write('</ol>\n')

### Sections

In [29]:
# Original taxon summaries re-keyed by the resolved (non-alias) taxonomy ID.
# NOTE(review): if several original IDs resolve to the same ID, the summary
# iterated last replaces the earlier ones.
orig_tax_summaries_by_canonical_taxid = dict()
for _orig_taxid, _orig_summary in orig_tax_summaries.items():
    orig_tax_summaries_by_canonical_taxid[resolve_alias(_orig_taxid)] = _orig_summary

In [30]:
def write_taxonomy_tree_table(f, child_map, root_taxid, counts, rcounts):
    """Write an HTML table showing a taxonomy subtree, one row per taxon.

    Rows are written depth-first from ``root_taxid``; each taxon's name is
    indented by ``report_attrs['tree_indent_px']`` pixels per level of depth
    and links to its ``taxon_url`` page.

    Parameters
    ----------
    f
        Writable text file object.
    child_map
        Mapping from a taxonomy ID to an iterable of its child IDs. Taxa
        without an entry are treated as having no children.
    root_taxid
        ID of the taxon shown in the first row.
    counts
        Mapping from taxonomy ID to the count shown for the taxon itself
        (left blank when missing or zero).
    rcounts
        Mapping from taxonomy ID to the count shown in parentheses
        (0 when missing).

    All text and the link URL are HTML-escaped before being written, so names
    containing ``&``, ``<``, ``>`` or quotes, and the ``&`` inside the URL's
    query string, produce valid markup.
    """
    def _write_subtree(taxid, depth):
        taxon = gettaxon(taxid)
        current_name = taxon['scientific_name']

        # Decide what to show in the "2016 Name" column. The comparison uses
        # the raw (unescaped) names.
        orig_summary = orig_tax_summaries_by_canonical_taxid.get(taxid)
        if orig_summary is None:
            orig_name, orig_name_class = '', ''
        elif orig_summary['scientificname'] != current_name:
            orig_name, orig_name_class = orig_summary['scientificname'], ''
        else:
            orig_name, orig_name_class = '(same)', 'text-gray'

        # Work on a copy so the shared taxon record is left untouched
        data = dict(taxon)
        data.update(
            url=html.escape(taxon_url(taxid)),
            indent=depth * report_attrs['tree_indent_px'],
            count=counts.get(taxid, 0) or '',
            rcount=rcounts.get(taxid, 0),
            scientific_name=html.escape(str(current_name)),
            rank=html.escape(str(data['rank'])),
            orig_name=html.escape(str(orig_name)),
            orig_name_class=orig_name_class,
        )

        f.write('''
        <tr>
            <td><a href="{url}" target="_blank" style="margin-left: {indent}px;">
                {scientific_name}
            </a></td>
            <td>{taxid}</td>
            <td>{rank}</td>
            <td class="{orig_name_class}">{orig_name}</td>
            <td style="text-align: right">{count}</td>
            <td style="text-align: right">({rcount})</td>
        </tr>
        '''.format(**data))

        for child_taxid in child_map.get(taxid, []):
            _write_subtree(child_taxid, depth + 1)

    f.write('''
    <table class="taxonomy-tree">
    <thead><tr>
        <th>Name</th>
        <th>ID</th>
        <th>Rank</th>
        <th>2016 Name</th>
        <th colspan="2">2016 Genome Count</th>
    </tr></thead>
    <tbody>
    ''')

    _write_subtree(root_taxid, 0)

    f.write('</tbody></table>')

In [31]:
def write_report_section(f, sp):
    """Write the report section for one species: a heading and its taxonomy tree.

    Parameters
    ----------
    f
        Writable text file object.
    sp
        ``(genus, species)`` tuple; must be a key of ``species_genome_lcas``,
        ``orig_genome_taxid_counts_filtered`` and ``report_species_ids``.

    The tree is rooted at the taxon recorded for the species in
    ``species_genome_lcas`` and spans every taxon that has a genome count.
    """
    lca = gettaxon(species_genome_lcas[sp])

    # Re-key the counts by resolved (non-alias) ID. Several original IDs can
    # resolve to the same taxon, so their counts are added together rather
    # than letting the last one overwrite the others.
    counts = dict()
    for taxid, cnt in orig_genome_taxid_counts_filtered[sp].items():
        canonical_taxid = resolve_alias(taxid)
        counts[canonical_taxid] = counts.get(canonical_taxid, 0) + cnt

    child_map, root = make_tree_child_map(counts.keys(), lca['taxid'])
    rcounts = sum_counts_recursive(child_map, root, counts)

    # Escape the names so characters such as '&' or '<' cannot break the markup
    genus, species = (html.escape(str(part)) for part in sp)
    f.write('<h2 id="%s" class="section-header">%s %s (%d genomes)</h2>\n\n' % (report_species_ids[sp], genus, species, sum(counts.values())))
    write_taxonomy_tree_table(f, child_map, root, counts, rcounts)

### Generate report

In [32]:
# Assemble the report: page header, contents list, one section per unmatched
# species, then the page footer.
# NOTE(review): the file is opened without an explicit encoding, so the
# platform default is used.
_report_path = outdir / (nbname + '-report.html')

with _report_path.open('w') as f:
    f.write(REPORT_PRE)

    write_contents_table(f)

    for sp in report_species:
        # Blank lines between sections keep the generated source readable
        f.write('\n\n')
        write_report_section(f, sp)

    f.write(REPORT_POST)