# 201218 final taxonomy assignments

In [None]:
# Identifiers for this experiment and notebook; nbname is the date string
# followed by the notebook's title slug.
datestr = '201218'
exptname = '201031-database-v1.1-software-version-migration'
nbname = ''.join((datestr, '-final-taxonomy-assignments'))

In [2]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
from collections import Counter

In [3]:
import pandas as pd

## File paths

In [4]:
# Input locations, keyed by a short name. 'v11_archive' is a plain string path;
# the remaining entries are Path objects.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    'taxa_additional': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201205-download-additional-taxa/'),
    'matches': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201122-taxon-name-matching/'),
    'updated_taxids': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201201-download-updated-assembly-summaries/updated-assembly-taxids.json'),
}

In [5]:
# Output directory for this notebook: <processed root>/<experiment>/<notebook>.
# Create it, along with any missing parents, if it does not exist yet.
processed_out = Path('../../data/processed/', exptname, nbname)
processed_out.mkdir(parents=True, exist_ok=True)

## Load data

### Archive files

In [6]:
# The archive is read as a zip file through a gzip stream. It is left open
# because later cells keep reading entries from it.
archive_v11 = ZipFile(GzipFile(filename=infiles['v11_archive']))

# Display the archive's 'info' entry as text.
info_bytes = archive_v11.read('info')
info_bytes.decode()

'{"archive_version": "1.0"}'

In [7]:
# Parse the curated genome set entry stored in the archive as JSON.
with archive_v11.open('genome_sets/midas/assembly/curated') as gset_file:
    gset_data = json.loads(gset_file.read())

In [8]:
# Group genome keys by the (genus, species) pair recorded in their annotations.
genomes_by_species = {}

for genome_key, annotation in gset_data['annotations'].items():
    species_key = (annotation['tax_genus'], annotation['tax_species'])
    if species_key not in genomes_by_species:
        genomes_by_species[species_key] = set()
    genomes_by_species[species_key].add(genome_key)

# Sorted lists of the distinct species pairs and of the distinct genus names.
species_names = sorted(genomes_by_species)
genus_names = sorted({genus for genus, _ in species_names})

len(genus_names), len(species_names)

(419, 1438)

In [9]:
# Map each genome's 'key' to its 'gb_id', reading every archive entry whose
# name starts with 'genomes/'.
assembly_ids = {}

genome_entries = (n for n in archive_v11.namelist() if n.startswith('genomes/'))

for entry_name in genome_entries:
    with archive_v11.open(entry_name) as entry:
        genome = json.load(entry)

    assembly_ids[genome['key']] = genome['gb_id']

### Taxa

In [10]:
# Taxon records indexed by their 'taxid'. Records from the additional download
# are loaded second, so they replace earlier records with the same ID.
taxon_data = {}

for taxa_dir in (infiles['taxa'], infiles['taxa_additional']):
    with open(taxa_dir / 'taxa.json') as taxa_file:
        for tdata in json.load(taxa_file):
            taxon_data[tdata['taxid']] = tdata

In [11]:
# Alias table for taxon IDs. JSON object keys are always strings, so convert
# them to int; the mapped values are kept as loaded.
with open(infiles['taxa'] / 'aka_taxids.json') as aka_file:
    aka_taxids = {}
    for raw_id, target_id in json.load(aka_file).items():
        aka_taxids[int(raw_id)] = target_id

### Name matches

In [12]:
# (curated genus, curated species) -> match record, or None when the record
# has no matched taxid. The two curated name fields are removed from the
# record because they become the key.
species_name_matches = {}

with open(infiles['matches'] / 'species-name-matches.json') as matches_file:
    match_records = json.load(matches_file)

    for record in match_records:
        genus = record.pop('curated_genus')
        species = record.pop('curated_species')

        if record['matched_taxid'] is None:
            species_name_matches[genus, species] = None
        else:
            species_name_matches[genus, species] = record

### Updated assembly taxids

In [13]:
# Taxon ID per assembly. JSON object keys are strings, so convert them to int.
with open(infiles['updated_taxids']) as taxids_file:
    assembly_taxids = {}
    for raw_assembly_id, assembly_taxid_value in json.load(taxids_file).items():
        assembly_taxids[int(raw_assembly_id)] = assembly_taxid_value

## Function definitions

In [14]:
def resolve_alias(tid):
    """Return the ID that ``aka_taxids`` maps ``tid`` to, or ``tid`` itself if it has no entry."""
    try:
        return aka_taxids[tid]
    except KeyError:
        return tid

def taxid_eq(tid1, tid2):
    """Return whether two taxon IDs are equal once aliases are resolved."""
    resolved1 = resolve_alias(tid1)
    resolved2 = resolve_alias(tid2)
    return resolved1 == resolved2

In [15]:
def gettaxon(tid):
    """Look up a taxon record by ID, resolving alias IDs first.

    Raises KeyError if the resolved ID is not present in ``taxon_data``.
    """
    resolved_tid = resolve_alias(tid)
    return taxon_data[resolved_tid]

In [16]:
def getparent(taxon):
    """Return the parent record of a taxon, or None if it cannot be looked up.

    ``taxon`` is either a taxon record or an integer taxon ID, which is first
    converted to its record. The parent ID is alias-resolved by ``gettaxon``.
    None is returned when the parent lookup raises KeyError, i.e. the record
    has no 'parent_taxid' key or the parent is not in the loaded taxon data.
    """
    if isinstance(taxon, int):
        taxon = gettaxon(taxon)

    try:
        parent = gettaxon(taxon['parent_taxid'])
    except KeyError:
        parent = None

    return parent

In [17]:
def iter_ancestors(taxon, incself=False):
    """Yield a taxon's ancestors, nearest first, until no parent is found.

    ``taxon`` is either a taxon record or an integer taxon ID. When
    ``incself`` is true, the taxon itself is yielded before its ancestors.
    """
    if isinstance(taxon, int):
        taxon = gettaxon(taxon)

    current = taxon if incself else getparent(taxon)

    while current is not None:
        yield current
        current = getparent(current)

## Manual edit data

In [18]:
# Hand-curated (genus, species) -> taxon ID matches.
# When combined with the automatic name matches below, an entry here is
# required for every species whose automatic match is None, and must equal the
# automatic match for any species that has one.
# NOTE(review): the IDs are presumably NCBI taxonomy IDs -- confirm.
manual_species_matches = {
    ('Actinomyces', 'odontolyticus'): 1660,
    ('Azospirillum', 'brasilense'): 192,
    ('Francisella', 'noatunensis'): 657445,
    ('Lachnoclostridium', '[Clostridium] clostridioforme'): 1531,
    ('Mobiluncus', 'curtisii'): 2051,
    ('Mycobacterium', 'intracellulare'): 1767,
    ('Pectobacterium', 'carotovorum'): 554,
    ('Pectobacterium', 'wasabiae'): 55208,
    ('Photorhabdus', 'luminescens'): 29488,
    ('Photorhabdus', 'temperata'): 574560,
    ('Pseudoalteromonas', 'haloplanktis'): 228,
    ('Pseudomonas', 'pseudoalcaligenes'): 301,
    ('Salinispora', 'pacifica'): 351187,
    ('Vibrio', 'alginolyticus'): 663,
    ('Vibrio', 'tasmaniensis'): 212663,
    ('Xanthomonas', 'alfalfae'): 366650,
    ('Xanthomonas', 'axonopodis'): 53413,
    ('Xanthomonas', 'campestris'): 339,
    ('Xanthomonas', 'fuscans'): 366649,
    ('Bifidobacterium', 'kashiwanohense'): 630129,
    ('Enterobacter', 'xiangfangensis'): 1296536,
    ('Mycobacterium', 'africanum'): 33894,
    ('Mycobacterium', 'bovis'): 1765,
    ('Xanthomonas', 'gardneri'): 2754056,
}

In [19]:
# Alternate taxon assignments that replace the matched taxon for a species.
# A taxid of 0 is a placeholder: it is turned into None (no taxon) where these
# overrides are applied.
_rows = [
    ('Francisella', 'noatunensis', 299583),
    ('Pseudoalteromonas', 'haloplanktis', 0),
    ('Xanthomonas', 'alfalfae', 456327),
    ('Xanthomonas', 'axonopodis', 1985254),
    ('Xanthomonas', 'fuscans', 346),
    ('Bifidobacterium', 'kashiwanohense', 1686),
    ('Enterobacter', 'xiangfangensis', 158836),
    ('Mycobacterium', 'africanum', 1773),
    ('Mycobacterium', 'bovis', 1773),
    ('Xanthomonas', 'gardneri', 56454),
]

# One row per species, indexed by (genus, species).
species_alternate_assignments = (
    pd.DataFrame
    .from_records(_rows, columns=['genus', 'species', 'taxid'])
    .set_index(['genus', 'species'])
)

In [20]:
_rows = [
    ('Azospirillum', 'brasilense', 1064539, 1, 'remove'),
    ('Mobiluncus', 'curtisii', 887899, 1, 'remove'),
    ('Mycobacterium', 'intracellulare', 1138383, 1, 'remove'),
    ('Pectobacterium', 'carotovorum', 2778550, 2, 'create'),
    ('Pectobacterium', 'carotovorum', 180957, 22, 'create'),
    ('Pectobacterium', 'carotovorum', 78398, 2, 'create'),
    ('Pectobacterium', 'wasabiae', 1905730, 3, 'create'),
    ('Photorhabdus', 'luminescens', 2218628, 2, 'create'),
    ('Photorhabdus', 'temperata', 230089, 1, 'remove'),
    ('Salinispora', 'pacifica', 1137263, 2, 'create'),
    ('Salinispora', 'pacifica', 1050199, 3, 'create'),
    ('Salinispora', 'pacifica', 999544, 2, 'create'),
    ('Salinispora', 'pacifica', 999545, 3, 'create'),
    ('Vibrio', 'alginolyticus', 50719, 3, 'create'),
    ('Vibrio', 'tasmaniensis', 693153, 1, 'remove'),
    ('Xanthomonas', 'campestris', 56459, 5, 'reassign'),
    ('Xanthomonas', 'campestris', 56448, 13, 'reassign'),
]

additional_edits = pd.DataFrame.from_records(_rows, columns=['genus', 'species', 'taxid', 'ngenomes', 'action'])
additional_edits.set_index(['genus', 'species'], inplace=True)

## Combine manual with automatic matches

In [21]:
# Build one (genus, species, taxid, method) row per curated species, taking
# the automatic name match where there is one and the manual match otherwise.
_rows = []

for sp, auto_match in species_name_matches.items():
    manual_taxid = manual_species_matches.get(sp)

    if auto_match is not None:
        auto_taxid = auto_match['matched_taxid']

        # Assert manual matches confirm auto matches where they exist
        assert manual_taxid is None or manual_taxid == auto_taxid

        _rows.append((*sp, auto_taxid, 'automatic'))
        continue

    # No automatic match, so a manual one must have been provided
    assert manual_taxid is not None
    _rows.append((*sp, manual_taxid, 'manual'))

# Index by (genus, species) and sort by that index.
species_matches = (
    pd.DataFrame
    .from_records(_rows, columns=['genus', 'species', 'taxid', 'method'])
    .set_index(['genus', 'species'])
    .sort_index()
)

## Find species/genome -> taxon assignments

In [22]:
# Start from the matched taxon of every species, as plain Python ints.
species_assignments = {
    sp: int(taxid)
    for sp, taxid in species_matches['taxid'].items()
}

# Override matches with alternate assignments where appropriate.
# A taxid of 0 in the alternate table means the species is assigned no taxon.
for sp, taxid in species_alternate_assignments['taxid'].items():
    if taxid == 0:
        species_assignments[sp] = None
    else:
        species_assignments[sp] = int(taxid)

In [23]:
# Group the additional edits by species once up front, rather than re-scanning
# the whole table for every species:
#   {(genus, species): {taxid: (ngenomes, action)}}
edits_by_species = dict()

for edit_sp, (edit_taxid, ngenomes, action) in additional_edits.iterrows():
    edits_by_species.setdefault(edit_sp, dict())[edit_taxid] = (ngenomes, action)

# Genome key -> assigned taxon ID, or None where the genome gets no taxon
genome_taxon_assignments = dict()

for sp in species_names:
    sp_taxid = species_assignments[sp]

    # Additional edits for this species, keyed by the taxid they apply to
    sp_edits = edits_by_species.get(sp, dict())

    # Number of genomes each edit was actually applied to
    edit_counts = Counter()

    for key in genomes_by_species[sp]:
        assembly_id = assembly_ids[key]
        assembly_taxid = assembly_taxids[assembly_id]

        # Go through ancestors (starting with the assembly's own taxon) to see
        # if any edits apply; the first one that has an edit wins
        for ancestor in iter_ancestors(assembly_taxid, incself=True):
            ataxid = ancestor['taxid']
            if ataxid in sp_edits:
                action = sp_edits[ataxid][1]

                # Assign according to edit
                if action in ('reassign', 'create'):
                    genome_taxon_assignments[key] = ataxid
                elif action == 'remove':
                    genome_taxon_assignments[key] = None
                else:
                    assert 0, 'unknown edit action {!r} for taxid {}'.format(action, ataxid)

                edit_counts[ataxid] += 1
                break

        else:
            # No edit applied (loop ended without break): assign to taxon of species
            genome_taxon_assignments[key] = sp_taxid

    # Check that edits were applied to the appropriate # of genomes
    expected_counts = {taxid: n for taxid, (n, action) in sp_edits.items()}
    assert edit_counts == expected_counts, (
        '{}: edits applied to {} genomes, expected {}'.format(sp, dict(edit_counts), expected_counts)
    )

## Analysis

### Merged taxa

In [24]:
# Taxon IDs that more than one curated species was matched to.
taxid_counts = Counter(species_matches['taxid'])
merged_taxids = {taxid for taxid in taxid_counts if taxid_counts[taxid] > 1}

In [25]:
# Show the species rows whose matched taxon is shared with another species.
is_merged = species_matches['taxid'].isin(merged_taxids)
species_matches[is_merged]

Unnamed: 0_level_0,Unnamed: 1_level_0,taxid,method
genus,species,Unnamed: 2_level_1,Unnamed: 3_level_1
Bacillus,mycoides,1405,automatic
Bacillus,weihenstephanensis,1405,automatic
Streptomyces,californicus,67351,automatic
Streptomyces,puniceus,67351,automatic
Vibrio,albensis,666,automatic
Vibrio,cholerae,666,automatic


## Write output

In [26]:
# Write each table to its CSV file in the processed output directory.
csv_outputs = [
    (species_matches, '201218-species-taxon-matches.csv'),
    (species_alternate_assignments, '201218-species-alternate-taxonomy-assignments.csv'),
    (additional_edits, '201218-additional-taxonomy-edits.csv'),
]

for frame, filename in csv_outputs:
    frame.to_csv(processed_out / filename)

In [27]:
# Write the genome key -> taxon ID assignments (None where a genome was given
# no taxon) as JSON.
# NOTE(review): this goes under tmp/ relative to the working directory rather
# than processed_out like the CSV outputs -- confirm that is intended.
assignments_path = Path('tmp/genome-taxon-assignments.json')

# open() does not create missing directories, so make sure tmp/ exists first.
assignments_path.parent.mkdir(parents=True, exist_ok=True)

with open(assignments_path, 'wt') as f:
    json.dump(genome_taxon_assignments, f)