# 201109 Match taxa

In [1]:
# Identifiers used to build this notebook's output directory and file names.
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201109'
nbname = ''.join([datestr, '-match-taxa'])

In [2]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
from datetime import date
import re


import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from midas.db.models import Genome, ReferenceGenomeSet, Taxon, AnnotatedGenome

from midas.ncbi import entrez_url

## File paths

In [4]:
# Input locations read by this notebook.
# NOTE(review): 'v1_archive' is an absolute, machine-specific path given as a
# plain string, while 'taxa' is a Path relative to the working directory.
infiles = {
    'v1_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
}

In [5]:
# SQLite database file, given relative to the notebook's working directory.
dbfile = Path('./db.sqlite')

In [6]:
# Directory for this notebook's processed outputs; created (with parents) if missing.
processed_out = Path('../../data/processed/').joinpath(exptname, nbname)
processed_out.mkdir(parents=True, exist_ok=True)

In [7]:
# Directory for this notebook's intermediate outputs; created (with parents) if missing.
intermediate_out = Path('../../data/intermediate/').joinpath(exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

### v1.1 archive

In [8]:
# The archive is read as a zip file through a gzip decompression layer.
# The handles are left open because later cells read members from archive_zip.
_archive_gz = GzipFile(infiles['v1_archive'])
archive_zip = ZipFile(_archive_gz)
archive_zip.read('info').decode()

'{"archive_version": "1.0"}'

In [9]:
# Parse the curated genome set member of the archive as JSON.
_gset_member = 'genome_sets/midas/assembly/curated'
with archive_zip.open(_gset_member) as gset_file:
    gset_data = json.load(gset_file)

In [10]:
# Collect the distinct genus names and (genus, species) name pairs found in
# the genome set's annotations, each as a sorted list.
_annotation_values = list(gset_data['annotations'].values())

genus_names = sorted({adata['tax_genus'] for adata in _annotation_values})
species_names = sorted({(adata['tax_genus'], adata['tax_species']) for adata in _annotation_values})

len(genus_names), len(species_names)

(419, 1438)

### Taxonomy

In [11]:
_taxa_json_path = infiles['taxa'] / 'taxa.json'
with open(_taxa_json_path) as taxa_file:
    _taxon_records = json.load(taxa_file)

# Re-key the loaded records by their 'taxid' value for direct lookup.
taxon_data = {}
for tdata in _taxon_records:
    taxon_data[tdata['taxid']] = tdata

In [12]:
_aka_json_path = infiles['taxa'] / 'aka_taxids.json'
with open(_aka_json_path) as aka_file:
    _aka_raw = json.load(aka_file)

# JSON object keys are always strings; convert them to int so lookups by
# integer ID work. Values are kept as loaded.
aka_taxids = {}
for src_id, dst_id in _aka_raw.items():
    aka_taxids[int(src_id)] = dst_id

## Database connection

In [13]:
# Engine and session factory for the SQLite database at `dbfile`.
_db_url = 'sqlite:///%s' % dbfile
engine = create_engine(_db_url)
Session = sessionmaker(engine)

In [14]:
# Open the session used for the database queries in this notebook.
session = Session()

## Func defs

In [15]:
def resolve_alias(tid):
    """Return the ID that ``tid`` maps to in ``aka_taxids``, or ``tid`` itself if it has no entry."""
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

In [16]:
def gettaxon(tid):
    """Look up a taxon record by ID, after passing the ID through ``resolve_alias``.

    Raises ``KeyError`` if the resolved ID is not a key of ``taxon_data``.
    """
    resolved_tid = resolve_alias(tid)
    return taxon_data[resolved_tid]

In [17]:
def getparent(taxon):
    """Get taxon's parent, handling aliases of parent id.

    ``taxon`` may be a taxon record (a mapping with a ``'parent_taxid'``
    entry) or an integer taxon ID. An integer ID is looked up with
    ``gettaxon`` so that alias IDs are resolved the same way as for the
    parent ID.

    Returns the parent's record, or ``None`` when the record has no
    ``'parent_taxid'`` entry or the (alias-resolved) parent ID is not in
    ``taxon_data``.

    Raises ``KeyError`` if an integer ``taxon`` cannot be found.
    """
    if isinstance(taxon, int):
        # Go through gettaxon() rather than indexing taxon_data directly so
        # that alias IDs are accepted here too.
        taxon = gettaxon(taxon)

    try:
        return gettaxon(taxon['parent_taxid'])
    except KeyError:
        # Either no parent ID is recorded or the parent is not in taxon_data.
        return None

In [18]:
def find_ancestor(taxon, accept, incself=False):
    """Find first ancestor of taxon for which accept() returns True.

    Parameters
    ----------
    taxon
        Taxon record, or an integer taxon ID (looked up with ``gettaxon``).
    accept
        Callable taking a taxon record and returning a truthy value for an
        acceptable taxon.
    incself
        If true, ``taxon`` itself is tested before any of its ancestors.

    Returns
    -------
    The first accepted taxon record walking up via ``getparent``, or ``None``
    if the walk ends without a match.
    """
    if isinstance(taxon, int):
        # Resolve IDs up front so accept() always receives a record, including
        # when incself=True (previously the raw int was passed to accept()).
        taxon = gettaxon(taxon)

    if not incself:
        taxon = getparent(taxon)

    while taxon is not None and not accept(taxon):
        taxon = getparent(taxon)

    return taxon

In [19]:
def find_ancestor_of_rank(taxon, rank):
    """Return ``taxon`` or its nearest ancestor whose ``'rank'`` equals ``rank``, or ``None`` if there is none."""
    def has_rank(candidate):
        return candidate['rank'] == rank

    return find_ancestor(taxon, has_rank, incself=True)

In [20]:
def normalize_name(name):
    """Return a canonical form of a taxon name for comparison.

    A non-string ``name`` (e.g. a ``(genus, species)`` tuple) is joined with
    single spaces. The result is lowercased and a leading ``'candidatus '``
    prefix is removed.
    """
    joined = name if isinstance(name, str) else ' '.join(name)
    lowered = joined.lower()

    prefix = 'candidatus '
    if lowered.startswith(prefix):
        # Drop the prefix including its trailing space.
        return lowered[len(prefix):]

    return lowered

def name_eq(name1, name2):
    """Return True if the two names are equal after ``normalize_name``."""
    first = normalize_name(name1)
    second = normalize_name(name2)
    return first == second

In [21]:
def taxon_url(taxid):
    """Return the NCBI Taxonomy Browser info-page URL for the given numeric taxon ID."""
    url_template = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=%d'
    return url_template % taxid

## Non-alphabetic genus and species names

Curated genus/species names in the previous database version are expected to contain only alphabetic characters. Check for instances where this does not hold:

In [22]:
# Print every genus name that is not a single capitalized alphabetic word.
_genus_pattern = re.compile(r'[A-Z][a-z]+')

for name in genus_names:
    if not _genus_pattern.fullmatch(name):
        print(name)

(No genus names with unexpected format)

In [23]:
# Print every (genus, species) pair whose species name is not purely
# alphabetic (case-insensitive).
_species_pattern = re.compile(r'[a-z]+', re.I)

for g, s in species_names:
    if _species_pattern.fullmatch(s):
        continue
    print('%25s\t%s' % (g, s))

            Aphanizomenon	flos-aquae
                  Blautia	[Ruminococcus] gnavus
             Caballeronia	Paraburkholderia zhejiangensis
   Erysipelatoclostridium	[Clostridium] innocuum
              Haemophilus	[Haemophilus] parasuis
               Klebsiella	[Enterobacter] aerogenes
        Lachnoclostridium	[Clostridium] bolteae
        Lachnoclostridium	[Clostridium] citroniae
        Lachnoclostridium	[Clostridium] clostridioforme
        Lachnoclostridium	[Clostridium] symbiosum
              Pasteurella	[Pasteurella] pneumotropica
              Pseudomonas	syringae group genomosp. 7
        Ruminiclostridium	[Clostridium] clariflavum
        Ruminiclostridium	[Clostridium] josui
        Ruminiclostridium	[Clostridium] stercorarium
        Ruminiclostridium	[Eubacterium] siraeum
                Wolbachia	endosymbiont of Culex quinquefasciatus
                Wolbachia	endosymbiont of Drosophila melanogaster


## Find genus and species ancestors

Create dictionaries mapping all taxa to their ancestors with genus or species rank, where they exist.

In [24]:
# Seed the map with every species-rank taxon pointing at itself.
species_ancestors = {}
for tid, taxon in taxon_data.items():
    if taxon['rank'] == 'species':
        species_ancestors[tid] = tid

# For each taxon, walk up the parent chain until reaching a taxon already in
# the map, then record that species ID for every taxon visited on the way.
# Taxa whose chain ends without hitting the map are left out.
for taxon in taxon_data.values():
    lineage = []
    current = taxon

    while current is not None:
        current_tid = current['taxid']
        lineage.append(current_tid)

        if current_tid in species_ancestors:
            species_tid = species_ancestors[current_tid]
            species_ancestors.update(dict.fromkeys(lineage, species_tid))
            break

        current = getparent(current)

In [25]:
# Seed the map with every genus-rank taxon pointing at itself.
genus_ancestors = {}
for tid, taxon in taxon_data.items():
    if taxon['rank'] == 'genus':
        genus_ancestors[tid] = tid

# For each taxon, walk up the parent chain until reaching a taxon already in
# the map, then record that genus ID for every taxon visited on the way.
# Taxa whose chain ends without hitting the map are left out.
for taxon in taxon_data.values():
    lineage = []
    current = taxon

    while current is not None:
        current_tid = current['taxid']
        lineage.append(current_tid)

        if current_tid in genus_ancestors:
            genus_tid = genus_ancestors[current_tid]
            genus_ancestors.update(dict.fromkeys(lineage, genus_tid))
            break

        current = getparent(current)

## Group original taxa by curated taxa

All genomes have their original NCBI taxonomy ID stored in metadata. In most cases, the correct taxon to use for each species/genus in the curated database should be one of these or one of their ancestors.

In [26]:
# For every curated species and genus, gather the (alias-resolved) NCBI taxon
# IDs recorded on the genomes annotated with that name.
genome_taxids_by_db_species = {db_species: set() for db_species in species_names}
genome_taxids_by_db_genus = {db_genus: set() for db_genus in genus_names}

_annotations = gset_data['annotations']

for genome in session.query(Genome):
    annotation = _annotations[genome.key]
    tid = resolve_alias(genome.extra['ncbi_taxid'])

    species_key = (annotation['tax_genus'], annotation['tax_species'])
    genome_taxids_by_db_species[species_key].add(tid)
    genome_taxids_by_db_genus[annotation['tax_genus']].add(tid)

In [27]:
# Replace each genome taxon ID by its species-rank ancestor; IDs without one
# are dropped.
species_taxids_by_db_species = {}

for db_species, taxids in genome_taxids_by_db_species.items():
    species_taxids_by_db_species[db_species] = {
        species_ancestors[tid] for tid in taxids if tid in species_ancestors
    }

In [28]:
# Replace each genome taxon ID by its genus-rank ancestor; IDs without one
# are dropped.
genus_taxids_by_db_genus = {}

for db_genus, taxids in genome_taxids_by_db_genus.items():
    genus_taxids_by_db_genus[db_genus] = {
        genus_ancestors[tid] for tid in taxids if tid in genus_ancestors
    }

## Name matching

Try to assign NCBI taxa to curated genus/species by simple matching on names, restricted to the case that the match is an ancestor of the original NCBI-assigned taxon of one of the genomes.

Note that some of the genus names in the downloaded NCBI taxa are designated [Candidatus](https://www.wikiwand.com/en/Candidatus), so this prefix must be removed from the name before trying to match it to the genus names from the source v1 database. See the `normalize_name` and `name_eq` functions.

In [29]:
# Map each curated genus to the candidate NCBI genus taxon with the same
# (normalized) name; genera without a match stay None.
genus_map = dict.fromkeys(genus_names)

for db_genus, taxids in genus_taxids_by_db_genus.items():
    for taxid in taxids:
        if not name_eq(gettaxon(taxid)['scientific_name'], db_genus):
            continue

        # At most one candidate is expected to match by name.
        assert genus_map[db_genus] is None
        genus_map[db_genus] = taxid

In [30]:
# Map each curated (genus, species) pair to the candidate NCBI species taxon
# with the same (normalized) name; species without a match stay None.
species_map = dict.fromkeys(species_names)

for db_species, taxids in species_taxids_by_db_species.items():
    for taxid in taxids:
        if not name_eq(gettaxon(taxid)['scientific_name'], db_species):
            continue

        # At most one candidate is expected to match by name.
        assert species_map[db_species] is None
        species_map[db_species] = taxid

In [31]:
# (number of unmatched genera, total genera)
sum(1 for v in genus_map.values() if v is None), len(genus_map)

(3, 419)

In [32]:
# (number of unmatched species, total species)
sum(1 for v in species_map.values() if v is None), len(species_map)

(101, 1438)

## Try matching remaining genomes

In [33]:
# List each unmatched curated genus next to its candidate NCBI genus taxa.
_unmatched_genera = [k for k, v in genus_map.items() if v is None]

for k in _unmatched_genera:
    for tid in genus_taxids_by_db_genus[k]:
        candidate_name = gettaxon(tid)['scientific_name']
        print(k, candidate_name, taxon_url(tid), sep='\t')

Lysinimicrobium	Demequina	https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=577469
Nautella	Phaeobacter	https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=302485
Turicella	Corynebacterium	https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1716


### Turicella

The [Taxonomy database entry](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1716&lvl=3&lin=f&keep=1&srchmode=1&unlock) for Corynebacterium lists Turicella as a "heterotypic synonym", so make this assignment manually.

In [34]:
# Manual assignment: name matching leaves the curated genus Turicella
# unmapped, so point it at the NCBI Corynebacterium taxon directly.
genus_map['Turicella'] = 1716  # Corynebacterium taxid

## Write output

### Curated -> NCBI taxon mappings to JSON

In [35]:
# Write the curated genus -> NCBI taxon ID mapping (None where unmapped).
_genus_map_path = intermediate_out / 'genus-map.json'

with _genus_map_path.open('w') as f:
    json.dump(genus_map, f)

In [36]:
# JSON object keys must be strings, so the (genus, species) tuple keys are
# flattened into a list of records instead.
_sp_map_json = []
for (g, s), taxid in species_map.items():
    _sp_map_json.append(dict(curated_species=s, curated_genus=g, ncbi_taxid=taxid))

_species_map_path = intermediate_out / 'species-map.json'

with _species_map_path.open('w') as f:
    json.dump(_sp_map_json, f)

### Summarize remaining problems

In [37]:
# Build one row per (unmapped curated taxon, candidate NCBI taxon) pair:
# unmapped genera first, then unmapped species.
_rows = []

for g in sorted(genus_names):
    if genus_map[g] is None:
        # Candidate IDs are held in a set; iterate them in sorted order so the
        # row order is deterministic.
        for tid in sorted(genus_taxids_by_db_genus[g]):
            taxon = gettaxon(tid)
            _rows.append((g, None, tid, taxon['scientific_name'], taxon_url(tid)))

for sp in sorted(species_names):
    if species_map[sp] is None:
        # Previously these rows were sorted on (genus, species), which is the
        # same for every row of one species and so had no effect. Order by
        # candidate taxon ID instead.
        for tid in sorted(species_taxids_by_db_species[sp]):
            taxon = gettaxon(tid)
            _rows.append((*sp, tid, taxon['scientific_name'], taxon_url(tid)))

df = pd.DataFrame.from_records(_rows, columns=['curated_genus', 'curated_species', 'taxon_id', 'taxon_name', 'taxon_url'])

In [38]:
# Save the table of still-unmapped curated taxa and their candidates.
_unmapped_csv_name = datestr + '-db-v1.1-unmapped-taxa.csv'
df.to_csv(processed_out / _unmapped_csv_name, index=False)