In [1]:
# Identifiers for this notebook. `exptname` and `nbname` together name the
# output directory; `datestr` also prefixes the exported file name.
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201206'
nbname = f'{datestr}-compare-updated-genome-taxids'

In [2]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
from collections import Counter

In [3]:
from tqdm import tqdm
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [4]:
from midas.db.models import Genome

## File paths

In [5]:
# Input locations, keyed by the short names used throughout the notebook.
# NOTE(review): 'v11_archive' is an absolute path on one particular machine;
# it has to be edited before the notebook can run anywhere else.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    'taxa_additional': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201205-download-additional-taxa/'),
    'taxonomy_original': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201113-original-genome-taxa/'),
    'taxonomy_original_extra': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201124-original-taxa-extra-info/'),
    'matches': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201122-taxon-name-matching/'),
    'updated_taxids': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201201-download-updated-assembly-summaries/updated-assembly-taxids.json'),
}

In [6]:
# Output directory for this notebook: <processed>/<exptname>/<nbname>.
# It is created (including missing parents) if it does not exist yet.
processed_out = Path('../../data/processed/', exptname, nbname)
processed_out.mkdir(parents=True, exist_ok=True)

## Load data

### Archive files

In [7]:
# The archive file is read through GzipFile and the decompressed stream is
# handed to ZipFile. The handle stays open because a later cell reads
# another member from it.
archive_v11 = ZipFile(GzipFile(filename=infiles['v11_archive']))

# Show the archive's 'info' member as text.
archive_v11.read('info').decode('utf-8')

'{"archive_version": "1.0"}'

In [8]:
# Parse the JSON member describing the curated genome set.
gset_data = json.loads(archive_v11.read('genome_sets/midas/assembly/curated'))

In [9]:
# Group genome keys by the (genus, species) pair recorded in their annotations.
genomes_by_species = {}

for genome_key, annotation in gset_data['annotations'].items():
    species_pair = (annotation['tax_genus'], annotation['tax_species'])
    if species_pair not in genomes_by_species:
        genomes_by_species[species_pair] = set()
    genomes_by_species[species_pair].add(genome_key)

# Sorted distinct (genus, species) pairs, and the sorted distinct genera among them.
species_names = sorted(genomes_by_species)
genus_names = sorted({genus for genus, _ in species_names})

# Displayed as the cell result: (number of genera, number of species).
len(genus_names), len(species_names)

(419, 1438)

### Taxonomy

#### Current data

In [10]:
# Build a single taxid -> taxon record lookup from both downloads.
# 'taxa_additional' is applied second, so its record wins whenever a taxid
# appears in both files.
taxon_data = {}

for source in ('taxa', 'taxa_additional'):
    with open(infiles[source] / 'taxa.json') as taxa_file:
        taxon_data.update((record['taxid'], record) for record in json.load(taxa_file))

In [11]:
# Alias table used by resolve_alias(). JSON object keys are strings, so they
# are converted to int here; the values are stored exactly as loaded.
with open(infiles['taxa'] / 'aka_taxids.json') as aka_file:
    aka_taxids = dict((int(alias), target) for alias, target in json.load(aka_file).items())

#### Original 2016 data

In [12]:
# Per-taxid summaries from the original taxonomy data, re-keyed by integer
# taxid (the JSON object keys are strings).
with open(infiles['taxonomy_original'] / 'original-tax-summaries.json') as summaries_file:
    orig_tax_summaries = dict(
        (int(taxid), summary) for taxid, summary in json.load(summaries_file).items()
    )

### Name matches

In [13]:
# (curated genus, curated species) -> match record, or None when the record
# has no matched taxid. The two curated name fields are popped off each
# record, so a stored record no longer contains them.
species_name_matches = {}

with open(infiles['matches'] / 'species-name-matches.json') as matches_file:
    for entry in json.load(matches_file):
        genus = entry.pop('curated_genus')
        species = entry.pop('curated_species')
        if entry['matched_taxid'] is None:
            species_name_matches[(genus, species)] = None
        else:
            species_name_matches[(genus, species)] = entry

### Database connection

In [14]:
# NOTE(review): this is a relative SQLite URL, so 'db.sqlite' is looked up in
# the kernel's working directory; unlike the other inputs it is not listed in
# `infiles`.
engine = create_engine('sqlite:///db.sqlite')

# Session factory bound to that engine.
Session = sessionmaker(bind=engine)

In [15]:
# Single session shared by the Genome queries in the following cells.
session = Session()

In [16]:
# Genome.key -> Genome.entrez_id for every Genome row in the database.
assembly_ids = dict((genome.key, genome.entrez_id) for genome in session.query(Genome))

In [17]:
# Genome.entrez_id -> the 'ncbi_taxid' entry of that Genome's `extra` data,
# i.e. the taxid stored in the database for each assembly.
orig_assembly_taxids = dict(
    (genome.entrez_id, genome.extra['ncbi_taxid']) for genome in session.query(Genome)
)

### Updated assembly taxids

In [18]:
# Updated taxid per assembly, re-keyed by integer ID (the JSON object keys
# are strings) so the keys are comparable with those of orig_assembly_taxids.
with open(infiles['updated_taxids']) as updated_file:
    updated_assembly_taxids = dict(
        (int(assembly_id), taxid) for assembly_id, taxid in json.load(updated_file).items()
    )

In [19]:
# Sanity check: the updated and original mappings must cover exactly the same
# assembly IDs. On failure the message lists the IDs found in only one of the
# two mappings, so the mismatch can be traced without re-running anything.
assert updated_assembly_taxids.keys() == orig_assembly_taxids.keys(), (
    'assembly IDs present in only one of the two mappings: {!r}'.format(
        updated_assembly_taxids.keys() ^ orig_assembly_taxids.keys()
    )
)

## Function definitions

In [20]:
def resolve_alias(tid):
    """Return the taxid that `tid` maps to in `aka_taxids`, or `tid` itself if it has no entry.

    Only a single lookup is performed; the result is not resolved again.
    """
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

def taxid_eq(tid1, tid2):
    """Return True if both taxids are equal after each is passed through resolve_alias()."""
    left = resolve_alias(tid1)
    right = resolve_alias(tid2)
    return left == right

In [21]:
def gettaxon(tid):
    """Look up a taxon record in `taxon_data` by ID, after alias resolution.

    Raises KeyError if the resolved ID is not in `taxon_data`.
    """
    canonical_id = resolve_alias(tid)
    return taxon_data[canonical_id]

In [22]:
def getparent(taxon):
    """Return the parent record of a taxon, or None if it cannot be found.

    `taxon` is either a taxon record or an int taxid, which is first looked
    up with gettaxon(). The parent ID is resolved through gettaxon() as well,
    so alias IDs are handled. None is returned when the record has no
    'parent_taxid' key or when looking up the parent raises KeyError.
    """
    record = gettaxon(taxon) if isinstance(taxon, int) else taxon

    try:
        parent_id = record['parent_taxid']
        return gettaxon(parent_id)
    except KeyError:
        return None

In [23]:
def iter_ancestors(taxon, incself=False):
    """Yield the ancestors of a taxon, nearest first, by following getparent().

    `taxon` is either a taxon record or an int taxid, which is first looked
    up with gettaxon(). With `incself=True` the starting taxon itself is
    yielded first. Iteration stops when getparent() returns None.

    NOTE(review): there is no cycle guard, so this assumes the parent links
    eventually terminate.
    """
    start = gettaxon(taxon) if isinstance(taxon, int) else taxon
    current = start if incself else getparent(start)

    while True:
        if current is None:
            return
        yield current
        current = getparent(current)

In [24]:
def taxon_url(taxid):
    """Return the NCBI Taxonomy Browser "Info" page URL for the given taxid."""
    template = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=%d'
    return template % taxid

## Find genomes with reassigned taxids

In [25]:
# Build one row per genome whose taxid changed, i.e. whose original taxid
# (from the database) and updated taxid differ after alias resolution.
_rows = []

for sp in species_names:
    match = species_name_matches[sp]
    mtaxid = None if match is None else match['matched_taxid']

    # The genome keys live in a set, whose iteration order can differ from
    # one kernel run to the next; sorting keeps the row order of `df` (and of
    # the CSV written from it) reproducible.
    for key in sorted(genomes_by_species[sp]):
        aid = assembly_ids[key]
        ntid = resolve_alias(updated_assembly_taxids[aid])
        otid = orig_assembly_taxids[aid]
        otid2 = resolve_alias(otid)

        if otid2 == ntid:
            continue  # taxid unchanged, nothing to report

        # Summaries are keyed by the unresolved original taxid.
        s = orig_tax_summaries[otid]

        # True when the species' matched taxid is the old taxon or one of its ancestors.
        # NOTE(review): this walks the lineage of the OLD taxid (otid2), but the
        # result is stored in the 'new_in_match' column; confirm whether `ntid`
        # was intended here. Behaviour is left unchanged.
        in_match = mtaxid is not None and any(
            taxon['taxid'] == mtaxid for taxon in iter_ancestors(otid2, incself=True)
        )
        # True when the original taxon's (genus, species) differs from the curated pair.
        reclassified = (s['genus'], s['species']) != sp

        _rows.append((
            *sp,
            key.split('/')[-1],  # last path component of the genome key
            otid2,
            ntid,
            mtaxid is not None,
            in_match,
            reclassified,
        ))

df = pd.DataFrame.from_records(
    _rows,
    columns=[
        'genus', 'species', 'genome', 'old_taxid', 'new_taxid',
        'species_has_match', 'new_in_match', 'reclassified',
    ],
)

In [26]:
# Number of reassigned genomes for each combination of the three boolean flags.
(
    df
    .groupby(['species_has_match', 'new_in_match', 'reclassified'])
    .size()
)

species_has_match  new_in_match  reclassified
False              False         False            26
                                 True              1
True               False         True             72
                   True          False           329
dtype: int64

In [27]:
# Write the reassignment table to the output directory, without the index column.
df.to_csv(
    processed_out / f'{datestr}-genome-taxid-reassignments.csv',
    index=False,
)