# 201124 original taxa extra info

In [2]:
# Identifiers for this notebook; exptname and nbname are later used to build
# the output directory path.
datestr = '201124'
exptname = '201031-database-v1.1-software-version-migration'
nbname = f'{datestr}-original-taxa-extra-info'

In [3]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
from collections import Counter

In [4]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [5]:
from midas.db.models import Genome

## File paths

In [6]:
# Input locations read by this notebook.
# NOTE: 'v11_archive' is a plain string holding an absolute path; the other
# entries are Path objects given as relative paths.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    # Disabled entry (not referenced anywhere in this notebook):
    # 'taxonomy_additional': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201109-extract-additional-taxonomy-data/'),
    'taxonomy_original': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201113-original-genome-taxa/'),
    'matches': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201122-taxon-name-matching/'),
}

In [7]:
# Output directory for this notebook: <intermediate data>/<experiment>/<notebook>.
# Created (with any missing parents) if it does not exist yet.
intermediate_out = Path('../../data/intermediate/', exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

### Archive files

In [8]:
# The archive file is opened as a gzip stream and the decompressed stream is
# then read as a zip file. The handle is kept open for the cells that follow.
_v11_gzip_stream = GzipFile(infiles['v11_archive'])
archive_v11 = ZipFile(_v11_gzip_stream)

# Show the archive's 'info' member as text.
archive_v11.read('info').decode()

'{"archive_version": "1.0"}'

In [9]:
# Parse the JSON member describing the curated genome set from the archive.
gset_data = json.loads(archive_v11.read('genome_sets/midas/assembly/curated'))

In [10]:
# Collect the distinct genus names and distinct (genus, species) pairs that
# appear in the genome set's annotations, each as a sorted list.
_annotations = list(gset_data['annotations'].values())

genus_names = sorted({ann['tax_genus'] for ann in _annotations})
species_names = sorted({(ann['tax_genus'], ann['tax_species']) for ann in _annotations})

# Display how many distinct genera and species were found.
len(genus_names), len(species_names)

(419, 1438)

### Database

In [11]:
# SQLite database file 'db.sqlite', given as a relative path in the URL.
engine = create_engine('sqlite:///db.sqlite')

# Session factory bound to that engine.
Session = sessionmaker(bind=engine)

In [12]:
# Session created from the factory above; used later to query Genome rows.
session = Session()

### Taxonomy

#### Current data

In [13]:
# Load the list of taxon records from the 'taxa' input directory.
with (infiles['taxa'] / 'taxa.json').open() as f:
    _taxon_records = json.load(f)

# Index the records by their 'taxid' field for direct lookup.
taxon_data = {}
for _record in _taxon_records:
    taxon_data[_record['taxid']] = _record

In [14]:
# Load the alias mapping. JSON object keys are strings, so the alias IDs are
# converted back to int; the mapped-to values are kept as loaded.
with (infiles['taxa'] / 'aka_taxids.json').open() as f:
    _aka_raw = json.load(f)

aka_taxids = {int(alias_id): target_id for alias_id, target_id in _aka_raw.items()}

In [15]:
# Sanity-check the taxon table:
#   * each record is stored under its own 'taxid',
#   * no key of taxon_data is an alias taxonomy ID,
#   * every parent is either 0 or another entry of taxon_data.
for taxid, taxon in taxon_data.items():
    assert taxid == taxon['taxid']
    assert taxid not in aka_taxids
    parent_taxid = taxon['parent_taxid']
    assert parent_taxid == 0 or parent_taxid in taxon_data

#### Original 2016 data

In [16]:
# Load the original taxonomy summaries. JSON object keys are strings, so they
# are converted to int taxonomy IDs.
with (infiles['taxonomy_original'] / 'original-tax-summaries.json').open() as f:
    _summaries_raw = json.load(f)

orig_tax_summaries = {int(taxid): summary for taxid, summary in _summaries_raw.items()}

In [17]:
# Load the per-species records of matching genome taxonomy IDs.
with (infiles['taxonomy_original'] / 'genome-matching-taxids-by-species.json').open() as f:
    _species_taxid_records = json.load(f)

# Re-key by (curated genus, curated species) and store the taxids as a set.
species_genome_taxids = {}
for _rec in _species_taxid_records:
    species_genome_taxids[_rec['curated_genus'], _rec['curated_species']] = set(_rec['taxids'])

## Function definitions

In [18]:
def resolve_alias(tid):
    """Return the ID that ``tid`` maps to in ``aka_taxids``, or ``tid`` itself if it has no entry."""
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

In [19]:
def gettaxon(tid):
    """Look up a taxon record in ``taxon_data`` by ID, after resolving alias IDs.

    Raises ``KeyError`` if the resolved ID is not a key of ``taxon_data``.
    """
    resolved_tid = resolve_alias(tid)
    return taxon_data[resolved_tid]

## Count genomes for original taxa

In [20]:
# Group genome keys by their curated (genus, species) annotation.
# Every species starts with an empty set, so each one has an entry.
genomes_by_db_species = {}
for sp in species_names:
    genomes_by_db_species[sp] = set()

for genome_key, annotation in gset_data['annotations'].items():
    genus, species = annotation['tax_genus'], annotation['tax_species']
    genomes_by_db_species[genus, species].add(genome_key)

In [21]:
# Map each genome's key to the 'ncbi_taxid' value stored in its `extra` data,
# over all Genome rows returned by the query.
genome_orig_taxids = {}
for genome in session.query(Genome):
    genome_orig_taxids[genome.key] = genome.extra['ncbi_taxid']

In [22]:
# For each curated species, count its genomes per (alias-resolved) original
# taxonomy ID, in two separate tallies:
#   * filtered_genome_taxid_counts: genomes whose original taxonomy summary has
#     the same (genus, species) as the curated species;
#   * alt_genome_taxid_counts: all remaining genomes of that species.
filtered_genome_taxid_counts = {}
alt_genome_taxid_counts = {}

for sp, genome_keys in genomes_by_db_species.items():
    matching_counts = Counter()
    other_counts = Counter()
    filtered_genome_taxid_counts[sp] = matching_counts
    alt_genome_taxid_counts[sp] = other_counts

    for genome_key in genome_keys:
        orig_taxid = genome_orig_taxids[genome_key]
        orig_summary = orig_tax_summaries[orig_taxid]
        orig_species = (orig_summary['genus'], orig_summary['species'])

        # Pick the tally this genome belongs to, then count its resolved ID.
        tally = matching_counts if orig_species == sp else other_counts
        tally[resolve_alias(orig_taxid)] += 1

    # The matching IDs must be exactly the set loaded for this species.
    assert matching_counts.keys() == set(species_genome_taxids[sp])

## Write output

In [23]:
# Save the genome key -> original taxonomy ID mapping as JSON.
_genome_taxids_path = intermediate_out / 'genome-original-taxids.json'
with _genome_taxids_path.open('w') as f:
    json.dump(genome_orig_taxids, f)

In [24]:
# Build one record per curated species (in species_names order) holding both
# count tables, then save the list as JSON.
_genome_orig_taxid_counts_json = []
for genus, species in species_names:
    _genome_orig_taxid_counts_json.append({
        'curated_genus': genus,
        'curated_species': species,
        'filtered_counts': filtered_genome_taxid_counts[genus, species],
        'other_counts': alt_genome_taxid_counts[genus, species],
    })

_counts_path = intermediate_out / 'original-genome-taxid-counts.json'
with _counts_path.open('w') as f:
    json.dump(_genome_orig_taxid_counts_json, f)