# 201221 Perform migration

In [44]:
# Identifiers for the experiment directory and this notebook.
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201221'
# NOTE(review): the notebook title is "Perform migration" but this name says
# "compare-updated-genome-taxids" - possibly copied from another notebook; confirm.
nbname = datestr + '-compare-updated-genome-taxids'

In [45]:
import json
import re
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile

In [46]:
from tqdm import tqdm
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [47]:
from midas.db.models import Genome, ReferenceGenomeSet, Taxon, AnnotatedGenome
from midas.db.migrate import init_db

## File paths

In [48]:
# Locations of all inputs read by this notebook, keyed by a short name.
# The v1.1 archive is kept as a plain string; everything else is a Path.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    'taxa_othernames': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201109-extract-additional-taxonomy-data/taxon-othernames.json'),
    'taxa_additional': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201205-download-additional-taxa/'),
    'assembly_summaries': Path('tmp/assembly-summaries/'),
    'taxonomy_assignments': Path('../../data/processed/201031-database-v1.1-software-version-migration/201218-final-taxonomy-assignments/'),
}

## Load data

### Archive files

In [49]:
# The archive is a zip file read through an outer gzip stream.
gz_stream = GzipFile(infiles['v11_archive'])
archive_v11 = ZipFile(gz_stream)

# Show the archive's 'info' entry
archive_v11.read('info').decode()

'{"archive_version": "1.0"}'

In [50]:
# Parse the JSON entry describing the curated genome set (metadata plus per-genome annotations)
gset_data = json.loads(archive_v11.read('genome_sets/midas/assembly/curated'))

In [51]:
# Genome key -> assembly ID ('gb_id'), plus the keys in the order they are
# stored in the archive.
assembly_ids = {}
archive_keys_ordered = []

for name in archive_v11.namelist():
    # Only per-genome entries are relevant here
    if name.startswith('genomes/'):
        with archive_v11.open(name) as f:
            data = json.load(f)

        assembly_ids[data['key']] = data['gb_id']
        # Every genome in this archive is expected to be at version 1.1
        assert data['key_version'] == '1.1'
        archive_keys_ordered.append(data['key'])

In [52]:
# Version string given to every Genome created by make_genome()
# (genomes in the source archive are asserted to be at version '1.1').
NEXT_GENOME_VERSION = '1.2'

In [53]:
# Distinct (genus, species) name pairs used by the v1.1 genome annotations
species_names = {
    (annot['tax_genus'], annot['tax_species'])
    for annot in gset_data['annotations'].values()
}
len(species_names)

1438

### Taxa

In [54]:
# Taxon records keyed by taxid. The additional download is applied second, so
# its records win wherever a taxid appears in both files.
taxon_data = {}

for taxa_dir in (infiles['taxa'], infiles['taxa_additional']):
    with open(taxa_dir / 'taxa.json') as f:
        taxon_data.update((tdata['taxid'], tdata) for tdata in json.load(f))

In [55]:
with open(infiles['taxa'] / 'aka_taxids.json') as f:
    aka_raw = json.load(f)

# Alias taxid -> taxid it stands for. The keys read from JSON are converted to int.
aka_taxids = dict((int(alias), target) for alias, target in aka_raw.items())

### Taxon assignments

In [56]:
# Table of species/taxon matches, indexed by (genus, species)
species_match_df = pd.read_csv(
    infiles['taxonomy_assignments'] / '201218-species-taxon-matches.csv',
    index_col=['genus', 'species'],
)

In [57]:
# Matched taxid -> list of (genus, species) index entries matched to it
species_by_matched_taxid = {}

for sp, row in species_match_df.iterrows():
    bucket = species_by_matched_taxid.setdefault(row['taxid'], [])
    bucket.append(sp)

In [58]:
# Taxon assigned to each genome key (None for genomes left unassigned)
assignments_path = Path('tmp/genome-taxon-assignments.json')
genome_taxon_assignments = json.loads(assignments_path.read_text())

In [59]:
# All distinct taxids assigned to at least one genome (unassigned genomes ignored)
species_taxids = {
    taxid
    for taxid in genome_taxon_assignments.values()
    if taxid is not None
}
len(species_taxids)

1438

## Func defs

In [60]:
def reflow(text):
    """Collapse text onto a single line.

    Leading and trailing whitespace is dropped, and every internal run of
    whitespace (including line breaks) becomes one space.
    """
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

In [61]:
def resolve_alias(tid):
    """Return the taxid that ``tid`` is an alias of, or ``tid`` itself if it is not an alias.

    Only a single lookup in ``aka_taxids`` is performed; chains of aliases are not followed.
    """
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

def taxid_eq(tid1, tid2):
    """Check whether two taxids are equal once aliases have been resolved."""
    canonical1, canonical2 = resolve_alias(tid1), resolve_alias(tid2)
    return canonical1 == canonical2

In [62]:
def gettaxon(tid):
    """Look up a taxon record by ID, first resolving alias IDs.

    Raises ``KeyError`` if the resolved ID is not present in ``taxon_data``.
    """
    canonical = resolve_alias(tid)
    return taxon_data[canonical]

In [63]:
def getparent(taxon):
    """Return the parent record of a taxon, resolving aliases of the parent ID.

    ``taxon`` may be a taxon record or an integer taxid. Returns ``None`` when
    the record has no ``'parent_taxid'`` entry or when the parent cannot be
    found.
    """
    node = gettaxon(taxon) if isinstance(taxon, int) else taxon

    try:
        parent = gettaxon(node['parent_taxid'])
    except KeyError:
        parent = None

    return parent

In [64]:
def iter_ancestors(taxon, incself=False):
    """Yield the ancestors of a taxon, nearest first.

    ``taxon`` may be a taxon record or an integer taxid. With ``incself=True``
    the taxon itself is yielded before its ancestors. Iteration ends when
    ``getparent`` returns ``None``.
    """
    current = gettaxon(taxon) if isinstance(taxon, int) else taxon
    if not incself:
        current = getparent(current)

    while current is not None:
        yield current
        current = getparent(current)

## Migrate database

### Create database

In [65]:
dbfile = 'refseq_curated_1.2a_%s.db' % datestr

# Start from a clean slate: delete any database left over from a previous run.
# Done in Python rather than with `!rm` so that a fresh run (no file yet) does
# not print an error and the cell does not depend on a Unix shell.
if Path(dbfile).exists():
    Path(dbfile).unlink()

engine = create_engine('sqlite:///' + dbfile)
Session = sessionmaker(engine)

rm: cannot remove 'refseq_curated_1.2a_201221.db': No such file or directory


In [66]:
# Initialise the new database using the midas migration helper
# (presumably creates the schema - confirm in midas.db.migrate).
init_db(engine)

In [67]:
# Session used for all inserts and queries below
session = Session()

### Reference genome set

In [68]:
# Provenance: identify the v1.1 genome set this one was derived from
parent_info = {
    'key': gset_data['key'],
    'version': gset_data['key_version'],
    'description': gset_data['description'],
}

refset = ReferenceGenomeSet(
    key='midas/assembly/curated',
    version='1.2a1',
    name='refseq_curated_1.2a1',
    extra={
        'date_created': datestr,
        'parent': parent_info,
    },
)

In [69]:
# Human-readable description of this release; written as paragraphs here and
# collapsed to a single line by reflow() before being stored.
refset_description_text = '''
Migration of version 1.1 from v1.x library format/schema to schema for newest v2.2 software release.

This is an intermediate/development version not intended for full use, notably it does not yet have thresholds set.
Incremented minor version number because previous version number was not properly marked as alpha/pre-release, and also due to
the schema change.

Apart from migrating to the new schema, this version is updated to include the latest information downloaded from the NCBI
assembly and taxonomy databases (the initial version of the database was based on information downloaded in 2016).
Some additional curation was performed based on the changes to the NCBI data between then and now.
'''
refset.description = reflow(refset_description_text)

In [70]:
# Note on the parent (v1.1) version's numbering, stored alongside its metadata
parent_notes_text = '''
Also an intermediate development version not intended for use, but version number not marked as such.
A more appropriate version number would have been 1.1a1, but retroactively altering it would likely
just lead to more confusion.
'''
refset.extra['parent']['notes'] = reflow(parent_notes_text)

In [71]:
# Store the reference set first; the taxa and genome annotations created below refer to it
session.add(refset)
session.commit()

### Taxa

In [72]:
def make_taxon(data):
    """Create a ``Taxon`` in ``refset`` from a taxon record.

    The record's ``'scientific_name'``, ``'rank'`` and ``'taxid'`` fill the
    name, rank and NCBI ID; the complete record is kept under
    ``extra['ncbi_data']``. The returned object is not added to any session.
    """
    return Taxon(
        name=data['scientific_name'],
        rank=data['rank'],
        report=True,
        ncbi_id=data['taxid'],
        extra={'ncbi_data': data},
        reference_set=refset,
    )

In [73]:
# Species taxid -> taxid of its nearest genus-rank ancestor (None if there is none)
species_to_genus_map = {}

for taxid in species_taxids:
    if taxid is None:
        continue
    assert taxid not in aka_taxids

    # Nearest ancestor with genus rank, if any
    ancestor = next(
        (anc for anc in iter_ancestors(taxid) if anc['rank'] == 'genus'),
        None,
    )
    if ancestor is not None:
        species_to_genus_map[taxid] = ancestor['taxid']
        continue

    # No genus in the lineage: report it and record the species without a genus
    taxon = gettaxon(taxid)
    cnt = sum(taxid2 == taxid for taxid2 in genome_taxon_assignments.values())
    print('No genus ancestor found for %d %s (%d genomes)' % (taxid, taxon['scientific_name'], cnt))

    species_to_genus_map[taxid] = None

No genus ancestor found for 39492 [Eubacterium] siraeum (2 genomes)


In [74]:
# Created Taxon objects keyed by NCBI taxid (genus and species alike); filled in by the loops below
taxa = dict()

#### Genus taxa

In [75]:
# Create one Taxon per distinct genus referenced by the species
genus_taxids = set(species_to_genus_map.values())

for taxid in genus_taxids:
    # None marks species for which no genus ancestor was found
    if taxid is not None:
        data = gettaxon(taxid)
        assert data['taxid'] == taxid
        assert data['rank'] == 'genus'

        taxon = make_taxon(data)
        taxa[taxid] = taxon

        session.add(taxon)

#### Species taxa

In [76]:
# Create one Taxon per species and link it to its genus where one is known
for taxid in set(species_to_genus_map.keys()):
    data = gettaxon(taxid)
    assert data['taxid'] == taxid
    assert data['rank'] == 'species'

    taxon = make_taxon(data)
    taxa[taxid] = taxon

    # Genus taxon as parent (left unset for species without a genus ancestor)
    parent_id = species_to_genus_map[taxid]
    if parent_id is not None:
        taxon.parent = taxa[parent_id]

    # Species name(s) that corresponded to this taxon in the previous database
    # version; may be empty, or hold more than one name
    taxon.extra['v1_1_species_names'] = [
        {'genus': genus, 'species': species}
        for genus, species in species_by_matched_taxid.get(taxid, [])
    ]

    session.add(taxon)

In [77]:
# Persist the genus and species taxa added above
session.commit()

### Genomes

In [78]:
def make_genome(key):
    """Build a ``Genome`` for the v1.1 archive genome identified by ``key``.

    Combines the assembly summary JSON (from the ``assembly_summaries``
    directory, named by the genome's assembly ID) with the genome's entry and
    annotation from the v1.1 archive. Accession numbers are checked for
    consistency and an ``AssertionError`` is raised if any check fails. The
    returned object is not added to any session.
    """
    aid = assembly_ids[key]

    summary_path = infiles['assembly_summaries'] / ('%d.json' % aid)
    with open(summary_path) as f:
        summary = json.load(f)

    with archive_v11.open('genomes/' + key) as f:
        v11_data = json.load(f)

    # RefSeq accession: must be a GCF_ accession and match the last component of the key
    refseq_acc = summary['assemblyaccession']
    assert refseq_acc.startswith('GCF_')
    assert key.rsplit('/', 1)[-1] == refseq_acc

    # GenBank accession: must be a GCA_ accession recorded as identical to the RefSeq one
    synonym = summary['synonym']
    genbank_acc = synonym['genbank']
    assert genbank_acc.startswith('GCA_')
    assert synonym['refseq'] == refseq_acc
    assert synonym['similarity'] == 'identical'

    annotation = gset_data['annotations'][key]

    return Genome(
        key=key,
        version=NEXT_GENOME_VERSION,
        description='[%s] %s' % (refseq_acc, summary['organism']),
        entrez_db='assembly',
        entrez_id=aid,
        refseq_acc=refseq_acc,
        genbank_acc=genbank_acc,
        extra={
            'ncbi_taxid': int(summary['taxid']),
            # Taxonomy as annotated in the previous database version
            'v1_1_taxonomy': {
                'genus': annotation['tax_genus'],
                'species': annotation['tax_species'],
                'strain': annotation['tax_strain'],
            },
            'sequence_source': v11_data['meta']['sequence_source'],
        },
    )

In [79]:
# Genome key -> Genome object, for every genome carried over to the new database
genomes = {}

# Iterate in the order the entries are stored in the archive zip file:
# make_genome() reads each genome's entry from the archive, and seeking
# around within the outer gzip file is very slow.
for key in tqdm(archive_keys_ordered):
    taxid = genome_taxon_assignments[key]

    # Genomes without a taxon assignment are not migrated
    if taxid is not None:
        genome = make_genome(key)
        genomes[key] = genome

        session.add(genome)

100%|██████████| 50752/50752 [00:21<00:00, 2411.22it/s]


In [80]:
# Persist the genomes added above
session.commit()

### Genome annotations

In [81]:
# Annotate every migrated genome with its assigned species taxon
for key, genome in tqdm(genomes.items()):
    taxid = genome_taxon_assignments[key]
    assert taxid is not None
    taxon = taxa[taxid]

    session.add(AnnotatedGenome(
        genome=genome,
        reference_set=refset,
        primary_taxon=taxon,
        organism=taxon.name,
    ))

100%|██████████| 50741/50741 [00:05<00:00, 9469.56it/s] 


In [82]:
# Persist the genome annotations added above
session.commit()

## Sanity checks and basic statistics

### Genomes

In [83]:
# Genomes in the v1.1 archive vs. genomes stored in the new database; genomes
# with no taxon assignment were skipped during migration.
len(assembly_ids), session.query(Genome).count()

(50752, 50741)

In [84]:
# Every stored genome must have exactly one annotation, pointing at a species-rank taxon
for genome in tqdm(session.query(Genome)):
    annotations = genome.annotations
    assert len(annotations) == 1
    (ag,) = annotations
    assert ag.primary_taxon.rank == 'species'

50741it [00:15, 3376.66it/s]


### Taxa

In [85]:
# Number of species-rank and genus-rank taxa stored in the database
session.query(Taxon).filter_by(rank='species').count(), session.query(Taxon).filter_by(rank='genus').count()

(1438, 462)

In [86]:
# Consistency checks over every stored taxon, visited in name order.
# Hard invariants are asserted, with messages naming the offending taxon so a
# failure can be diagnosed without re-running; irregularities that are
# tolerated (missing parent, zero or several v1.1 names) are only printed.
for taxon in session.query(Taxon).order_by('name'):
    if taxon.rank == 'species':
        # No singletons
        n_genomes = taxon.genomes_primary.count()
        assert n_genomes >= 2, \
            'species %s %d has only %d genome(s)' % (taxon.name, taxon.ncbi_id, n_genomes)

        # Check parent is genus
        if taxon.parent is None:
            print('\n%s %d has no parent' % (taxon.name, taxon.ncbi_id))
        else:
            assert taxon.parent.rank == 'genus', \
                'parent of species %s %d has rank %r' % (taxon.name, taxon.ncbi_id, taxon.parent.rank)

        # Report species that do not map to exactly one v1.1 species name
        v11_names = taxon.extra['v1_1_species_names']
        if len(v11_names) != 1:
            print('\n%s %d has %d v1.1 names' % (taxon.name, taxon.ncbi_id, len(v11_names)))
            for n in v11_names:
                print('\t', (n['genus'], n['species']))

    elif taxon.rank == 'genus':
        # Genera carry no genomes directly, have no parent, and exist only
        # because at least one species refers to them
        n_genomes = taxon.genomes_primary.count()
        assert n_genomes == 0, \
            'genus %s %d has %d primary genome(s)' % (taxon.name, taxon.ncbi_id, n_genomes)
        assert taxon.parent is None, \
            'genus %s %d unexpectedly has a parent' % (taxon.name, taxon.ncbi_id)
        assert len(taxon.children) > 0, \
            'genus %s %d has no child taxa' % (taxon.name, taxon.ncbi_id)

    else:
        # Only species and genus taxa are created by this notebook
        raise AssertionError(
            'taxon %s %d has unexpected rank %r' % (taxon.name, taxon.ncbi_id, taxon.rank)
        )


Bacillus mycoides 1405 has 2 v1.1 names
	 ('Bacillus', 'mycoides')
	 ('Bacillus', 'weihenstephanensis')

Francisella orientalis 299583 has 0 v1.1 names

Pectobacterium brasiliense 180957 has 0 v1.1 names

Pectobacterium odoriferum 78398 has 0 v1.1 names

Pectobacterium parmentieri 1905730 has 0 v1.1 names

Pectobacterium parvum 2778550 has 0 v1.1 names

Photorhabdus laumondii 2218628 has 0 v1.1 names

Salinispora fenicalii 1137263 has 0 v1.1 names

Salinispora mooreana 999545 has 0 v1.1 names

Salinispora oceanensis 1050199 has 0 v1.1 names

Salinispora vitiensis 999544 has 0 v1.1 names

Streptomyces californicus 67351 has 2 v1.1 names
	 ('Streptomyces', 'californicus')
	 ('Streptomyces', 'puniceus')

Vibrio cholerae 666 has 2 v1.1 names
	 ('Vibrio', 'albensis')
	 ('Vibrio', 'cholerae')

Vibrio diabolicus 50719 has 0 v1.1 names

Xanthomonas hortorum 56454 has 0 v1.1 names

Xanthomonas phaseoli 1985254 has 0 v1.1 names

[Eubacterium] siraeum 39492 has no parent
