# 210719 Build genome database

In [1]:
from pathlib import Path
import json

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from gambit.db.models import *
from gambit.db.migrate import init_db

In [4]:
from gambit_legacy.db.models import models_2_3

## Setup

In [5]:
# Date stamp of this notebook, in compact and ISO 8601 form, and the notebook
# name derived from it (used to name this notebook's output directory).
DATESTR = '210719'
DATESTR_ISO = '2021-07-19'
NBNAME = f'{DATESTR}-build-genome-database'

In [6]:
# Input locations read by this notebook, keyed by a short name.
infiles = {
    # Directory holding taxa.csv and genome-assignments.json.
    'taxonomy': Path('data-intermediate/210718-compile-edits/'),
    # Legacy SQLite database that genome records are copied from.
    'db_v12': Path('/home/jared/projects/gambit/data/databases/refseq-curated/midas-1.2/refseq-curated-1.2a-201221.db'),
    # Per-taxon thresholds, indexed by the same ids as taxa.csv.
    'thresholds': Path('data-intermediate/210719-set-thresholds/thresholds.csv'),
    # Genome table with an 'accession' column.
    'genomes': Path('../../data/intermediate/210303-database-v2-overlaps/210303-format-data/genomes-v1.1.csv'),
}

In [7]:
# Directory for intermediate files written by this notebook.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True creates 'data-intermediate' on a fresh checkout; exist_ok=True
# makes re-runs a no-op without a separate (racy) existence check.
intermediate_out.mkdir(parents=True, exist_ok=True)

In [8]:
# Output SQLite database file; the file name carries this notebook's date stamp.
db_out = (
    Path('/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta1')
    / f'gambit-genomes-1.0b1-{DATESTR}.db'
)

## Load data

### Genomes

In [9]:
# Genome table; its 'accession' column is paired with the taxon assignments below.
genomes_df = pd.read_csv(infiles['genomes'])

In [10]:
# Number of rows (genomes) in the genome table.
len(genomes_df)

50752

### Updated taxonomy

In [11]:
# Taxon table from the taxonomy edits, indexed by its 'id' column.
taxa = pd.read_csv(infiles['taxonomy'] / 'taxa.csv').set_index('id')

In [12]:
# Taxon assignment for each genome, as an array parallel to the rows of genomes_df.
_assignments_file = infiles['taxonomy'] / 'genome-assignments.json'
genome_assignments_array = json.loads(_assignments_file.read_text())

In [13]:
# The assignment array must line up one-to-one with the genome table rows.
assert len(genome_assignments_array) == len(genomes_df)

# Map accession -> taxon id, leaving out genomes whose assignment is 0.
genome_assignments = dict(
    (acc, tid)
    for acc, tid in zip(genomes_df['accession'], genome_assignments_array)
    if tid != 0
)

In [14]:
# Number of genomes with a taxon assignment (assignment value other than 0).
len(genome_assignments)

48224

### Thresholds

In [15]:
# Per-taxon thresholds, indexed by the 'id' column.
thresholds_df = pd.read_csv(infiles['thresholds']).set_index('id')

# Thresholds must list exactly the same taxon ids, in the same order, as the taxon table.
assert (taxa.index == thresholds_df.index).all()

In [16]:
# Append the threshold columns to the taxon table (the indexes were checked to match).
# NOTE: this rebinds `taxa`, so re-running this cell without reloading the taxon
# table appends the threshold columns a second time.
taxa = pd.concat((taxa, thresholds_df), axis='columns')

### MIDAS 1.2a Database

In [17]:
# SQLAlchemy engine for the legacy SQLite database.
engine_v12 = create_engine('sqlite:///' + str(infiles['db_v12']))

In [18]:
# ORM session bound to the legacy database.
session_v12 = sessionmaker(bind=engine_v12)()

In [19]:
# The legacy database is expected to hold exactly one genome set;
# Query.one() raises if there are zero or several.
gset_v12 = session_v12.query(models_2_3.ReferenceGenomeSet).one()

In [20]:
# Show the loaded attributes of the legacy genome set.
vars(gset_v12)

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7ffada338ca0>,
 'key': 'midas/assembly/curated',
 'name': 'refseq_curated_1.2a1',
 'extra': {'date_created': '201221',
  'parent': {'key': 'midas/assembly/curated',
   'version': '1.1',
   'description': 'Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1',
   'notes': 'Also an intermediate development version not intended for use, but version number not marked as such. A more appropriate version number would have been 1.1a1, but retroactively altering it would likely just lead to more confusion.'}},
 'id': 1,
 'version': '1.2a1',
 'description': 'Migration of version 1.1 from v1.x library format/schema to schema for newest v2.2 software release. This is an intermediate/development version not intended for full use, notably it does not yet have thresholds set. Incremented minor version number because previous version number was not properly marked as alpha/pre-release, and also due to the s

In [21]:
# Number of genomes in the legacy genome set.
gset_v12.genomes.count()

50741

## Init

In [22]:
# Remove any output database left by a previous run so the new one is built from scratch.
if db_out.is_file():
    db_out.unlink()

In [23]:
# SQLAlchemy engine for the new output database.
engine = create_engine('sqlite:///' + str(db_out))

In [24]:
# Initialize the new database through gambit's migration helper
# (presumably creates the current schema — confirm against gambit.db.migrate).
init_db(engine)

In [25]:
# Session factory and working session for the new database.
Session = sessionmaker(bind=engine)
session = Session()

## Genome set

In [26]:
# Genome set record for the new database; `extra` carries authorship and
# revision metadata, dated with this notebook's ISO date.
gset = ReferenceGenomeSet(
    key='gambit/refseq-curated',
    version='1.0b1',
    name='GAMBIT curated RefSeq genomes version 1.0b1',
    description='Curated genome database derived from NCBI RefSeq. Beta version.',
    extra={
        'author': 'Jared Lumpe',
        'revision': {
            'num': 1,
            'date': DATESTR_ISO,
            'description': 'Initial revision.',
        },
    },
)

In [27]:
# Persist the new genome set before taxa and genomes are attached to it.
session.add(gset)
session.commit()

## Taxa

In [28]:
# Maps taxon ids from the input taxon table to the Taxon objects created for them.
old_id_to_taxon = {}

In [29]:
# Create one Taxon per row of the taxon table. Parent links are set in a
# separate pass, once every taxon object exists.
for row in taxa.itertuples():
    # Missing values in the table become None instead of NaN.
    ncbi_id = int(row.ncbi_id) if pd.notnull(row.ncbi_id) else None
    rank = row.rank if pd.notnull(row.rank) else None

    taxon = Taxon(
        genome_set=gset,
        key=row.key,
        ncbi_id=ncbi_id,
        name=row.name,
        rank=rank,
        report=row.report,
        distance_threshold=row.final_threshold,
    )
    session.add(taxon)
    old_id_to_taxon[row.Index] = taxon

In [30]:
# Link each taxon to its parent, using the ids from the input taxon table.
for row in taxa.itertuples():
    # Rows without a parent id have no parent to link.
    if pd.isnull(row.parent_id):
        continue
    taxon = old_id_to_taxon[row.Index]
    parent = old_id_to_taxon[int(row.parent_id)]
    taxon.parent = parent

In [31]:
# Persist the taxa and their parent links.
session.commit()

## Genomes

In [32]:
from gambit.query.classify import reportable_taxon

In [33]:
# Copy every genome of the legacy set that has a taxon assignment into the new
# database, attaching it to the new genome set and its assigned taxon.
for old in tqdm(gset_v12.genomes, total=gset_v12.genomes.count()):
    old_genome = old.genome
    acc = old_genome.refseq_acc

    # Genomes without an assignment are left out of the new database.
    if acc not in genome_assignments:
        continue

    taxon = old_id_to_taxon[genome_assignments[acc]]
    # NOTE(review): assumes reportable_taxon() always finds a taxon here; if it
    # can return None, `reportable.name` below would fail — confirm.
    reportable = reportable_taxon(taxon)

    # Legacy descriptions are expected to begin with the accession in brackets.
    assert old_genome.description.startswith('[' + acc + '] ')

    genome = Genome(
        key=old_genome.key,
        description=old_genome.description,
        ncbi_db=old_genome.entrez_db,
        ncbi_id=old_genome.entrez_id,
        refseq_acc=acc,
        genbank_acc=old_genome.genbank_acc,
    )
    session.add(genome)

    # The organism label is the name of the reportable taxon, not necessarily
    # of the assigned taxon itself.
    annotated = AnnotatedGenome(
        genome_set=gset,
        genome=genome,
        taxon=taxon,
        organism=reportable.name,
    )
    session.add(annotated)

100%|██████████| 50741/50741 [00:38<00:00, 1307.19it/s]


In [34]:
# Every genome with a taxon assignment should have been found in the legacy
# database and added to the new genome set.
assert gset.genomes.count() == len(genome_assignments)

In [35]:
# Persist the genomes and their annotations.
session.commit()

## Save additional data

In [36]:
# Save the mapping from input-table taxon ids to ids in the new database,
# as a JSON array of [old_id, new_id] pairs.
_data = list(
    (old_id, taxon.id)
    for old_id, taxon in old_id_to_taxon.items()
)

with open(intermediate_out / 'old-tid-to-new.json', 'w') as f:
    f.write(json.dumps(_data))