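# 211116 gambit-refseq-curated-1.0b2-rev2

Creates revision 2 of the `gambit/refseq-curated` 1.0b2 genome database: copies the rev1 database file, updates the `organism` column of genomes in the E. coli subgroup taxa to match their reportable taxon, and records the new revision in the genome set metadata.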

In [1]:
import shutil
from collections import Counter
from datetime import date
from pathlib import Path

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from gambit.db.models import *
from gambit.db.models import reportable_taxon
import gambit.io.json as gjson

## Setup

In [4]:
# Single source of truth for the revision date. Both string forms below are
# derived from it so they cannot drift apart.
REVISION_DATE = date(2021, 11, 16)

DATESTR = REVISION_DATE.strftime('%y%m%d')  # '211116', used in the output file name
DATESTR_LONG = REVISION_DATE.isoformat()  # '2021-11-16', stored in the revision metadata

In [5]:
# Directory holding the database files this notebook reads and writes.
archive_dir = Path('archive')
# Create it if missing; exist_ok avoids an error when it is already there.
archive_dir.mkdir(exist_ok=True)

In [6]:
# Input files, keyed by a short label. The rev1 database is the starting point.
infiles = {
    'db_rev1': archive_dir / 'gambit-genomes-1.0b2-211111.db',
}

In [7]:
# Output files, keyed by a short label. The rev2 database name embeds DATESTR.
rev2_filename = f'gambit-genomes-1.0b2-rev2-{DATESTR}.db'
outfiles = {
    'db_rev2': archive_dir / rev2_filename,
}

## Copy DB and open

In [8]:
# Work on a copy so the rev1 database file itself is never modified.
rev1_path = infiles['db_rev1']
rev2_path = outfiles['db_rev2']
shutil.copy(rev1_path, rev2_path)

PosixPath('archive/gambit-genomes-1.0b2-rev2-211116.db')

In [9]:
# Connect to the copied (rev2) SQLite file and build a session factory bound to it.
db_url = f"sqlite:///{outfiles['db_rev2']}"
engine = create_engine(db_url)
Session = sessionmaker(bind=engine)

In [10]:
# Open the session used for all queries and updates below.
session = Session()

In [11]:
# The database is expected to hold exactly one genome set; .one() raises otherwise.
gset = session.query(ReferenceGenomeSet).one()

## Check for all mismatches

In [12]:
# Tally every genome whose stored organism name differs from the name of its
# reportable taxon, grouped by (taxon, stored organism).
counts = Counter(
    (genome.taxon, genome.organism)
    for genome in session.query(AnnotatedGenome)
    if genome.organism != reportable_taxon(genome.taxon).name
)

counts

Counter({(<Taxon:1893 'Escherichia coli subgroup 2'>,
          'Escherichia coli/Shigella'): 2204,
         (<Taxon:1894 'Escherichia coli subgroup 3'>,
          'Escherichia coli/Shigella'): 508,
         (<Taxon:1892 'Escherichia coli subgroup 1'>,
          'Escherichia coli/Shigella'): 1487})

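Only the expected mismatches appear: genomes in the three E. coli subgroup taxa (ids 1892, 1893, 1894) still have the organism `Escherichia coli/Shigella`.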

## Update

### `genome.organism` column

In [13]:
# Taxon ids of the three E. coli subgroups flagged by the mismatch check.
ecoli_subgroup_ids = (1892, 1893, 1894)

for subgroup_id in ecoli_subgroup_ids:
    subgroup = session.query(Taxon).get(subgroup_id)
    # Overwrite the stale organism name on every genome assigned to this subgroup.
    for member in subgroup.genomes:
        member.organism = 'Escherichia coli'

In [14]:
# Re-run the consistency check: every genome's organism must now equal the
# name of its reportable taxon.
assert all(
    genome.organism == reportable_taxon(genome.taxon).name
    for genome in session.query(AnnotatedGenome)
)

### Update genome set

In [15]:
# Show the genome set's current attributes (notably `extra`) before editing them.
vars(gset)

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7f85a432bfa0>,
 'version': '1.0b2',
 'key': 'gambit/refseq-curated',
 'description': 'Curated genome database derived from NCBI RefSeq. Beta version.',
 'id': 1,
 'name': 'GAMBIT curated RefSeq genomes version 1.0b2',
 'extra': {'author': 'Jared Lumpe',
  'revision': {'num': 1,
   'date': '2021-11-11',
   'description': 'Initial revision.'},
  'notes': "Changes from 1.0b1: created new E. coli taxon (id=1917), inserted in taxonomy tree between E. coli/Shigella (id=1862) and E. coli subgroup taxa (ids=1892,1893,1894). Changed rank of E. coli/Shigella from 'species' to None."}}

In [18]:
# Metadata describing this revision; it replaces the existing 'revision' entry.
revision_info = {
    'num': 2,
    'date': DATESTR_LONG,
    'description': 'Update organism column of genomes which have new E. Coli taxon (id=1917) as their reportable taxon.',
}

# Build a new dict and assign it rather than mutating gset.extra in place
# (presumably so SQLAlchemy registers the change to the column; TODO confirm).
extra = {**gset.extra, 'revision': revision_info}

gset.extra = extra

### Done

In [19]:
# Write the organism and genome-set metadata changes made above to the database.
session.commit()