# 211111 gambit-refseq-curated-1.0b2

In [1]:
import os
import shutil
from pathlib import Path

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from gambit.db.fromfile import file_sessionmaker
from gambit.db.models import *
import gambit.io.json as gjson

## Setup

In [4]:
# Release date in two formats: the compact form is used in the output file name,
# the ISO 8601 form is recorded as the revision date in the genome set metadata.
DATESTR = '211111'
DATESTR_LONG = '2021-11-11'

In [5]:
# Version string of the database release produced by this notebook; it is used in
# the output file name and written to the genome set's version and name.
NEW_VERSION = '1.0b2'

In [6]:
# Directory (relative to the working directory) that receives the new database file.
# exist_ok=True makes this cell safe to re-run.
archive_dir = Path('archive')
archive_dir.mkdir(exist_ok=True)

In [7]:
# Input files. The previous release's database defaults to its location on the
# original author's machine; set the GAMBIT_DB_OLD environment variable to point at
# a different copy without having to edit the notebook.
infiles = dict(
    db_old=Path(os.environ.get(
        'GAMBIT_DB_OLD',
        '/home/jared/s/gambit/data/databases/refseq-curated/1.0-beta1/gambit-genomes-1.0b1-210719.db',
    )),
)

In [8]:
# Output files. The new database is written into the archive directory, with the
# release version and the compact date string embedded in its file name.
outfiles = {
    'db_new': archive_dir.joinpath(f'gambit-genomes-{NEW_VERSION}-{DATESTR}.db'),
}

## Code

In [9]:
def print_tax_tree(taxon, indent='  ', _indent=''):
    """Print the subtree rooted at ``taxon``, one line per taxon as ``<id> <name>``.

    Taxa are listed depth-first in pre-order: a taxon is printed, then the complete
    subtree of each of its children, in the order ``children`` yields them. Each
    level below the starting taxon is prefixed with one more copy of ``indent``.

    :param taxon: Starting taxon; must expose ``id``, ``name`` and ``children``.
    :param indent: String added to the line prefix for every level of depth.
    :param _indent: Line prefix for the starting taxon itself.
    """
    # Explicit stack of (node, line prefix) pairs; the top of the stack is printed next.
    pending = [(taxon, _indent)]
    while pending:
        node, prefix = pending.pop()
        print(prefix, node.id, ' ', node.name, sep='')
        # Push children in reverse so the first child is popped (and printed) first.
        pending.extend(
            (child, prefix + indent) for child in reversed(list(node.children))
        )

## Copy DB and open

In [10]:
# Copy the previous release's database to the new file name. The engine created
# below is opened on this copy, so the original file is not modified.
shutil.copy(infiles['db_old'], outfiles['db_new'])

PosixPath('archive/gambit-genomes-1.0b2-211111.db')

In [11]:
# Open the copied SQLite database file and build a session factory bound to it.
engine = create_engine(f"sqlite:///{outfiles['db_new']}")
Session = sessionmaker(engine)

In [12]:
# Session used for the queries and updates in the cells below.
session = Session()

In [13]:
# The genome set that owns the taxa; .one() raises unless the database contains
# exactly one ReferenceGenomeSet row.
gset = session.query(ReferenceGenomeSet).one()

## Inspect

In [14]:
# The existing combined E. coli/Shigella taxon, looked up by name (.one() requires
# exactly one match). It is the root of the subtree edited in this notebook.
root = session.query(Taxon).filter_by(name='Escherichia coli/Shigella').one()

In [15]:
# Show the taxon's raw attribute dict (this includes SQLAlchemy's internal
# `_sa_instance_state` entry alongside the loaded column values).
vars(root)

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7f70100cc4f0>,
 'key': 'gambit/refseq-curated/Escherichia_coli_Shigella',
 'name': 'Escherichia coli/Shigella',
 'description': None,
 'report': True,
 'parent_id': 453,
 'id': 1862,
 'rank': 'species',
 'distance_threshold': 0.6825377941131592,
 'genome_set_id': 1,
 'ncbi_id': None}

In [16]:
# Show the E. coli/Shigella subtree as it is before any modification.
print_tax_tree(root)

1862 Escherichia coli/Shigella
  1863 Shigella
    1909 Shigella boydii
      1913 Shigella boydii subgroup 1
      1914 Shigella boydii subgroup 2
    1910 Shigella dysenteriae
      1915 Shigella dysenteriae subgroup 1
      1916 Shigella dysenteriae subgroup 2
    1911 Shigella flexneri
    1912 Shigella sonnei
  1892 Escherichia coli subgroup 1
  1893 Escherichia coli subgroup 2
  1894 Escherichia coli subgroup 3


In [17]:
# Select the E. coli subgroup taxa that sit directly under the combined root.
# The match is on the full 'Escherichia coli subgroup' prefix rather than just
# 'Escherichia coli': the new 'Escherichia coli' taxon is later attached to root,
# and a bare-prefix match would select it too if this cell were re-run, after which
# the re-parenting loop would make it its own parent.
ecoli_subgroups = [c for c in root.children if c.name.startswith('Escherichia coli subgroup')]
ecoli_subgroups

[<Taxon:1892 'Escherichia coli subgroup 1'>,
 <Taxon:1893 'Escherichia coli subgroup 2'>,
 <Taxon:1894 'Escherichia coli subgroup 3'>]

## Update database

### Create E. coli taxon

In [18]:
# New species-rank taxon for E. coli, belonging to the same genome set as the
# existing taxa. No parent is set here; it is attached to the tree in the
# "Insert into tree" cells below.
ecoli = Taxon(
    genome_set=gset,
    key='gambit/refseq-curated/Escherichia_coli',  # same key format as the existing E. coli/Shigella taxon
    name='Escherichia coli',
    ncbi_id=562,  # NOTE(review): presumably the NCBI Taxonomy ID of Escherichia coli — confirm
    rank='species',
    distance_threshold=0,  # NOTE(review): presumably so nothing matches this taxon directly, only via its subgroups — confirm
    report=True,
)

In [19]:
# Persist the new taxon before wiring it into the tree.
session.add(ecoli)
session.commit()

### Insert into tree

In [20]:
# Attach the new E. coli taxon directly under the combined E. coli/Shigella taxon.
ecoli.parent = root

In [21]:
# Move each E. coli subgroup from the combined taxon to the new E. coli taxon.
for t in ecoli_subgroups:
    t.parent = ecoli

In [22]:
# Persist the re-parenting of the new taxon and the subgroups.
session.commit()

In [23]:
# Show the subtree again to confirm the new taxon sits between root and the subgroups.
print_tax_tree(root)

1862 Escherichia coli/Shigella
  1863 Shigella
    1909 Shigella boydii
      1913 Shigella boydii subgroup 1
      1914 Shigella boydii subgroup 2
    1910 Shigella dysenteriae
      1915 Shigella dysenteriae subgroup 1
      1916 Shigella dysenteriae subgroup 2
    1911 Shigella flexneri
    1912 Shigella sonnei
  1917 Escherichia coli
    1892 Escherichia coli subgroup 1
    1893 Escherichia coli subgroup 2
    1894 Escherichia coli subgroup 3


### Update E. coli/Shigella taxon attributes

In [24]:
# Clear the rank of the combined taxon (it was 'species').
# NOTE(review): presumably because the new E. coli taxon now carries the species rank — confirm.
root.rank = None

In [25]:
# Persist the rank change.
session.commit()

### Update genome set

In [26]:
# Human-readable changelog that is stored in the genome set's `extra` metadata.
# The taxon IDs are interpolated from the objects created and selected earlier.
NOTES = f'''
Changes from 1.0b1:
created new E. coli taxon (id={ecoli.id}), inserted in taxonomy tree between E. coli/Shigella (id={root.id})
and E. coli subgroup taxa (ids={",".join(str(t.id) for t in ecoli_subgroups)}).
Changed rank of E. coli/Shigella from 'species' to None.
'''

# Collapse every run of whitespace (including the line breaks above) into a single
# space so the notes are stored as one line.
NOTES = ' '.join(NOTES.split())

In [27]:
# Stamp the genome set with the new release version and a matching display name.
gset.version = NEW_VERSION
gset.name = f'GAMBIT curated RefSeq genomes version {NEW_VERSION}'

# Replace the free-form metadata wholesale with the author, revision info and changelog.
# NOTE(review): any keys previously stored in `extra` are not carried over — confirm intended.
gset.extra = {
    'author': 'Jared Lumpe',
    'revision': {
        'num': 1,
        'date': DATESTR_LONG,
        'description': 'Initial revision.',
    },
    'notes': NOTES,
}

In [28]:
# Review the genome set's updated attributes before committing them.
vars(gset)

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7f7010061400>,
 'version': '1.0b2',
 'name': 'GAMBIT curated RefSeq genomes version 1.0b2',
 'extra': {'author': 'Jared Lumpe',
  'revision': {'num': 1,
   'date': '2021-11-11',
   'description': 'Initial revision.'},
  'notes': "Changes from 1.0b1: created new E. coli taxon (id=1917), inserted in taxonomy tree between E. coli/Shigella (id=1862) and E. coli subgroup taxa (ids=1892,1893,1894). Changed rank of E. coli/Shigella from 'species' to None."}}

In [29]:
# Persist the updated genome set metadata.
session.commit()