# 220916_1 Create DB

This creates the sqlite database file for the Candida DB.

In [1]:
from pathlib import Path
import json
from datetime import date

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from gambit.db import ReferenceGenomeSet, Genome, AnnotatedGenome, Taxon
from gambit.db.migrate import init_db

from entrez_tools.db.assembly import format_summary_meta

## Setup

In [4]:
# Date stamp for this notebook (looks like YYMMDD); reused in output file names.
DATESTR = '220916'
# Notebook identifier: the date stamp followed by a sequence number and slug.
NBNAME = DATESTR + '_1-create-db'

In [5]:
# Locations of every input this notebook reads, keyed by a short descriptive name.
infiles = {
    # CSV tables
    'genomes_table': Path('data-src/genomes.csv'),
    'taxa_table': Path('data-intermediate/220831_3-find-diameters/species-data.csv'),
    # Directory of genome FASTA files (absolute, machine-specific path)
    'fasta_dir': Path('/home/jared/gambit/data/genomes/220831-candida/fasta'),
    # Directories of per-record esummary JSON files
    'genome_esummary_dir': Path('data-intermediate/220831_1-download-genomes/esummary'),
    'taxon_esummary_dir': Path('data-intermediate/220912-download-taxa/taxa'),
}

In [6]:
# Output directory for this notebook, created (with parents) if it does not exist yet.
processed_data = Path('data-processed', NBNAME)
processed_data.mkdir(parents=True, exist_ok=True)

# Files written by this notebook, keyed by a short descriptive name.
outfiles = {
    'db': processed_data.joinpath(f'{DATESTR}-theiagen-candida-test.db'),
}

## Load data

In [7]:
# Read the genome and taxon tables (in that order) into data frames.
genomes_df, taxa_df = (
    pd.read_csv(infiles[table_key])
    for table_key in ('genomes_table', 'taxa_table')
)

In [8]:
# Parsed esummary JSON for each genome, keyed by the genome's NCBI UID.
genome_summaries = {}

for uid in genomes_df['ncbi_uid']:
    # One JSON file per genome, named after its UID.
    summary_file = infiles['genome_esummary_dir'] / f'{uid}.json'
    genome_summaries[uid] = json.loads(summary_file.read_text())

In [9]:
# Parsed esummary JSON for each distinct taxon referenced by the genomes table,
# keyed by NCBI taxonomy ID.
taxon_summaries = dict()

for taxid in set(genomes_df['ncbi_taxid']):
    with open(infiles['taxon_esummary_dir'] / f'{taxid}.json') as f:
        # Key by the taxon ID being loaded. Using `uid` here (a leftover loop
        # variable from loading the genome summaries) would store every
        # summary under the same key and keep only the last one.
        taxon_summaries[taxid] = json.load(f)

## Create database

The database schema has gone through a lot of updates but still carries a lot of weird baggage from when it was first developed in 2016. Back then we hadn't settled on the one-SQLite-file-per-database format, so it's set up to allow multiple "GAMBIT databases" in a single SQL database. I'd like to simplify things in a future version now that I have the time.

In [10]:
# Remove any database file left over from a previous run so we start from scratch.
if outfiles['db'].is_file():
    outfiles['db'].unlink()

# SQLAlchemy engine pointing at the (not yet existing) SQLite file.
engine = create_engine(f"sqlite:///{outfiles['db']}")

In [11]:
# Initialize the fresh SQLite file through GAMBIT's migration helper
# (presumably this creates the schema tables -- confirm in gambit.db.migrate).
init_db(engine)

In [12]:
# Session factory bound to the new database, and the single session used below.
Session = sessionmaker(bind=engine)
session = Session()

### Genome set

`ReferenceGenomeSet` basically encompasses the concept of a "GAMBIT database," but because it is just a row in a table, you can technically have more than one. The CLI commands expect there to be just one per SQLite file, though.

In [13]:
gset = ReferenceGenomeSet(
    # Meant to be a unique string ID: no other database file should share the
    # same key and version. The intended format is hierarchical, with the
    # "owner" organization coming first, which keeps this distinct from
    # anyone else's "candida-test" database.
    key='theiagen/candida-test',
    version='1.0.0',
    name='Fungal test database',
    description='Test database containing several Candida species.',
    # Arbitrary JSON data; there is currently no set schema for it.
    extra={
        'author': 'Jared Lumpe',
        'date': date.today().isoformat(),
    },
)

In [14]:
# Stage the genome set in the session and write it to the database.
session.add(gset)
session.commit()

### Taxa

In [15]:
# Taxon objects keyed by NCBI taxonomy ID, so genomes can be linked to them later.
taxa = {}

for rec in taxa_df.itertuples():
    # Arbitrary JSON-formatted metadata; records the actual name used by NCBI.
    taxon_extra = {'ncbi_name': rec.ncbi_name}

    taxon = Taxon(
        genome_set=gset,
        # Arbitrary unique string ID. These are all regular NCBI taxa.
        key='ncbi/{}'.format(rec.ncbi_taxid),
        # Name as supplied in the species table rather than NCBI's current
        # name; NCBI reassigned several of these to other genera.
        name=rec.name,
        rank='species',
        distance_threshold=rec.diameter,
        ncbi_id=rec.ncbi_taxid,
        extra=taxon_extra,
    )

    session.add(taxon)
    taxa[rec.ncbi_taxid] = taxon

# Write all taxa in a single transaction.
session.commit()

### Genomes

In [16]:
for rec in genomes_df.itertuples():
    # Look up the previously loaded esummary data and the taxon for this genome.
    esummary = genome_summaries[rec.ncbi_uid]
    stats = format_summary_meta(esummary['meta'])
    species = taxa[rec.ncbi_taxid]

    accession = rec.genbank_acc

    # The genome record itself; the GenBank accession doubles as its key.
    genome = Genome(
        key=accession,
        description='[{}] {}'.format(accession, esummary['organism']),
        ncbi_db='assembly',
        ncbi_id=rec.ncbi_uid,
        genbank_acc=accession,
        # More JSON metadata
        extra={'length': stats['total_length']},
    )
    session.add(genome)

    # Ties the genome to this genome set and to its taxon.
    annotated = AnnotatedGenome(
        genome=genome,
        genome_set=gset,
        taxon=species,
        organism=species.name,
    )
    session.add(annotated)

# Write all genomes in a single transaction.
session.commit()