# 210719 Create signatures file

In [1]:
import json
from datetime import datetime
from gzip import GzipFile
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import h5py as h5

In [3]:
from gambit.db.models import *
from gambit.kmers import KmerSpec
from gambit.signatures import SignaturesMeta
from gambit.signatures.hdf5 import HDF5Signatures
from gambit_legacy.io.signaturefile import SignatureFile
import gambit.io.json as gjson

## Setup

In [4]:
# Date stamp (YYMMDD) identifying this notebook; it is also embedded in the output file name.
DATESTR = '210719'
NBNAME = DATESTR + '-create-signatures-file'
# ISO-8601 form of the same date, derived from DATESTR so the two can never disagree.
# strptime also rejects a malformed DATESTR right here instead of further down.
DATESTR_ISO = datetime.strptime(DATESTR, '%y%m%d').date().isoformat()

In [5]:
# Input files read by this notebook, keyed by role.
infiles = {
    # SQLite genomes database.
    'db': Path('/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta1/gambit-genomes-1.0b1-210719.db'),
    # Gzip-compressed signatures file in the legacy format.
    'signatures_in': Path('/home/jared/projects/gambit/data/databases/refseq-curated/midas-1.1/refseq_curated-1.1beta-210718.midas-signatures.gz'),
}

In [6]:
# Destination HDF5 file; its name embeds DATESTR.
signatures_out_path = Path(
    f'/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta1/gambit-signatures-1.0b1-{DATESTR}.h5'
)

## Load data

### Genomes database

In [7]:
# SQLite creates a new, empty database when the target file does not exist, which would
# only surface later as a confusing query failure. Fail fast with a clear message instead.
if not infiles['db'].is_file():
    raise FileNotFoundError(f'Genomes database not found: {infiles["db"]}')

# Engine for the SQLite genomes database.
engine = create_engine(f'sqlite:///{infiles["db"]}')

In [8]:
# Build a session factory bound to the genomes database engine, then open the
# single session used by the rest of the notebook.
session_factory = sessionmaker(bind=engine)
session = session_factory()

In [9]:
# Fetch the genome set stored in the database; .one() requires exactly one result.
genome_set_query = session.query(ReferenceGenomeSet)
gset = genome_set_query.one()

### Input signatures file

In [10]:
# Open the gzip-compressed input and hand the decompressing stream to the legacy reader.
gz_stream = GzipFile(infiles['signatures_in'])
signatures_in = SignatureFile(gz_stream)

In [11]:
# Map each ID in the input file to its position in that file.
acc_to_index = dict(
    (accession, position)
    for position, accession in enumerate(signatures_in.ids)
)

In [12]:
# Metadata stored in the input signatures file; its 'kmer_spec' entry is read below.
meta_in = signatures_in.get_metadata()

In [13]:
# Rebuild the KmerSpec from the 'kmer_spec' entry of the input file's metadata.
kmer_spec_json = meta_in['kmer_spec']
kspec = gjson.from_json(kmer_spec_json, KmerSpec)
kspec

KmerSpec(11, 'ATGAC')

## Find genome order

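Order genomes by taxonomy instead of by accession number this time. Assemble the full list of genomes by traversing the taxonomy tree depth-first, visiting children in order of NCBI ID (taxa without an NCBI ID come last, ordered by their internal `id`).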

In [14]:
def sort_key(taxon):
    """Return the key used to order sibling taxa.

    Taxa that have an NCBI ID sort first, ordered by that ID; taxa without one
    follow, ordered by their ``id`` attribute. The leading 1/2 tag keeps the two
    groups apart, so an NCBI ID is never compared against an ``id`` value.
    """
    if taxon.ncbi_id is not None:
        return (1, taxon.ncbi_id)
    return (2, taxon.id)

In [15]:
# Accumulator for genome accessions, filled in traversal order by add_genomes().
accs_ordered = list()

In [16]:
def add_genomes(taxon):
    """Append the accessions of every genome under ``taxon`` to ``accs_ordered``.

    The subtree is walked depth-first in pre-order: a taxon's own genomes are
    recorded first, followed by each child subtree in ``sort_key`` order.
    """
    pending = [taxon]
    while pending:
        node = pending.pop()
        accession_batch = [genome.refseq_acc for genome in node.genomes]
        accs_ordered.extend(accession_batch)
        # Push children reversed so the first child in sorted order is popped next.
        pending.extend(reversed(sorted(node.children, key=sort_key)))

In [17]:
# Start from an empty list so that re-running this cell cannot append every accession
# a second time. Cleared in place rather than rebound to a new list.
accs_ordered.clear()

# Visit the root taxa in sort_key order; add_genomes() handles each root's subtree.
for taxon in tqdm(sorted(gset.root_taxa(), key=sort_key)):
    add_genomes(taxon)

100%|██████████| 453/453 [00:15<00:00, 28.71it/s] 


In [18]:
# Sanity check: the traversal collected as many accessions as the genome set has genomes.
n_collected = len(accs_ordered)
n_in_genome_set = gset.genomes.count()
assert n_collected == n_in_genome_set

## Read signatures in order

In [19]:
# Translate the taxonomy-ordered accessions into positions within the input file.
# A missing accession raises KeyError.
indices = list(map(acc_to_index.__getitem__, accs_ordered))

In [20]:
# Read the signatures at the positions in `indices` from the input file.
sigs = signatures_in.get_array(indices)

In [21]:
# Display the element type of the loaded signatures.
sigs.dtype

dtype('uint32')

## Create output file

In [22]:
# Revision record nested inside the metadata's extra fields.
revision_info = {
    'num': 1,
    'date': DATESTR_ISO,
    'description': 'Initial revision.',
}
extra_fields = {
    'author': 'Jared Lumpe',
    'revision': revision_info,
}

# Descriptive metadata passed to HDF5Signatures.create() along with the signatures.
meta = SignaturesMeta(
    id='gambit/refseq-curated',
    version='1.0b1',
    name='GAMBIT curated RefSeq genomes version 1.0b1',
    description='Signatures for curated genome database derived from NCBI RefSeq. Beta version.',
    id_attr='refseq_acc',
    extra=extra_fields,
)

In [23]:
# Mode 'w' creates the output file, replacing any existing file at that path.
with h5.File(signatures_out_path, mode='w') as out_file:
    HDF5Signatures.create(out_file, kspec, sigs, accs_ordered, meta)