# 220916_2 Finalize signatures

This adds metadata to the database signatures file.

In [1]:
from pathlib import Path
import json
from datetime import date

In [2]:
from gambit.sigs import SignaturesMeta, load_signatures, dump_signatures, AnnotatedSignatures

## Setup

In [3]:
# Date prefix reused in the notebook name and in the output file name
# (presumably YYMMDD, i.e. 2022-09-16 -- TODO confirm).
DATESTR = '220916'
# Notebook identifier; also used as the name of this notebook's output directory.
NBNAME = f'{DATESTR}_2-finalize-signatures'

In [4]:
# Input files for this notebook, keyed by role.
# The only input is the signatures file stored under the
# 220831_2-signatures-and-dists intermediate-data directory.
infiles = {
    'signatures': Path('data-intermediate/220831_2-signatures-and-dists/signatures.h5'),
}

In [5]:
# Directory holding this notebook's outputs. It is created on the first run
# (including any missing parents) and silently reused on later runs.
processed_data = Path('data-processed') / NBNAME
processed_data.mkdir(parents=True, exist_ok=True)

# Output files for this notebook, keyed by role (same layout as `infiles`).
outfiles = {
    'signatures': processed_data / f'{DATESTR}-theiagen-candida-test.h5',
}

## Assign metadata

There are several `AbstractSignatureArray` subclasses which act as lists/arrays of GAMBIT signatures.
That means they support `len()`, iteration, and item access like `signatures[i]`.
These can be assigned an `ids` attribute which is an array of unique integer or string IDs for the signatures,
as well as a `meta` attribute which is an instance of `SignaturesMeta`.

The IDs are already fine because they came from the file names passed to the `signatures create` command,
which were the accession numbers with `.fna.gz` extension. We need to set the metadata, though.

In [6]:
meta = SignaturesMeta(
    # Identity of the signature set.
    id='theiagen/candida-test',
    name='Candida test database',
    version='1.0.0',
    # Names the column of the "genomes" table that the IDs in the signature file match to.
    # Planned for removal in a future version, after which everything will match to "key".
    id_attr='key',
    description='Signatures for theiagen/candida-test database.',
)

The `HDF5Signatures` class is an interface to an open HDF5 file which stores signatures in an efficient format.
It is returned by `load_signatures()`. Currently there isn't a way to modify existing ones, so we need to create
a new one from the old with `dump_signatures()`. It accepts an `AbstractSignatureArray` instance as its argument.
To add the metadata we will use the `AnnotatedSignatures` class which wraps another signature array, but allows
setting different IDs and metadata.

In [7]:
# Write the signatures to a new file with the metadata attached. The dump is done
# inside the `with` block, while the source signatures file is still open.
with load_signatures(infiles['signatures']) as src:
    # Wrap the loaded signatures, reusing their existing IDs and supplying the new metadata.
    out_sigs = AnnotatedSignatures(src, src.ids, meta)
    dump_signatures(outfiles['signatures'], out_sigs)