# 210707 Convert signature files

In [1]:
from pathlib import Path
import re

In [2]:
import numpy as np
import h5py as h5

In [3]:
from midas.signatures.hdf5 import HDF5Signatures
from midas.signatures import SignaturesMeta
from midas.kmers import KmerSpec
import midas.io.json as mjson
from gambit_legacy.io.signaturefile import SignatureFile

## Setup

In [4]:
# Date of this conversion in ISO format (YYYY-MM-DD); recorded in the revision metadata below.
TODAY = '2021-07-07'
# Compact YYMMDD form of the same date, used in output file names.
# Derived from TODAY so the two representations cannot drift apart.
DATESTR = TODAY[2:].replace('-', '')

In [5]:
# Database directories: the old signature files are read from here and the
# converted HDF5 files are written back into the same directories.
testdb_dir = Path('/home/jared/projects/midas/data/databases/testdb_210126/')
refseq_dir = Path('/home/jared/projects/midas/data/databases/refseq-curated/')

# Old-format signature files to read, keyed by database name.
src_files = dict(
    testdb=testdb_dir / 'testdb_210126.midas-signatures',
    refseq=refseq_dir / 'refseq_assemblies_ATGAC11_2_0.midas-signatures',
)

# HDF5 files to create, under the same keys; DATESTR is embedded in each file name.
dst_files = dict(
    testdb=testdb_dir / f'testdb_210126-signatures-{DATESTR}.h5',
    refseq=refseq_dir / f'refseq_assemblies_ATGAC11-2.0-{DATESTR}.h5',
)

## Open old files

In [6]:
# Open every old signature file in binary mode and wrap the handle in a
# SignatureFile reader. The handles are left open because later cells read
# metadata, IDs and signature data through these objects.
sigfiles = {
    name: SignatureFile(path.open('rb'))
    for name, path in src_files.items()
}

In [7]:
# Metadata read from each old signature file, keyed the same way as sigfiles.
src_metadata = {
    name: sigfile.get_metadata()
    for name, sigfile in sigfiles.items()
}

In [8]:
# Build a KmerSpec for each file from the 'kmerspec' entry of its old metadata.
kspecs = {
    name: mjson.from_json(meta['kmerspec'], KmerSpec)
    for name, meta in src_metadata.items()
}

## Metadata

In [9]:
# Per-database containers filled in by the sections below:
# new metadata objects and the ID arrays to write, both keyed by database name.
dst_metadata, dst_ids = {}, {}

### testdb

In [10]:
# Display the metadata read from the old testdb signature file.
src_metadata['testdb']

{'id': 'midas/test/testdb_210126',
 'version': '1.0',
 'name': 'testdb_210126',
 'date_created': '2021-01-26',
 'kmerspec': {'k': 8, 'prefix': 'ATG'},
 'description': 'Signatures for artificial genomes in testdb_210126 database'}

In [11]:
_meta = src_metadata['testdb']

# New metadata for testdb. The ID, version and name are written out explicitly
# here; only the description and the original creation date are taken from the
# old metadata.
dst_metadata['testdb'] = SignaturesMeta(
    id='gambit/testdb_210126',
    version='1.0',
    name='testdb_210126',
    id_attr='key',
    description=_meta['description'],
    extra={
        'date_created': _meta['date_created'],
        # Revision record for this conversion.
        'revision': {
            'num': 1,
            'date_created': TODAY,
            'description': 'Converted old version to new HDF5 format, changed ID values.',
        },
    },
)

Strip the leading `midas/testdb_210126/` prefix from each signature ID, keeping only the part after it:

In [12]:
# Strip the 'midas/testdb_210126/' prefix from every testdb signature ID and
# store the shortened IDs as a NumPy array.
_ids = []

for id_ in sigfiles['testdb'].ids:
    # fullmatch already anchors at both ends, so no explicit ^/$ is needed.
    m = re.fullmatch(r'midas/testdb_210126/(.*)', id_)
    if m is None:
        # Fail with the offending ID rather than an AttributeError on None.
        raise ValueError(f'testdb signature ID lacks the expected prefix: {id_!r}')
    _ids.append(m.group(1))

dst_ids['testdb'] = np.asarray(_ids)

### refseq

In [13]:
# Display the metadata read from the old refseq signature file.
src_metadata['refseq']

{'id': 'midas/refseq-bacterial-assemblies',
 'version': '2.0',
 'metadata_version': '1.0',
 'kmerspec': {'k': 11, 'prefix': 'ATGAC'},
 'name': 'RefSeq bacterial assemblies ATGAC/11',
 'datecreated': '2017-07-29'}

In [14]:
_meta = src_metadata['refseq']

# New metadata for refseq: ID, version and name are copied from the old
# metadata unchanged. Note the old file spells its creation-date key
# 'datecreated' (no underscore); it is stored under 'date_created' here.
dst_metadata['refseq'] = SignaturesMeta(
    id=_meta['id'],
    version=_meta['version'],
    name=_meta['name'],
    id_attr='refseq_acc',
    extra={
        'date_created': _meta['datecreated'],
        # Revision record for this conversion.
        'revision': {
            'num': 2,
            'date_created': TODAY,
            'description': 'Converted to new HDF5 format.',
        },
    },
)

In [15]:
# refseq signature IDs are carried over exactly as provided by the old file
# (no prefix stripping, unlike testdb).
dst_ids['refseq'] = sigfiles['refseq'].ids

## Write new files

In [16]:
# Write one HDF5 signature file per database. Each destination is opened in
# 'w' mode, so an existing file at that path is replaced.
for key in dst_files:
    dst_file = dst_files[key]
    print(key)
    # Read the signature data from the old file before opening the destination.
    data = sigfiles[key].get_array()
    with h5.File(dst_file, 'w') as h5file:
        HDF5Signatures.create(
            h5file,
            kspecs[key],
            data,
            dst_ids[key],
            dst_metadata[key],
        )

testdb
refseq
