# 210718 Create signatures file

In [None]:
from pathlib import Path
import json

In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.signatures import SignatureArray
from gambit_legacy.io.signaturefile import SignatureFile

## Setup

In [4]:
# Date stamp for this notebook: the ISO form is the single source, and the
# compact YYMMDD form is derived from it so the two cannot drift apart.
DATESTR_ISO = '2021-07-18'
DATESTR = DATESTR_ISO[2:].replace('-', '')

In [5]:
# Input files read by this notebook, keyed by a short name.
infiles = {
    'seq_urls': Path('data-intermediate/210717-find-sequence-urls/seq-urls.json'),
}

In [6]:
# Directory containing the individual signature files (`<index>.npy`) that are
# loaded in the "Signatures" section.
sigs_dir = Path('tmp/sigs')

In [7]:
# Destination of the combined signatures file; the file name embeds DATESTR.
# NOTE(review): this is an absolute, user-specific path - adjust it before
# running the notebook on another machine.
signatures_out = Path(f'/home/jared/projects/gambit/data/databases/refseq-curated/midas-1.1/refseq_curated-1.1beta-{DATESTR}.midas-signatures')

## Load data

### Accession numbers

In [8]:
# Parse the sequence-URL JSON (a list of records) and keep each record's
# accession, preserving the order of the file.
with infiles['seq_urls'].open() as fh:
    _items = json.load(fh)
accs = [record['accession'] for record in _items]

### Signatures

In [9]:
# One signature file is loaded per entry of `accs`. Files are named by 1-based
# position (`1.npy`, `2.npy`, ...), so the result has the same length as `accs`.
sigs = [
    np.load(sigs_dir / f'{i+1}.npy')
    for i in tqdm(range(len(accs)))
]

100%|██████████| 50752/50752 [00:21<00:00, 2391.76it/s]


In [10]:
# Pack the list of loaded arrays into a single SignatureArray with dtype 'u4'.
# Note: this rebinds `sigs` from a list to a SignatureArray.
sig_dtype = np.dtype('u4')
sigs = SignatureArray(sigs, dtype=sig_dtype)

## Write file

In [11]:
# Metadata to store in the new file. It is based on the metadata of the
# original file (found in 200727-find-overlaps/200727-calculate-pw-distances),
# with two differences:
#   * "date_created" is this notebook's date (the original had "2020-06-04");
#   * "id_attr" is new.
_genome_set = {
    "key": "midas/assembly/curated",
    "name": "refseq_curated_2020",
    "meta": {
        "date_created": "2020-05-26",
        "parent": {
            "key": "midas/assembly/curated",
            "key_version": "0.9"
        }
    },
    "description": "Created 2020-05-26 by filtering version 0.9 by inclusion in refseq/assembly/all 1.1",
    "key_version": "1.1"
}

_kmer_spec = {
    "k": 11,
    "prefix": "ATGAC"
}

metadata = {
    "date_created": DATESTR_ISO,
    "genome_set": _genome_set,
    "kmer_spec": _kmer_spec,
    "description": "Signatures for version 1.1 of curated genome set. Re-creation of original file created 200604",
    "id_attr": "refseq_acc",
}

In [12]:
# Write every signature to a single binary file, using the accessions as the
# signature IDs and attaching the metadata defined above.
with signatures_out.open('wb') as out_file:
    SignatureFile.write(out_file, sigs, ids=accs, metadata=metadata)

In [13]:
# Compress the written file with the system `gzip`; `-f` forces overwriting an
# existing output file. NOTE(review): the interpolated path is unquoted, so this
# assumes it contains no whitespace or shell metacharacters.
!gzip -f {signatures_out}