# 211015 Gambit gold standard genomes

In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.signatures.calc import calc_file_signatures
from gambit.io.seq import SequenceFile
from gambit.signatures import SignatureArray
from gambit.signatures.hdf5 import HDF5Signatures
from gambit.metric import jaccarddist_matrix

## Setup

In [4]:
# Date stamp of this notebook, and the identifier derived from it that names
# the notebook's intermediate output directory.
DATESTR = '211015'
NBNAME = f'{DATESTR}-gambit-gsg'

In [5]:
# Label of the genome set analysed here; used below to name both the signature
# cache directory and the output HDF5 file.
DATASET = '200726_gold_standard'

In [6]:
# Scratch area for cached per-parameter signature files.
tmpdir = Path('tmp')

# parents=True so the nested tmp/gambit/<DATASET> directory is created on a
# fresh checkout; without it mkdir() raises FileNotFoundError when tmp/gambit
# does not exist yet.
signatures_dir = tmpdir / 'gambit' / DATASET
signatures_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Input locations: the genome table (CSV), the directory holding the genome
# FASTA files, and the table of k-mer search parameters.
# NOTE(review): the first two are absolute paths tied to one machine — consider
# making them relative to a configurable data directory.
infiles = {
    'genomes': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/files.csv'),
    'genomes_dir': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/fasta/'),
    'params': Path('../../data/intermediate/210902-mash-Escherichia-genomes/210917-gambit/params.csv'),
}

In [16]:
# Directory for this notebook's intermediate results.
# parents=True so the top-level 'data-intermediate' directory is created too
# when it does not exist yet (otherwise mkdir() raises FileNotFoundError).
intermediate_out = Path('data-intermediate') / NBNAME
intermediate_out.mkdir(parents=True, exist_ok=True)

# HDF5 file that receives the pairwise distances and k-mer counts.
outfile = intermediate_out / f'{DATASET}.h5'

## Load data

In [9]:
# Full genome table; the 'set' and 'name' columns are used below.
genomes_all = pd.read_csv(infiles['genomes'])

# Exclude 200817 set
genomes = genomes_all.loc[genomes_all['set'].eq(200726), 'name']

# Number of genomes kept (displayed as the cell output).
ngenomes = genomes.size
ngenomes

80

In [10]:
# Parameter table, indexed by (prefix_len, prefix_version, k).
params_df = pd.read_csv(
    infiles['params'],
    index_col=['prefix_len', 'prefix_version', 'k'],
)

# Build one KmerSpec per row. The index is reset first so that `k` (an index
# level) is reachable as an ordinary field next to the `prefix` column.
params_df['kspec'] = [
    KmerSpec(row.k, row.prefix)
    for _, row in params_df.reset_index().iterrows()
]

# Number of parameter combinations (displayed as the cell output).
nparams = len(params_df)
nparams

192

## Calculate signatures

In [11]:
# One gzip-compressed FASTA sequence file per selected genome, in the order of
# `genomes`; each file is expected at <genomes_dir>/<name>.fasta.gz.
genome_files = SequenceFile.from_paths(
    (infiles['genomes_dir'].joinpath(f'{name}.fasta.gz') for name in genomes),
    'fasta',
    'gzip',
)

In [12]:
# Cache file for each parameter set, named "<k>-<prefix>.h5" inside signatures_dir.
signature_files = {
    spec: signatures_dir.joinpath(f'{spec.k}-{spec.prefix_str}.h5')
    for spec in params_df['kspec']
}

In [13]:
# Calculate the signatures of every genome for each parameter set and cache
# them on disk, one HDF5 file per KmerSpec. Parameter sets whose cache file
# already exists are skipped, so re-running the cell is cheap.
for kspec in tqdm(params_df['kspec']):
    file = signature_files[kspec]
    if file.is_file():
        continue

    sigs = calc_file_signatures(kspec, genome_files, concurrency='processes', max_workers=12)

    # Write to a temporary sibling and rename it into place only after the
    # HDF5 file has been closed. The existence check above treats any file at
    # the final path as complete, so an interrupted in-place write would leave
    # a truncated file that is silently reused on every later run.
    tmpfile = file.with_name(file.name + '.tmp')
    with h5.File(tmpfile, 'w') as f:
        HDF5Signatures.create(f, sigs, genomes)
    tmpfile.replace(file)

100%|██████████| 192/192 [00:00<00:00, 97778.82it/s]


## Pairwise distances

In [14]:
# Index pairs (row, column) of the lower triangle of the genome-by-genome
# matrix. np.tril_indices includes the main diagonal, so each genome is also
# paired with itself.
gi1, gi2 = np.tril_indices(ngenomes)
npairs = gi1.size

# Scratch buffer for one full distance matrix; allocated once and overwritten
# on every iteration through the `out=` argument.
dists_square = np.empty((ngenomes, ngenomes), dtype=np.float32)
# Results: one row per parameter set.
dists_flat = np.empty((nparams, npairs), dtype=np.float32)
kmer_counts = np.empty((nparams, ngenomes), dtype=int)

for i, kspec in enumerate(tqdm(params_df['kspec'])):
    # Load all cached signatures for this parameter set.
    with HDF5Signatures.open(signature_files[kspec]) as h5sigs:
        sigs = h5sigs[:]

    # All-vs-all Jaccard distances; keep only the lower-triangle entries.
    jaccarddist_matrix(sigs, sigs, out=dists_square)
    dists_flat[i] = dists_square[gi1, gi2]

    # Size of each genome's signature under this parameter set.
    kmer_counts[i] = sigs.sizes()

100%|██████████| 192/192 [00:45<00:00,  4.20it/s]


In [18]:
# Save the pair indices, the flattened distances (one row per parameter set)
# and the k-mer counts to the output HDF5 file, overwriting any existing file.
with h5.File(outfile, 'w') as f:
    for dset_name, dset_values in (
        ('genome1', gi1),
        ('genome2', gi2),
        ('pw_dists', dists_flat),
        ('kmer_counts', kmer_counts),
    ):
        f.create_dataset(dset_name, data=dset_values)