# 211001 Benchmarks

In [1]:
from pathlib import Path
from datetime import datetime

In [2]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [4]:
from gambit.signatures.hdf5 import HDF5Signatures

## Setup

In [39]:
# Date prefix and full name of this notebook; NBNAME names the output directory below.
DATESTR = '211001'
NBNAME = f'{DATESTR}-benchmarks'

In [5]:
# Input locations, keyed by short name.
# NOTE(review): only infiles['params'] is read in the cells visible here; the
# signatures directory actually used is defined separately as `orig_dir`.
infiles = {
    'params': Path('../../data/intermediate/210902-mash-Escherichia-genomes/210917-gambit/params.csv'),
    'sigs': Path('labnb/experiments/210902-mash-Escherichia-genomes/tmp/gambit/signatures/'),
}

In [42]:
# Directory that receives the CSV outputs of this notebook.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True: on a fresh checkout the 'data-intermediate' parent may not exist
# yet, and a plain mkdir() would raise FileNotFoundError.
intermediate_out.mkdir(parents=True, exist_ok=True)

In [6]:
# Scratch directory (relative to the working directory); the compressed copies
# of the signature files are written underneath it.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

## Load data

### Original signatures

In [7]:
# One row per parameter combination, indexed by (prefix_len, prefix_version, k).
params_df = pd.read_csv(
    infiles['params'],
    index_col=['prefix_len', 'prefix_version', 'k'],
)
# Number of parameter combinations (= number of signature files benchmarked).
nparams = len(params_df)

In [8]:
# Directory holding the original signature files, one per row of params_df.
# NOTE(review): infiles['sigs'] names a similar directory under a different
# relative root and is not used here — confirm which one is intended.
orig_dir = Path('../210902-mash-Escherichia-genomes/tmp/gambit/signatures/')

In [9]:
# Path of the original signature file for each row of params_df, in row order.
# Files are named '<k>-<prefix>.h5'. Values are taken straight from the `k`
# index level and the `prefix` column: DataFrame.iterrows() builds one Series
# per row and does not preserve column dtypes, so an integer `k` could be
# upcast to float and formatted as e.g. '21.0'.
orig_files = [
    orig_dir / f'{k}-{prefix}.h5'
    for k, prefix in zip(params_df.index.get_level_values('k'), params_df['prefix'])
]

In [69]:
# Collect basic metadata from every original signature file and attach it to
# params_df as extra columns (same row order as orig_files).
nkmers = []
nsigs = []
dtypes = []
nbytes = []

for orig_file in orig_files:
    # Use the context manager so each HDF5 file is closed again; opening all
    # of them without closing leaks one file handle per parameter combination.
    with HDF5Signatures.open(orig_file) as sigs:
        nkmers.append(len(sigs.values))  # presumably total k-mers over all signatures — confirm
        nsigs.append(len(sigs))
        dtypes.append(str(sigs.dtype))
        nbytes.append(sigs.dtype.itemsize)

params_df['nkmers'] = nkmers
params_df['nsigs'] = nsigs
params_df['dtype'] = dtypes
params_df['dtype_bytes'] = nbytes

## Compress

In [20]:
# Compression methods to benchmark; 'none' stands for the original files.
methods = ['gzip', 'lzf']
methods_all = ['none'] + methods

In [21]:
# Maps method name -> list of signature file paths, aligned with params_df rows.
# The 'none' entry is the list of original files.
files = {'none': orig_files}

In [22]:
# For every compression method, create tmp/<method>/ and register one output
# path per original file, keeping the original file name.
for method in methods:
    method_dir = tmpdir / method
    method_dir.mkdir(exist_ok=True)
    files[method] = [method_dir / orig.name for orig in orig_files]

In [23]:
# Write a compressed copy of every original signature file, once per method.
# Copies that already exist are kept, so re-running this cell is cheap.
for method in methods:
    pairs = zip(orig_files, files[method])

    # total= lets tqdm show a real progress bar; zip() has no length.
    for orig_file, compressed_file in tqdm(pairs, desc=method, total=len(orig_files)):
        if compressed_file.is_file():
            continue  # cached from an earlier run; don't even open the source

        # Write under a temporary name and rename when finished, so that an
        # interrupted run cannot leave a truncated file that the is_file()
        # check above would later accept as complete.
        part_file = compressed_file.with_name(compressed_file.name + '.part')

        # Both files are closed on exit, including the source signatures.
        with HDF5Signatures.open(orig_file) as sigs, h5.File(part_file, 'w') as f:
            HDF5Signatures.create(f, sigs, compression=method)

        part_file.replace(compressed_file)

gzip: 192it [00:00, 735.12it/s]
lzf: 192it [00:57,  3.31it/s]


## Measure performance

In [24]:
# Number of times each timing measurement is repeated per file and method.
ntrials = 5

In [30]:
# Column index for the timing tables: every (method, trial) pair, trials numbered from 1.
trials_index = pd.MultiIndex.from_product(
    (methods_all, range(1, ntrials + 1)),
    names=['method', 'trial'],
)

### File sizes

In [25]:
# On-disk size in bytes of every signature file: one column per method,
# one row per parameter combination.
sizes_df = pd.DataFrame(
    {
        name: [path.stat().st_size for path in paths]
        for name, paths in files.items()
    },
    index=params_df.index,
)

In [27]:
# Compressed size as a fraction of the uncompressed ('none') size, per method.
for method in methods:
    sizes_df[f'{method}_ratio'] = sizes_df[method].div(sizes_df['none'])

### Read all at once

In [33]:
# Timing table for bulk reads: rows are parameter combinations, columns are
# (method, trial). Every cell starts at -1 until a measurement is written.
read_all = pd.DataFrame(
    np.full((nparams, len(trials_index)), -1),
    index=params_df.index,
    columns=trials_index,
)

In [36]:
# Time a bulk read (`sigs[:]`) of every signature file, for every method,
# `ntrials` times. Results are stored in read_all as whole microseconds.
for trial in range(ntrials):
    for i in tqdm(range(nparams)):
        for method in methods_all:
            file = files[method][i]

            with HDF5Signatures.open(file) as sigs:
                start = datetime.now()
                sigs[:]
                dt = datetime.now() - start

            # timedelta.microseconds is only the sub-second component (0-999999),
            # so it silently drops whole seconds; convert the full duration instead.
            elapsed_us = round(dt.total_seconds() * 1_000_000)

            # Write with a single positional indexer. The chained form
            # `.loc[:, col].iloc[i] = ...` assigns into a temporary Series that
            # is not guaranteed to be a view of read_all, so the value can be lost.
            col = read_all.columns.get_loc((method, trial + 1))
            read_all.iloc[i, col] = elapsed_us

100%|██████████| 192/192 [00:35<00:00,  5.41it/s]
100%|██████████| 192/192 [00:36<00:00,  5.30it/s]
100%|██████████| 192/192 [00:36<00:00,  5.29it/s]
100%|██████████| 192/192 [00:37<00:00,  5.16it/s]
100%|██████████| 192/192 [00:36<00:00,  5.26it/s]


### Read one at a time

In [37]:
# Timing table for one-at-a-time reads: rows are parameter combinations,
# columns are (method, trial). Every cell starts at -1 until a measurement is written.
read_single = pd.DataFrame(
    np.full((nparams, len(trials_index)), -1),
    index=params_df.index,
    columns=trials_index,
)

In [38]:
# Time iterating over every signature individually, for every file and method,
# `ntrials` times. Results are stored in read_single as whole microseconds.
for trial in range(ntrials):
    for i in tqdm(range(nparams)):
        for method in methods_all:
            file = files[method][i]

            with HDF5Signatures.open(file) as sigs:
                start = datetime.now()
                for sig in sigs:
                    pass
                dt = datetime.now() - start

            # timedelta.microseconds is only the sub-second component (0-999999),
            # so it silently drops whole seconds; convert the full duration instead.
            elapsed_us = round(dt.total_seconds() * 1_000_000)

            # Write with a single positional indexer. The chained form
            # `.loc[:, col].iloc[i] = ...` assigns into a temporary Series that
            # is not guaranteed to be a view of read_single, so the value can be lost.
            col = read_single.columns.get_loc((method, trial + 1))
            read_single.iloc[i, col] = elapsed_us

100%|██████████| 192/192 [01:18<00:00,  2.45it/s]
100%|██████████| 192/192 [01:18<00:00,  2.44it/s]
100%|██████████| 192/192 [01:17<00:00,  2.49it/s]
100%|██████████| 192/192 [01:20<00:00,  2.39it/s]
100%|██████████| 192/192 [01:19<00:00,  2.42it/s]


## Save data

In [71]:
# Parameter table, including the nkmers/nsigs/dtype/dtype_bytes columns added above.
params_df.to_csv(intermediate_out / 'signature-sets.csv')

In [55]:
# File sizes in bytes per method, plus the '<method>_ratio' columns.
sizes_df.to_csv(intermediate_out / 'file-sizes.csv')

In [56]:
# Stack both timing tables on top of each other, labelled by a new outer
# 'read_method' row level ('all' / 'single').
reads_wide = pd.concat(
    [read_all, read_single],
    keys=['all', 'single'],
    names=['read_method'],
)
# Move the 'trial' column level into the rows, leaving one column per method.
reads_df = reads_wide.stack('trial')

reads_df.to_csv(intermediate_out / 'read-performance.csv')