# 211118 Simple benchmarks

In [34]:
from pathlib import Path
from subprocess import run, CalledProcessError, PIPE
from itertools import product

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# from gambit.db import load_database_from_dir
# from gambit.db.fromfile import locate_db_files

from gambit.sigs import SignatureArray
from gambit.sigs.calc import calc_file_signatures
from gambit.io.seq import SequenceFile
from gambit.sigs.hdf5 import HDF5Signatures
from gambit.kmers import KmerSpec
from gambit.metric import jaccarddist_matrix

## Setup

In [4]:
DATESTR = '211118'

In [5]:
infiles = dict(
    ondov_table=Path('../210902-mash-Escherichia-genomes/data-processed/210902-get-genomes/210902-mash-genomes-filtered.csv'),
    ondov_genomes_dir=Path('/home/jared/projects/gambit/data/external/ondov-2016/Escherichia/genomes/'),
    konstantinidis_table=Path('../../data/external/konstantinidis-2005/210910-genomes.csv'),
    konstantinidis_genomes_dir=Path('/home/jared/projects/gambit/data/external/konstantinidis-2005/genomes/'),
)

In [6]:
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

outputs_dir = tmpdir / 'outputs'
outputs_dir.mkdir(exist_ok=True)

## Prepare genome files

### Locate zipped files

In [7]:
zipped_genome_files = dict()

In [8]:
ondov_table = pd.read_csv(infiles['ondov_table'])
ondov_table.sort_values('assembly_accession', inplace=True)

zipped_genome_files['ondov_2016'] = [
    infiles['ondov_genomes_dir'] / f'{acc}.fa.gz'
    for acc in ondov_table['assembly_accession']
]

In [9]:
konst_table = pd.read_csv(infiles['konstantinidis_table'])
konst_table.sort_values('assembly', inplace=True)

zipped_genome_files['konstantinidis_2005'] = [
    infiles['konstantinidis_genomes_dir'] / f'{acc}.fa.gz'
    for acc in konst_table['assembly']
]

In [10]:
dsets = list(zipped_genome_files.keys())

### Unzip

In [11]:
genomes_base_dir = tmpdir / 'genomes'
genomes_base_dir.mkdir(exist_ok=True)

genome_dirs = {dset: genomes_base_dir / dset for dset in dsets}

for d in genome_dirs.values():
    d.mkdir(exist_ok=True)

In [12]:
genome_files = {
    dset: [genome_dirs[dset] / f.stem for f in files]
    for dset, files in zipped_genome_files.items()
}

In [13]:
for ds in dsets:
    for src, dst in tqdm(zip(zipped_genome_files[ds], genome_files[ds])):
        assert src.is_file()
        if not dst.is_file():
            run(f'gunzip -c {src} > {dst}', shell=True, check=True)

492it [00:00, 100840.38it/s]
70it [00:00, 107862.34it/s]


## Select reference genomes

In [14]:
ref_files = dict()

### Ondov

Just select 25 random ones.

In [15]:
np.random.seed(0)
ref_files['ondov_2016'] = np.random.choice(genome_files['ondov_2016'], size=25, replace=False)

### Konstantinidis

Select one from each group.

In [16]:
_ds = 'konstantinidis_2005'

ref_files[_ds] = [
    genome_dirs[_ds] / f'{acc}.fa'
    for acc in konst_table.groupby('group')['assembly'].first()
]

## Configure tools

In [17]:
benchmark_items = []

### General

In [18]:
NPROCS = 6
NTHREADS = NPROCS * 2

In [19]:
QUERY_FILE_LISTS = {ds: d / 'queries.txt' for ds, d in genome_dirs.items()}
REF_FILE_LISTS = {ds: d / 'refs.txt' for ds, d in genome_dirs.items()}

for ds in dsets:
    with open(QUERY_FILE_LISTS[ds], 'w') as f:
        for p in genome_files[ds]:
            f.write(f'{p}\n')
            
    with open(REF_FILE_LISTS[ds], 'w') as f:
        for p in ref_files[ds]:
            f.write(f'{p}\n')

### GAMBIT

In [None]:
gambit_sigs_out = outputs_dir / 'gambit-signatures.h5'
# gambit_dists_out = outputs_dir / 'gambit-query-results.json'

In [None]:
GAMBIT_K = [11]
GAMBIT_PREFIX = ['ATGAC']

GAMBIT_PARAMS = {f'{k}-{prefix}': dict(k=k, prefix=prefix) for k, prefix in product(GAMBIT_K, GAMBIT_PREFIX)}

In [None]:
def calc_gambit_sigs(**kw):
    return SignatureArray(calc_file_signatures(kspec, seqfiles, concurrency='processes', **kw))

In [None]:
def benchmark_gambit_sigs(dset, params=None):
    fff

In [None]:
sigs = calc_gambit_sigs(progress=True)

In [None]:
def benchmark_gambit_dists(dset, params=None):
    return jaccarddist_matrix(sigs, sigs)

### Mash

In [42]:
MASH_CMD = '/home/jared/opt/anaconda3/envs/gambit-bioconda/bin/mash'

MASH_S = [500, 1_000, 5_000, 10_000]
MASH_K = [16, 21]

MASH_PARAMS = {f's{s}-k{k}': (s, k) for s, k in product(MASH_S, MASH_K)}

MASH_SKETCH_OUT = outputs_dir / '.mash-sketch'

In [60]:
outputs_dir

PosixPath('tmp/outputs')

In [62]:
def benchmark_mash_sketch(dset, params):
    queries = genome_files[dset]
    s, k = params
    
    cmd = list(map(str, [
        MASH_CMD,
        'sketch',
        '-p', NTHREADS,
        '-l', QUERY_FILE_LISTS[ds],
        '-k', k,
        '-s', s,
        '-o', MASH_SKETCH_OUT,
    ]))
    
    return run(cmd, capture_output=True)

In [63]:
%time r = benchmark_mash_sketch('ondov_2016', (500, 16))

CPU times: user 10.5 ms, sys: 8.8 ms, total: 19.3 ms
Wall time: 1.24 s


In [58]:
len(list(r.stderr.decode().splitlines()))

71

### FastANI

## Run benchmarks