In [None]:
import time

import h5py
import numpy as np

from vl.data import load_kmer_range_batches_h5, load_kmer_random_batches_h5

In [None]:
# speed test
def speed_test(gen, test_name):
    """Time the first epoch of a batch generator and print a short report.

    Parameters
    ----------
    gen : iterable
        Yields ``(batch, labels, step, epoch)`` tuples. Iteration stops at the
        first item whose ``epoch`` equals 2, or when ``gen`` is exhausted.
    test_name : str
        Label inserted into the printed report.

    Notes
    -----
    The reported step count is the last ``step`` value seen before the
    stopping item, so it assumes ``step`` is a 1-based counter within the
    epoch -- TODO confirm against the generator implementation.
    """
    print('begin test "{}"'.format(test_name))
    # Start at zero so the report still works if no step completes.
    step_count = 0
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    t0 = time.perf_counter()
    # Iterate the generator that was passed in. The loop used to read a
    # notebook global named `gen1`, which ignored the `gen` argument.
    for _batch, _labels, step, epoch in gen:
        if epoch == 2:
            break
        step_count = step
    elapsed = time.perf_counter() - t0
    print('finished test "{}"'.format(test_name))
    print('  {} steps'.format(step_count))
    if step_count:
        print('  {:5.2f}ms per step'.format(elapsed * 1000 / step_count))
    else:
        # Avoid a ZeroDivisionError when the generator produced no usable step.
        print('  no steps completed; per-step time unavailable')
    print('  finished in {:5.2f}s'.format(elapsed))

In [None]:
# Benchmark: load_kmer_range_batches_h5 over rows 0..10000 of each dataset
# with shuffle_batch=True.
# Previously recorded timings:
#   84ms per step
#   89ms per step separate arrays
# The file is opened read-only ('r') explicitly: this cell only reads, and
# older h5py releases defaulted to a writable mode when none was given.
with h5py.File('../data/training_testing.h5', 'r') as input_file:
    dset1 = input_file['/clean-bact/training1/extract/kmers/kmer_file1']
    dset2 = input_file['/clean-vir/training1/extract/kmers/kmer_file1']

    half_batch_size = 50

    gen1 = load_kmer_range_batches_h5(
        name='range shuffled',
        bacteria_dset=dset1,
        virus_dset=dset2,
        bacteria_range=(0, 10000),
        virus_range=(0, 10000),
        half_batch_size=half_batch_size,
        shuffle_batch=True,
        yield_state=True)

    # Timing must run inside the `with` block: the datasets are only valid
    # while the file is open.
    speed_test(gen1, 'range shuffled')


In [None]:
# Benchmark: load_kmer_range_batches_h5 over rows 0..10000 of each dataset
# with shuffle_batch=False.
# Previously recorded timings:
#   82ms per step
#   81ms per step separate arrays
# The file is opened read-only ('r') explicitly: this cell only reads, and
# older h5py releases defaulted to a writable mode when none was given.
with h5py.File('../data/training_testing.h5', 'r') as input_file:
    dset1 = input_file['/clean-bact/training1/extract/kmers/kmer_file1']
    dset2 = input_file['/clean-vir/training1/extract/kmers/kmer_file1']

    half_batch_size = 50

    gen1 = load_kmer_range_batches_h5(
        name='range unshuffled',
        bacteria_dset=dset1,
        virus_dset=dset2,
        bacteria_range=(0, 10000),
        virus_range=(0, 10000),
        half_batch_size=half_batch_size,
        shuffle_batch=False,
        yield_state=True)

    # Timing must run inside the `with` block: the datasets are only valid
    # while the file is open.
    speed_test(gen1, 'range unshuffled')


In [None]:
# Benchmark: load_kmer_random_batches_h5 over a random subsample of up to
# 10000 row indices from each dataset, with shuffle_batch=True.
# Previously recorded timings:
#   501ms per step using read_direct with one array for bacteria and virus
#   561ms per step using read_direct with separate arrays for bacteria and virus
# The file is opened read-only ('r') explicitly: this cell only reads, and
# older h5py releases defaulted to a writable mode when none was given.
with h5py.File('../data/training_testing.h5', 'r') as input_file:
    dset1 = input_file['/clean-bact/training1/extract/kmers/kmer_file1']
    dset2 = input_file['/clean-vir/training1/extract/kmers/kmer_file1']

    half_batch_size = 50

    # NOTE(review): the permutations are unseeded, so each run samples a
    # different set of rows; seed NumPy first if timings must be comparable.
    gen1 = load_kmer_random_batches_h5(
        name='random shuffled',
        bacteria_dset=dset1,
        virus_dset=dset2,
        bacteria_subsample=np.random.permutation(dset1.shape[0])[:10000],
        virus_subsample=np.random.permutation(dset2.shape[0])[:10000],
        half_batch_size=half_batch_size,
        shuffle_batch=True,
        yield_state=True)

    # Timing must run inside the `with` block: the datasets are only valid
    # while the file is open.
    speed_test(gen1, 'random shuffled')


In [None]:
# a little test for the generator
def test_generator():
    """Smoke-test ``load_kmer_random_batches_h5`` on a tiny HDF5 file.

    Writes two small datasets to ``../data/test_generator_data.h5``
    (overwriting any existing file), prints them, then prints items drawn
    from two generators built over those datasets. Nothing is asserted; the
    printed output is meant to be inspected by eye.
    """
    test_path = '../data/test_generator_data.h5'

    with h5py.File(test_path, 'w') as test_data:
        dset1_shape = (12, 2)
        dset1 = test_data.create_dataset('/test/data1', dset1_shape)
        # np.prod replaces np.product, which NumPy deprecated and then
        # removed in 2.0.
        dset1[:, :] = np.arange(np.prod(dset1_shape)).reshape(dset1_shape)

        dset2_shape = (10, 2)
        dset2 = test_data.create_dataset('/test/data2', dset2_shape)
        # NOTE(review): the offset is the size of dset2 (20), so dset2 holds
        # 20..39 and overlaps dset1's 20..23. If the values are meant to be
        # unique across datasets, the offset should be dset1's size -- confirm.
        dset2[:, :] = (
            np.arange(np.prod(dset2_shape)).reshape(dset2_shape)
            + np.prod(dset2_shape)
        )

    # Re-open read-only to show what was written.
    with h5py.File(test_path, 'r') as test_data:
        dset1 = test_data['/test/data1']
        print('{}:\n{}'.format(dset1.name, dset1[:]))
        print()
        dset2 = test_data['/test/data2']
        print('{}:\n{}'.format(dset2.name, dset2[:]))
        print()

    with h5py.File(test_path, 'r') as test_data:
        dset1 = test_data['/test/data1']
        dset2 = test_data['/test/data2']

        # gen1: all rows of both datasets in index order, half_batch_size=5,
        # no shuffling, and no yield_state. Only the first item is drawn and
        # printed as-is.
        gen1 = load_kmer_random_batches_h5(
            name='gen1',
            bacteria_dset=dset1,
            virus_dset=dset2,
            bacteria_subsample=np.arange(dset1.shape[0]),
            virus_subsample=np.arange(dset2.shape[0]),
            half_batch_size=5,
            shuffle_batch=False)

        first_item = next(gen1)
        print('stuff:\n{}'.format(first_item))

        # gen2: bacteria rows requested in descending order (11 down to 1),
        # with yield_state=True so each item also carries step and epoch.
        gen2 = load_kmer_random_batches_h5(
            name='gen2',
            bacteria_dset=dset1,
            virus_dset=dset2,
            bacteria_subsample=[11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            virus_subsample=np.arange(dset2.shape[0]),
            half_batch_size=5,
            shuffle_batch=False,
            yield_state=True)

        # Print every batch until the item with step 2 of epoch 2 arrives.
        for batch, labels, step, epoch in gen2:
            if epoch == 2 and step == 2:
                break
            print('batch {}:'.format(step))
            # Show each row next to its label.
            print(np.hstack((batch, labels)))


In [None]:
# Run the generator smoke test. It (re)writes ../data/test_generator_data.h5
# and prints the datasets and the batches drawn from them.
test_generator()

In [None]:
import sklearn.utils

In [None]:
# Toy fixture for trying out shuffling: a 5x5 grid holding 0..24 and a
# 5x1 column holding 0..4, so row i of `batch` lines up with label i.
batch = np.arange(25).reshape(5, 5)
labels = np.arange(5)[:, np.newaxis]

In [None]:
# Display the toy batch.
batch

In [None]:
# Display the toy labels.
labels

In [None]:
# Shuffle batch and labels in a single sklearn.utils.shuffle call --
# presumably to check that rows and labels stay paired. No random_state is
# passed, so the resulting order is not reproducible between runs.
sbatch, slabels = sklearn.utils.shuffle(batch, labels)

In [None]:
# Display the first array returned by sklearn.utils.shuffle (from batch).
sbatch

In [None]:
# Display the second array returned by sklearn.utils.shuffle (from labels).
slabels

In [None]:
# Display labels again after the shuffle call -- presumably to check that the
# original array was not reordered in place.
labels

In [None]:
# Evaluates to True only if labels is still the 5x1 column 0..4 in order.
np.all(labels == [[0], [1], [2], [3], [4]])

In [None]:
# Select rows 1 and 0 of batch (in that order) with all columns.
batch[(1, 0), :]