Compare training on samples in the order they occur in the file to training on samples out-of-order (permuted).
Also try to compare the speed of loading data using read_direct with simply slicing H5 datasets.

In [None]:
import time
import warnings

from keras.models import Sequential
from keras.layers import Dense, Dropout, Lambda
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.utils

%matplotlib inline

from vl.data import load_kmer_range_batches_h5, load_kmer_random_batches_h5

In [None]:
# silence every UserWarning for the rest of the session
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
def build_model(input_dim):
    """
    Build and compile a small binary classifier.

    The network is a 64-unit ReLU layer fed by `input_dim` features,
    followed by a single sigmoid output unit. It is compiled with the
    adam optimizer, binary cross-entropy loss and the accuracy metric.
    """

    hidden_layer = Dense(64, activation='relu', input_dim=input_dim)
    output_layer = Dense(1, activation='sigmoid')
    classifier = Sequential([hidden_layer, output_layer])

    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Train a fresh model on batches drawn through permuted sample indices and
# report the wall-clock time for the whole cell (file open, split, training).
t0 = time.time()
# train on shuffled samples
train_test_fp = '../data/training_testing.h5'

# each batch is made of
# 25 samples from the bacteria dataset
# 25 samples from the virus dataset
batch_size = 50

# the datasets are only valid while the file is open, so all training
# happens inside this `with` block
with h5py.File(train_test_fp, 'r') as train_test_file:
    bacteria_dset = train_test_file['/clean-bact/training1/extract/kmers/kmer_file1']
    virus_dset = train_test_file['/clean-vir/training1/extract/kmers/kmer_file1']

    # the second dimension of the bacteria dataset is used as the network input width
    model = build_model(input_dim=bacteria_dset.shape[1])
    
    # use mini-batch training
    # record loss and accuracy after each 'generation' (for lack of a better term)
    # a 'generation' is passed to Keras as an epoch of `batches_per_generation` steps
    generations = 2*20
    batches_per_generation = 100
    print('{} training generations = {} training samples'.format(
        generations, generations * batches_per_generation * batch_size))
    
    # divide the data into shuffled training and validation sets
    # NOTE: despite its name, `training_fraction` is a row COUNT (7/8 of the rows,
    # rounded down); the first 7/8 of the permuted index is training, the rest validation
    bacteria_index = np.random.permutation(bacteria_dset.shape[0])
    training_fraction = 7 * bacteria_dset.shape[0] // 8
    print('bacteria training_fraction: {}'.format(training_fraction))
    bacteria_training_index = bacteria_index[:training_fraction]
    bacteria_validation_index = bacteria_index[training_fraction:]

    # `training_fraction` is reused here for the virus dataset
    virus_index = np.random.permutation(virus_dset.shape[0])
    training_fraction = 7 * virus_dset.shape[0] // 8
    print('virus training fraction: {}'.format(training_fraction))
    virus_training_index = virus_index[:training_fraction]
    virus_validation_index = virus_index[training_fraction:]

    # the smaller validation set limits how many half-batches can be drawn
    validation_batches = min(len(bacteria_validation_index), len(virus_validation_index)) // (batch_size // 2)
    print('{} validation batches = {} validation samples'.format(
        validation_batches, validation_batches * batch_size))
    
    shuffled_history = model.fit_generator(
        generator=load_kmer_random_batches_h5(
            'random training',
            bacteria_dset=bacteria_dset,
            bacteria_subsample=bacteria_training_index,
            virus_dset=virus_dset,
            virus_subsample=virus_training_index,
            half_batch_size=batch_size // 2
        ),
        # there is no advantage to permuting the validation samples
        # and there may be a speed advantage to reading them in order
        # NOTE(review): the validation generator below is nevertheless given a
        # permuted index, so validation is presumably NOT read in file order
        # here - confirm against load_kmer_random_batches_h5
        validation_data=load_kmer_random_batches_h5(
            'random validation',
            bacteria_dset=bacteria_dset,
            bacteria_subsample=bacteria_validation_index,
            virus_dset=virus_dset,
            virus_subsample=virus_validation_index,
            half_batch_size=batch_size // 2
        ),
        epochs=generations,
        steps_per_epoch=batches_per_generation,
        validation_steps=validation_batches,
        # NOTE(review): workers=2 is set only in this cell, not in the unshuffled
        # training cell, so the two timings are not a like-for-like comparison;
        # also confirm the generator is safe to drive from two workers
        workers=2
    )
print('finished in {:5.2f}s'.format(time.time()-t0))

In [None]:
def plot_loss_accuracy(history, title):
    """
    Plot training/validation loss and training/validation accuracy.

    Two figures are created: one for loss and one for accuracy. In each,
    the training and validation curves share the same x axis, numbered
    from 1.

    history: object whose `history` attribute maps metric names to
        equal-length per-epoch lists (as returned by `fit_generator`);
        it must contain 'loss' and 'val_loss', plus either
        'acc'/'val_acc' or 'accuracy'/'val_accuracy'
    title: text placed on the first line of each figure title
    """
    training_performance_df = pd.DataFrame(data=history.history)
    # number the rows from 1 using the recorded history length rather than
    # a notebook-level variable, so any history object can be plotted
    training_performance_df.index = pd.RangeIndex(
        1, len(training_performance_df) + 1, name='generation')
    epochs = training_performance_df.index

    # the accuracy metric is recorded as 'acc' by older Keras releases
    # and as 'accuracy' by newer ones; accept either
    accuracy_key = 'acc' if 'acc' in training_performance_df.columns else 'accuracy'
    validation_accuracy_key = 'val_' + accuracy_key

    # every curve is given its x values explicitly: plt.plot(x, y1, y2)
    # would draw y2 against 0..n-1 and shift it relative to y1
    plt.figure()
    plt.plot(epochs, training_performance_df['loss'],
             epochs, training_performance_df['val_loss'])
    plt.title('{}\nTraining and Validation Loss'.format(title))
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['training', 'validation'])

    plt.figure()
    plt.plot(epochs, training_performance_df[accuracy_key],
             epochs, training_performance_df[validation_accuracy_key])
    plt.title('{}\nTraining and Validation Accuracy'.format(title))
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend(['training', 'validation'])

In [None]:
# loss and accuracy curves for the run trained on permuted samples
plot_loss_accuracy(
    history=shuffled_history,
    title='Shuffled Training Samples',
)

In [None]:
# Train a fresh model on batches read in file order and report the
# wall-clock time for the whole cell (file open, split, training).
t0 = time.time()
# train on unshuffled samples
train_test_fp = '../data/training_testing.h5'

# each batch is made of
# 25 samples from the bacteria dataset
# 25 samples from the virus dataset
batch_size = 50

# the datasets are only valid while the file is open, so all training
# happens inside this `with` block
with h5py.File(train_test_fp, 'r') as train_test_file:
    bacteria_dset = train_test_file['/clean-bact/training1/extract/kmers/kmer_file1']
    virus_dset = train_test_file['/clean-vir/training1/extract/kmers/kmer_file1']

    # the second dimension of the bacteria dataset is used as the network input width
    model = build_model(input_dim=bacteria_dset.shape[1])

    # use mini-batch training
    # record loss and accuracy after each 'generation' (for lack of a better term)
    # a 'generation' is passed to Keras as an epoch of `batches_per_generation` steps
    generations = 2*20
    batches_per_generation = 100
    print('{} training generations = {} training samples'.format(
        generations, generations * batches_per_generation * batch_size))

    # divide the data into un-shuffled training and validation sets:
    # rows [0, count) are training, rows [count, end) are validation
    bacteria_training_sample_count = 7 * bacteria_dset.shape[0] // 8
    virus_training_sample_count = 7 * virus_dset.shape[0] // 8

    bacteria_validation_sample_count = bacteria_dset.shape[0] - bacteria_training_sample_count
    virus_validation_sample_count = virus_dset.shape[0] - virus_training_sample_count

    # every validation batch needs a half batch from BOTH datasets, so the
    # smaller validation range limits the number of batches (this matches
    # the shuffled training cell); using the bacteria count alone can ask
    # for more batches than the virus range can supply
    validation_batches = min(
        bacteria_validation_sample_count,
        virus_validation_sample_count) // (batch_size // 2)
    print('{} validation batches = {} validation samples'.format(
        validation_batches, validation_batches * batch_size))

    unshuffled_history = model.fit_generator(
        generator=load_kmer_range_batches_h5(
            name='range training',
            bacteria_dset=bacteria_dset,
            bacteria_range=(0, bacteria_training_sample_count),
            virus_dset=virus_dset,
            virus_range=(0, virus_training_sample_count),
            half_batch_size=batch_size // 2,
            shuffle_batch=False
        ),
        # there is no advantage to permuting the validation samples
        # and there may be a speed advantage to reading them in order
        validation_data=load_kmer_range_batches_h5(
            name='range validation',
            bacteria_dset=bacteria_dset,
            bacteria_range=(bacteria_training_sample_count, bacteria_dset.shape[0]),
            virus_dset=virus_dset,
            virus_range=(virus_training_sample_count, virus_dset.shape[0]),
            half_batch_size=batch_size // 2,
            shuffle_batch=False
        ),
        epochs=generations,
        steps_per_epoch=batches_per_generation,
        validation_steps=validation_batches
    )
print('finished in {:5.2f}s'.format(time.time()-t0))

In [None]:
# loss and accuracy curves for the run trained on samples in file order
plot_loss_accuracy(
    history=unshuffled_history,
    title='UnShuffled Training Samples',
)