## neural network trained on kmers loaded from H5 file
Steps:
1. load data
2. find dimensions of the data
3. standardize the data?
4. build a model
5. train the model

In [None]:
import gzip
from io import StringIO
import itertools

import h5py
import numpy as np
from numpy import random

import pandas as pd
import sklearn.utils

from keras.models import Sequential
from keras.layers import Dense, Activation

### 1. Load Data

In [None]:
# Scratch check of the batching arithmetic: for an example dataset shape,
# list the row offsets at which consecutive batches would start.
bacteria_dataset_shape = (100, 32000)
batch_size = 8
list(range(bacteria_dataset_shape[0])[::batch_size])

In [None]:
def load_kmer_batches_h5(bacteria_kmer_fp, virus_kmer_fp, batch_size):
    """Yield shuffled (batch, labels) pairs read from two HDF5 k-mer files.

    Each yielded batch stacks up to ``batch_size`` rows of the ``'bacteria'``
    dataset on top of the same number of rows of the ``'virus'`` dataset, then
    shuffles rows and labels together. Bacteria rows are labelled 0 and virus
    rows are labelled 1.

    Rows are taken from the same offsets in both datasets, so only the first
    ``min(bacteria_rows, virus_rows)`` rows of each dataset are used. The final
    batch is smaller when that row count is not a multiple of ``batch_size``.

    Args:
        bacteria_kmer_fp: path to an HDF5 file containing a 2-D dataset
            named ``'bacteria'``.
        virus_kmer_fp: path to an HDF5 file containing a 2-D dataset
            named ``'virus'``.
        batch_size: maximum number of rows to read from each dataset per
            batch; a yielded batch therefore has up to ``2 * batch_size`` rows.

    Yields:
        The result of ``sklearn.utils.shuffle(batch, labels)``: the stacked
        rows and a matching column of 0/1 labels, shuffled consistently.

    Raises:
        ValueError: if ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got {!r}'.format(batch_size))

    with h5py.File(bacteria_kmer_fp, 'r') as bacteria_file, h5py.File(virus_kmer_fp, 'r') as virus_file:
        bacteria_dataset = bacteria_file['bacteria']
        virus_dataset = virus_file['virus']

        # Reusable read buffers; np.vstack below copies out of them, so the
        # arrays handed to the caller are never overwritten by a later read.
        bacteria_batch = np.zeros((batch_size, bacteria_dataset.shape[1]))
        virus_batch = np.zeros((batch_size, virus_dataset.shape[1]))
        print('kmer batch shape is {}'.format((bacteria_batch.shape[0] * 2, bacteria_batch.shape[1])))

        # Both datasets are read at the same row offsets, so stop at the
        # shorter one instead of requesting rows that do not exist.
        paired_row_count = min(bacteria_dataset.shape[0], virus_dataset.shape[0])

        for n in range(0, paired_row_count, batch_size):
            # The last batch may be short. read_direct needs the source and
            # destination selections to have matching shapes, so read only
            # `rows` rows into the first `rows` rows of each buffer.
            rows = min(batch_size, paired_row_count - n)
            source_slice = np.s_[n:n + rows, :]
            dest_slice = np.s_[:rows, :]
            bacteria_dataset.read_direct(bacteria_batch, source_sel=source_slice, dest_sel=dest_slice)
            virus_dataset.read_direct(virus_batch, source_sel=source_slice, dest_sel=dest_slice)
            batch = np.vstack((bacteria_batch[:rows], virus_batch[:rows]))

            # bacteria label is 0
            # virus label is 1
            labels = np.vstack((np.zeros((rows, 1)), np.ones((rows, 1))))

            # yield shuffled copies; `batch` and `labels` themselves are
            # not modified by sklearn.utils.shuffle
            yield sklearn.utils.shuffle(batch, labels)


In [None]:
def load_kmer_batches_h5_shuffle_labels(bacteria_kmer_fp, virus_kmer_fp, batch_size):
    """Yield batches from ``load_kmer_batches_h5`` with the labels re-shuffled.

    The labels of every batch are shuffled independently of the rows, so they
    no longer correspond to the samples. Used as a sanity-check input for
    training.

    Yields:
        ``(batch, shuffled_labels)`` tuples.
    """
    real_batches = load_kmer_batches_h5(bacteria_kmer_fp, virus_kmer_fp, batch_size)
    for features, true_labels in real_batches:
        # Shuffling the labels alone breaks the row/label pairing.
        yield features, sklearn.utils.shuffle(true_labels)

In [None]:
# Paths to the bacteria k-mer HDF5 files.
bacteria_kmer_file1_fp, bacteria_kmer_file2_fp = (
    '../data/bact_kmer_file1.h5',
    '../data/bact_kmer_file2.h5',
)

In [None]:
# Paths to the virus k-mer HDF5 files.
virus_kmer_file1_fp, virus_kmer_file2_fp = (
    '../data/vir_kmer_file1.h5',
    '../data/vir_kmer_file2.h5',
)

In [None]:
# Open the first bacteria file read-only and print the shape of its
# 'bacteria' dataset as a quick check that the file is readable.
# `dset` refers to a closed file once the `with` block exits.
with h5py.File(bacteria_kmer_file1_fp, 'r') as bacteria_file:
    dset = bacteria_file['bacteria']
    print(dset.shape)

In [None]:
# Pull one batch (10 rows requested per class) and print its top-left corner
# and first labels; `break` stops after the first batch.
# `batch` and `labels` stay bound afterwards and `batch` is used below to
# work out the data dimensions.
for batch, labels in load_kmer_batches_h5(bacteria_kmer_file1_fp, virus_kmer_file1_fp, 10):
    print(batch[:5, :5])
    print(labels[:5])
    break

### 2. Find the dimensions of the data

In [None]:
# `batch` is a 2-D array: axis 0 indexes samples, axis 1 indexes features.
batch_sample_count, batch_feature_count = batch.shape

print('batch feature count : {}'.format(batch_feature_count))
print('batch sample count  : {}'.format(batch_sample_count))

### 4. Build a Model

In [None]:
# Binary classifier: one 8-unit ReLU hidden layer fed by the k-mer features,
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(8, activation='relu', input_dim=batch_feature_count),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


### 5. Train the Model

In [None]:
# Sanity check: train on batches whose labels were shuffled independently of
# the samples.
# NOTE(review): Keras documents that a generator passed to fit_generator
# should loop indefinitely, but this one stops after a single pass over the
# files -- confirm the data covers steps_per_epoch batches of 16 rows per class.
model.fit_generator(
    load_kmer_batches_h5_shuffle_labels(bacteria_kmer_file1_fp, virus_kmer_file1_fp, batch_size=16),
    steps_per_epoch=10,
    workers=2,
    verbose=1,
)