## Neural network trained on k-mers using Keras and NumPy
Steps:
1. load data
2. find dimensions of the data
3. standardize the data?
4. build a model
5. train the model

In [None]:
import gzip
from io import StringIO
import itertools

import numpy as np
from numpy import random

import pandas as pd
import sklearn.utils

from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, Dropout, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer

### 1. Load Data

In [None]:
def load_kmer_batches(bacteria_kmer_fp, virus_kmer_fp, batch_size):
    """
    Yield shuffled, labeled batches of k-mer rows from two tab-delimited files.

    Both files are read lazily in chunks of at most ``batch_size`` rows. The
    chunks are paired in order, stacked bacteria-first, labeled 0 for
    bacteria rows and 1 for virus rows, and then shuffled so that rows and
    labels stay aligned.

    Parameters
    ----------
    bacteria_kmer_fp, virus_kmer_fp
        Paths (or buffers) passed to ``pandas.read_table``. The first column
        is used as the index and any column named 'read_type' is skipped.
    batch_size : int
        Maximum number of rows read from EACH file per batch, so a full
        batch holds ``2 * batch_size`` rows.

    Yields
    ------
    (batch_df, labels) : tuple
        ``batch_df`` is a DataFrame of the stacked, shuffled rows and
        ``labels`` is a float array of shape ``(len(batch_df), 1)`` shuffled
        with the same permutation.

    Notes
    -----
    Iteration stops as soon as either file runs out of chunks, so trailing
    rows of the longer file are never yielded.
    """

    def not_read_type(column_name):
        """
        Return True if the column name is NOT 'read_type'.
        """
        return column_name != 'read_type'

    def read_chunks(kmer_fp):
        """
        Return a chunked reader over one k-mer table.
        """
        return pd.read_table(
            filepath_or_buffer=kmer_fp,
            index_col=0,
            usecols=not_read_type,
            engine='c',
            chunksize=batch_size)

    bacteria_kmer_iter = read_chunks(bacteria_kmer_fp)
    virus_kmer_iter = read_chunks(virus_kmer_fp)

    for bacteria_batch, virus_batch in zip(bacteria_kmer_iter, virus_kmer_iter):
        batch_df = pd.concat((bacteria_batch, virus_batch))
        # Size the labels from the chunks actually read: the final chunk of a
        # file can be shorter than batch_size, and a fixed-size label array
        # would then disagree with batch_df and make the shuffle fail.
        labels = np.vstack((
            np.zeros((len(bacteria_batch), 1)),
            np.ones((len(virus_batch), 1))))
        shuffled_df, shuffled_labels = sklearn.utils.shuffle(batch_df, labels)
        # Yield a real tuple (Keras generators are documented to yield tuples).
        yield shuffled_df, shuffled_labels


In [None]:
def load_kmer_batches_shuffle_labels(bacteria_kmer_fp, virus_kmer_fp, batch_size):
    """
    Yield batches from load_kmer_batches with the labels shuffled again.

    The labels are re-shuffled on their own, without the rows, so they no
    longer line up with the rows they were generated for.
    """
    aligned_batches = load_kmer_batches(bacteria_kmer_fp, virus_kmer_fp, batch_size)
    yield from (
        (features, sklearn.utils.shuffle(targets))
        for features, targets in aligned_batches
    )

In [None]:
# Gzipped, tab-delimited k-mer tables for the bacterial reads.
bacteria_kmer_file1_fp, bacteria_kmer_file2_fp = (
    '../data/bact_kmer_file1.fasta.tab.gz',
    '../data/bact_kmer_file2.fasta.tab.gz',
)

In [None]:
# Gzipped, tab-delimited k-mer tables for the viral reads.
virus_kmer_file1_fp, virus_kmer_file2_fp = (
    '../data/vir_kmer_file1.fasta.tab.gz',
    '../data/vir_kmer_file2.fasta.tab.gz',
)

In [None]:
# Pull only the first batch (10 rows per file) and show one k-mer column
# together with the labels generated for that batch.
for batch, labels in itertools.islice(
        load_kmer_batches(bacteria_kmer_file1_fp, virus_kmer_file1_fp, 10), 1):
    print(batch['AAAAAAAA'])
    print(labels)

In [None]:
# Attach the labels to a COPY for display only. Writing the column into
# `batch` itself would make the later `batch.shape[1]` count the labels as a
# feature and inflate the model's input dimension by one.
# np.ravel flattens the (n, 1) label array into the 1-D shape a column expects.
labeled_preview = batch.assign(labels=np.ravel(labels))
labeled_preview[['AAAAAAAA', 'labels']]

### 2. Find the Dimensions of the Data

In [None]:
# Count only the k-mer feature columns. A 'labels' column may have been
# attached to `batch` for display; it is not a model input, so it must not be
# included in the feature count used for the network's input dimension.
feature_columns = batch.columns.drop('labels', errors='ignore')
batch_feature_count = len(feature_columns)
batch_sample_count = batch.shape[0]

print('batch feature count : {}'.format(batch_feature_count))
print('batch sample count  : {}'.format(batch_sample_count))

### 4. Build a Model

In [None]:
# Binary classifier: one hidden ReLU layer of 8 units over the k-mer
# features, followed by a single sigmoid output unit.
model = Sequential([
    Dense(8, activation='relu', input_dim=batch_feature_count),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])


### 5. Train the Model

In [None]:
# Tiny two-column frame used to try out sklearn.utils.shuffle below.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
df

In [None]:
# Shuffle the same frame passed twice and print both results for comparison.
sdf1, sdf2 = sklearn.utils.shuffle(df, df)
print(sdf1, sdf2, sep='\n')

In [None]:
# train with shuffled labels as sanity check
model.fit_generator(
    generator=load_kmer_batches_shuffle_labels(bacteria_kmer_file1_fp, virus_kmer_file1_fp, 16),
    steps_per_epoch=10,
    verbose=1,
    workers=2)