## Neural network trained on k-mers (data loaded with NumPy, model built with Keras)
Steps:
1. load data
2. find dimensions of the data
3. standardize the data?
4. build a model
5. train the model

In [None]:
import gzip
import time

from io import StringIO

import numpy as np
from numpy import random

import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, Dropout, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer

### 1. Load Data

In [None]:
def load_kmers(kmer_fp, *, max_rows=10000, usecols=range(1, 32768)):
    """Load a table of k-mer values into a NumPy array and print the load time.

    The file is parsed with ``np.genfromtxt``: the first line is skipped as a
    header and only the columns listed in ``usecols`` are converted.

    Parameters
    ----------
    kmer_fp : str
        Path of the k-mer table, passed to ``np.genfromtxt`` as ``fname``.
    max_rows : int or None, optional
        Maximum number of data rows to read. Defaults to 10000, the value
        that used to be hard-coded; ``None`` reads every row.
    usecols : sequence of int, optional
        Zero-based indices of the columns to read. Defaults to
        ``range(1, 32768)``, i.e. columns 1 through 32767, which leaves
        column 0 out.

    Returns
    -------
    numpy.ndarray
        The parsed values. This is a plain ndarray (not a pandas object), so
        slice it directly. Note that ``np.genfromtxt`` returns a 1-D array
        when only a single data row is read.
    """
    t0 = time.time()
    kmer_array = np.genfromtxt(
        fname=kmer_fp,
        skip_header=1,
        usecols=usecols,
        max_rows=max_rows)
    print('loaded "{}" in {:5.2f}s'.format(kmer_fp, time.time()-t0))
    return kmer_array

In [None]:
# Read the bacterial k-mer table into a NumPy array.
# Timing note kept from an earlier run: loaded 1000 rows in 46s.
bacteria_kmer_csv_fp = '../data/bact_kmer_file1.fasta.tab.gz'
bacteria_kmer_array = load_kmers(bacteria_kmer_csv_fp)
# Last expression of the cell: the notebook displays the array's shape.
bacteria_kmer_array.shape

In [None]:
# Read the viral k-mer table into a NumPy array, same loader as for bacteria.
virus_kmer_csv_fp = '../data/vir_kmer_file1.fasta.tab.gz'
virus_kmer_array = load_kmers(virus_kmer_csv_fp)
# Last expression of the cell: the notebook displays the array's shape.
virus_kmer_array.shape

### 2. Find the Dimensions of the Data

In [None]:
# Axis 0 of each array is treated as the sample count and axis 1 of the
# bacterial array as the feature count (used as the model's input width).
bacteria_sample_count = bacteria_kmer_array.shape[0]
virus_sample_count = virus_kmer_array.shape[0]
feature_count = bacteria_kmer_array.shape[1]

# Print a small aligned report, one line per quantity.
for report_line, value in (
        ('features               : {}', feature_count),
        ('bacterial sample count : {}', bacteria_sample_count),
        ('viral sample count     : {}', virus_sample_count)):
    print(report_line.format(value))

### 4. Build a Model

In [None]:
# Binary classifier: one hidden ReLU layer of 32 units fed by `feature_count`
# inputs, followed by a single sigmoid output unit.
model = Sequential([
    Dense(32, activation='relu', input_dim=feature_count),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported alongside the loss during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)


### 5. Train the Model

In [None]:
# Training split for the bacteria class: the first 500 loaded rows, labelled 0.
# `bacteria_kmer_array` is the ndarray returned by np.genfromtxt, so it is
# sliced directly; an ndarray has no pandas-style `.values` attribute.
training_bacteria_kmer_array = bacteria_kmer_array[:500, :]
# One label per training row, as a column vector of zeros.
training_bacteria_label_array = np.zeros((training_bacteria_kmer_array.shape[0], 1))

print('training bacteria kmer dim  : {}'.format(training_bacteria_kmer_array.shape))
print('training bacteria label dim : {}'.format(training_bacteria_label_array.shape))

In [None]:
# Validation split for the bacteria class: every loaded row from index 500
# onward, labelled 0. `bacteria_kmer_array` is the ndarray returned by
# np.genfromtxt, so it is sliced directly; an ndarray has no `.values`.
validation_bacteria_kmer_array = bacteria_kmer_array[500:, :]
# One label per validation row, as a column vector of zeros.
validation_bacteria_label_array = np.zeros((validation_bacteria_kmer_array.shape[0], 1))

print('validation bacteria kmer dim  : {}'.format(validation_bacteria_kmer_array.shape))
print('validation bacteria label dim : {}'.format(validation_bacteria_label_array.shape))

In [None]:
# Training split for the virus class: the first 500 loaded rows, labelled 1.
# The slice is `[:500]` so that it does not overlap the validation split
# (rows 500 onward); taking `[500:]` here would train and validate on the
# same rows. `virus_kmer_array` is the ndarray returned by np.genfromtxt, so
# it is sliced directly; an ndarray has no `.values` attribute.
training_virus_kmer_array = virus_kmer_array[:500, :]
# One label per training row, as a column vector of ones.
training_virus_label_array = np.ones((training_virus_kmer_array.shape[0], 1))

print('training virus kmer dim  : {}'.format(training_virus_kmer_array.shape))
print('training virus label dim : {}'.format(training_virus_label_array.shape))

In [None]:
# Validation split for the virus class: every loaded row from index 500
# onward, labelled 1. `virus_kmer_array` is the ndarray returned by
# np.genfromtxt, so it is sliced directly; an ndarray has no `.values`.
validation_virus_kmer_array = virus_kmer_array[500:, :]
# One label per validation row, as a column vector of ones.
validation_virus_label_array = np.ones((validation_virus_kmer_array.shape[0], 1))

print('validation virus kmer dim  : {}'.format(validation_virus_kmer_array.shape))
print('validation virus label dim : {}'.format(validation_virus_label_array.shape))

In [None]:
# Train on the bacteria and virus training rows stacked into one matrix
# (bacteria first, then virus), with the label vectors stacked in the same
# order so each row keeps its label. The validation pair is built the same way.
model.fit(
    x=np.vstack([training_bacteria_kmer_array, training_virus_kmer_array]),
    y=np.vstack([training_bacteria_label_array, training_virus_label_array]),
    batch_size=20,
    epochs=2,
    validation_data=(
        np.vstack([validation_bacteria_kmer_array, validation_virus_kmer_array]),
        np.vstack([validation_bacteria_label_array, validation_virus_label_array]),
    ),
)