In [2]:
import os
import pandas as pd
import numpy as np

import keras
from keras import backend as K
from keras import metrics, activations, initializers, regularizers, constraints
from keras.models import Model, Sequential
from keras.layers import Flatten, Dense, Dropout, Reshape, Permute, Activation, \
    Input, Lambda, Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, ZeroPadding2D

Using TensorFlow backend.


In [3]:
def preprocess(data, base_types):
    new_data = np.zeros((len(data),len(data[0,1])))
    for idx,i in zip(data[:,0],data[:,1]):
        new_data[idx] = [base_types[j]  for j in i]
    return new_data

class Dataset:
    
    def __init__(self, file_path):
        data = pd.read_csv(file_path)
        base_types = {'A':0, 'C':1, 'G':2, 'T':3}
        self._data = preprocess(data.values, base_types)
        if data.values.shape[1] == 3:
            self._labels = data.values[:,2]
        else:
            self._labels = None
            
    @property
    def data(self):
        return self._data
    
    @property
    def labels(self):
        return self._labels
   

In [22]:
data_dir = "."
train_file = os.path.join(data_dir,"train.csv")
test_file = os.path.join(data_dir,"test.csv")
train_data = Dataset(train_file)
test_data = Dataset(test_file)

x_train = train_data.data
y_train = train_data.labels

#model settings
batch_size = 10
num_classes = 1
num_base_types = 4
epochs = 10
cutoff_len = int(0.9 * len(train_data.data))

# inputs dimensions
seq_len, emb_dim = 14, 32

model = Sequential()
model.add(Embedding(input_dim=num_base_types+1, output_dim=emb_dim, input_length=seq_len))
model.add(Conv1D(32, kernel_size=2, activation='relu'))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.rmsprop(), metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          shuffle=True,
          validation_data=(x_train[cutoff_len:], y_train[cutoff_len:]))
score = model.evaluate(x_train[cutoff_len:], y_train[cutoff_len:], verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

predictions = np.zeros((test_data.data.shape[0],2))
test = model.predict(test_data.data, batch_size=batch_size, verbose=1)

for idx,p in enumerate(test):
    predictions[idx,0] = idx
    if p > 0.5:
        predictions[idx,1] = 1
    else:
        predictions[idx,1] = 0 

with open("submission.csv","wb") as f:
        np.savetxt(f, predictions, fmt='%d',delimiter=',',header="id,prediction")

Train on 2000 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.438507359028
Test accuracy: 0.84
