In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import keras
from keras import backend as K
from keras import metrics, activations, initializers, regularizers, constraints
from keras.models import Model, Sequential
from keras.layers import Flatten, Dense, Dropout, Reshape, Permute, Activation, \
    Input, Lambda, Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, ZeroPadding2D

Using TensorFlow backend.


In [2]:
def preprocess(data, base_types):
    """Encode sequence strings as numeric codes looked up in ``base_types``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array indexed as ``data[row, col]``. Column 0 holds the integer
        id of each record and column 1 holds its sequence string. Any
        further columns are ignored.
    base_types : mapping
        Maps every character that may occur in a sequence to its code.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(first sequence))``. The codes
        of a record are written to the row selected by its id (column 0),
        not by its position in ``data``. Empty input yields an array of
        shape ``(0, 0)``.

    Raises
    ------
    KeyError
        If a sequence contains a character missing from ``base_types``.
    """
    # Without this guard, ``data[0, 1]`` below fails with an opaque
    # IndexError when the input has no rows (e.g. a header-only CSV).
    if len(data) == 0:
        return np.zeros((0, 0))

    # The width is taken from the first record; all sequences are expected
    # to share that length.
    encoded = np.zeros((len(data), len(data[0, 1])))
    for row_id, sequence in zip(data[:, 0], data[:, 1]):
        # The id doubles as the destination row index.
        encoded[row_id] = [base_types[base] for base in sequence]
    return encoded

class Dataset:
    """Sequence dataset loaded from a CSV file.

    The CSV is read with ``pandas.read_csv``. Column 0 is used as the record
    id, column 1 as the sequence string and, when the file has exactly three
    columns, column 2 as the label.
    """

    def __init__(self, file_path):
        """Read ``file_path`` and build the encoded data and label arrays.

        Parameters
        ----------
        file_path : str
            Path of the CSV file to load.
        """
        values = pd.read_csv(file_path).values
        base_types = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        self._data = preprocess(values, base_types)
        if values.shape[1] == 3:
            # ``preprocess`` stores each sequence in the row given by its id
            # (column 0). Labels must be placed by the same id; taking them
            # in file order would pair features with the wrong label
            # whenever the CSV is not sorted by id.
            ids = values[:, 0].astype(int)
            labels = np.zeros(len(values), dtype=values.dtype)
            labels[ids] = values[:, 2]
            self._labels = labels
        else:
            # No label column (e.g. the test file).
            self._labels = None

    @property
    def data(self):
        """Encoded sequences, one row per record id."""
        return self._data

    @property
    def labels(self):
        """Labels aligned with the rows of ``data``, or ``None`` if absent."""
        return self._labels
   

In [9]:

"""Replace data_dir path with the path of testing and training data location"""
data_dir = "."
train_file = os.path.join(data_dir, "train.csv")
test_file = os.path.join(data_dir, "test.csv")
train_data = Dataset(train_file)
test_data = Dataset(test_file)

# Fail early with a clear message instead of deep inside ``model.fit``.
if train_data.labels is None:
    raise ValueError("training file has no label column: " + train_file)

x_train = train_data.data
# ``Dataset.labels`` comes from a mixed-type frame; give Keras a plain
# numeric array.
y_train = np.asarray(train_data.labels, dtype='float32')

# model settings
batch_size = 10
num_classes = 1
num_base_types = 4
epochs = 10
# First 90% of the rows are used for fitting, the remainder is held out.
cutoff_len = int(0.9 * len(train_data.data))

# inputs dimensions
seq_len, emb_dim = 14, 32

model = Sequential()
model.add(Embedding(input_dim=num_base_types+1, output_dim=emb_dim, input_length=seq_len))
model.add(Conv1D(32, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, 2, activation='relu'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='sigmoid'))

# ``RMSprop`` is the optimizer class; the lowercase ``rmsprop`` name is only
# an alias that is not available in every Keras release.
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.RMSprop(), metrics=['accuracy'])

# ``summary()`` prints by itself and returns None, so it is not wrapped in
# ``print`` (which would emit a stray "None").
model.summary()

# Hold the last 10% of the rows out of training so that the score reported
# below is measured on data the model has not been fitted on.
x_fit, y_fit = x_train[:cutoff_len], y_train[:cutoff_len]
x_val, y_val = x_train[cutoff_len:], y_train[cutoff_len:]

history = model.fit(x_fit, y_fit,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    shuffle=True,
                    validation_data=(x_val, y_val))
score = model.evaluate(x_val, y_val, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

test = model.predict(test_data.data, batch_size=batch_size, verbose=1)

# Column 0: row index used as the submission id; column 1: class obtained by
# thresholding the sigmoid output at 0.5.
predicted_class = (test[:, 0] > 0.5).astype(int)
predictions = np.column_stack((np.arange(len(predicted_class)), predicted_class))

# ``comments=''`` stops numpy from prefixing the header with "# ", which
# would otherwise turn the first line into "# id,prediction".
np.savetxt("submission.csv", predictions, fmt='%d', delimiter=',',
           header="id,prediction", comments='')

# Optional training curves (kept disabled). The 'val_*' entries exist because
# validation_data is passed to fit above.
# NOTE(review): the accuracy key is 'acc' or 'accuracy' depending on the
# Keras version - confirm before enabling.
#x = range(1,epochs+1)
#plt.subplot(2, 1, 1)
#plt.xlabel('Epochs')
#plt.ylabel('Loss')
#plt.plot(x,history.history['loss'])
#plt.plot(x,history.history['val_loss'])
#plt.legend(('Training', 'Validation'), loc='upper right')
#plt.title('Binary cross entropy')
#plt.subplot(2, 1, 2)
#plt.xlabel('Epochs')
#plt.ylabel('Accuracy')
#plt.plot(x,history.history['acc'])
#plt.plot(x,history.history['val_acc'])
#plt.legend(('Training', 'Validation'), loc='upper right')
#plt.title('Binary accuracy')

#plt.tight_layout()         
#plt.show()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 14, 32)            160       
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 13, 32)            2080      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 6, 32)             0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 4, 64)             6208      
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 2, 64)             0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 1, 128)            16512     
_________________________________________________________________
dropout_11 (Dropout)         (None, 1, 128)            0         
__________

"        \nx = range(1,epochs+1)\nplt.subplot(2, 1, 1)\nplt.xlabel('Epochs')\nplt.ylabel('Loss')\nplt.plot(x,history.history['loss'])\nplt.plot(x,history.history['val_loss'])\nplt.legend(('Training', 'Validation'), loc='upper right')\nplt.title('Binary cross entropy')\nplt.subplot(2, 1, 2)\nplt.xlabel('Epochs')\nplt.ylabel('Accuracy')\nplt.plot(x,history.history['acc'])\nplt.plot(x,history.history['val_acc'])\nplt.legend(('Training', 'Validation'), loc='upper right')\nplt.title('Binary accuracy')\n\nplt.tight_layout()         \nplt.show()\n"