In [None]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.regularizers import l2
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.metrics import categorical_accuracy
from keras import backend as K
from sklearn.model_selection import KFold
import tensorflow as tf

In [None]:
'''
various helper functions
'''

# Fixed-size Ordinally Forgetting Encoding
def encode_FOFE(onehot, alpha, maxlen):
    """Bidirectional Fixed-size Ordinally Forgetting Encoding of one sequence.

    For every position ``t`` the forward code is ``z_t = alpha * z_{t-1} + e_t``
    (running left to right) and the backward code is
    ``z_t = alpha * z_{t+1} + e_t`` (running right to left), where ``e_t`` is
    row ``t`` of ``onehot``.

    Args:
        onehot: 2-D array-like with at least ``maxlen`` rows; each row is the
            per-position feature vector. The row width is taken from the
            input, so any width works (the notebook passes 22 columns).
        alpha: forgetting factor applied to the running code at every step.
        maxlen: number of leading positions to encode.

    Returns:
        ``np.ndarray`` of shape ``(maxlen, 2 * width)``. Columns ``[:width]``
        hold the forward codes, columns ``[width:]`` the backward codes.
    """
    onehot = np.asarray(onehot)
    width = onehot.shape[1]

    enc = np.zeros((maxlen, 2 * width))
    # Basic slices are views, so writing to them fills `enc` in place.
    forward, backward = enc[:, :width], enc[:, width:]

    forward[0] = onehot[0]
    for t in range(1, maxlen):
        forward[t] = forward[t - 1] * alpha + onehot[t]

    backward[maxlen - 1] = onehot[maxlen - 1]
    for t in range(maxlen - 2, -1, -1):
        backward[t] = backward[t + 1] * alpha + onehot[t]

    return enc

# The custom accuracy metric used for this task
def accuracy(y_true, y_pred):
    """Per-position accuracy that skips padded ('NoSeq') positions.

    Args:
        y_true: one-hot target tensor; classes on the last axis.
        y_pred: predicted class scores with the same layout as ``y_true``.

    Returns:
        A 1-D float tensor with one entry per non-padding position: 1.0 where
        the predicted class equals the target class, 0.0 otherwise. Keras
        averages this tensor to report the metric.
    """
    y = tf.argmax(y_true, axis=-1)
    y_ = tf.argmax(y_pred, axis=-1)
    # The padding class is the LAST one-hot column: `q8_list` ends with
    # 'NoSeq', and the team-prediction encoding in this notebook also uses
    # index 8 for padding. Masking `y > 0` instead would drop real class-0
    # ('L') residues and score the padding.
    # NOTE(review): assumes the target columns follow the `q8_list` order —
    # confirm against the dataset description.
    pad_class = tf.cast(tf.shape(y_true)[-1] - 1, y.dtype)
    mask = tf.not_equal(y, pad_class)
    return K.cast(K.equal(tf.boolean_mask(y, mask), tf.boolean_mask(y_, mask)), K.floatx())

def to_seq(y):
    """Decode per-position class scores into label strings.

    For every sample in ``y`` the highest-scoring class of each position is
    looked up in the global ``q8_list`` and the resulting labels are
    concatenated into one string.

    Args:
        y: nested sequence/array indexed as ``y[sample][position]`` where each
            entry is a vector of class scores.

    Returns:
        List with one decoded string per sample.
    """
    decoded = []
    for sample in y:
        labels = [q8_list[np.argmax(scores)] for scores in sample]
        decoded.append(''.join(labels))
    return decoded

In [None]:
# Label alphabets used throughout the notebook; 'NoSeq' is the last entry of
# both lists.
residue_list = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X','NoSeq']
q8_list = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T','NoSeq']

# Locations of the three .npy dataset dumps (relative to the notebook).
cb513filename = '../data/cb513.npy'
cb6133filename = '../data/cb6133.npy'
cb6133filteredfilename = '../data/cb6133filtered.npy'

# Load every dataset in one pass, keeping the original variable names.
cb513, cb6133, cb6133filtered = map(
    np.load, (cb513filename, cb6133filename, cb6133filteredfilename)
)

# Quick sanity check of what was loaded.
for dataset in (cb513, cb6133, cb6133filtered):
    print(dataset.shape)

In [None]:
num_teams = 4


def _onehot_team_preds(rows, n_teams=num_teams, seq_len=700):
    """One-hot encode the stacked team predictions of every sequence.

    Args:
        rows: iterable of raw text lines. Each line is split on ',' and the
            first field (the id) is discarded; every remaining field is read
            as one sequence position whose ``k``-th character is the label
            predicted by team ``k``.
            NOTE(review): this layout is inferred from how the fields are
            indexed here — confirm against the meta-feature files.
        n_teams: number of per-position predictions (characters) to read.
        seq_len: fixed output length; shorter sequences are padded and longer
            ones truncated.

    Returns:
        Array of shape ``(len(rows), seq_len, n_teams * len(q8_list))``. Team
        ``k`` owns the columns ``[k * len(q8_list), (k + 1) * len(q8_list))``;
        padded positions get the last class of each team's block set to 1.
    """
    n_classes = len(q8_list)
    # Column of the last class ('NoSeq' in q8_list) inside each team's block.
    pad_cols = np.arange(n_teams) * n_classes + (n_classes - 1)

    encoded = np.zeros((len(rows), seq_len, n_teams * n_classes))
    for i, row in enumerate(rows):
        preds = row.split(',')[1:]
        n_valid = min(len(preds), seq_len)
        for j in range(n_valid):
            for k in range(n_teams):
                encoded[i, j, k * n_classes + q8_list.index(preds[j][k])] = 1
        # Everything past the predicted positions is padding for all teams.
        tail = encoded[i, n_valid:]  # view into `encoded`
        tail[:, pad_cols] = 1
    return encoded


# `read_table` splits on tabs, so every comma-separated line is kept as one
# string in a single column named after the header line, 'id,expected'.
teams_fold_preds_df = pd.read_table('meta-features/ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_fold_preds = teams_fold_preds_df['id,expected']

teams_cb6133_preds_df = pd.read_table('meta-features/cb6133test_ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_cb6133_preds = teams_cb6133_preds_df['id,expected']

teams_cb513_preds_df = pd.read_table('meta-features/cb513test_ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_cb513_preds = teams_cb513_preds_df['id,expected']

train_input_teams = _onehot_team_preds(teams_fold_preds)
cb6133_input_teams = _onehot_team_preds(teams_cb6133_preds)

print(train_input_teams.shape)
print(cb6133_input_teams.shape)

In [None]:
maxlen_seq = 700 # maximum sequence length
alpha = 0.5 # parameter for long range encoding


def _select_base_features(seqs):
    """Copy the 46 feature columns used by the model out of the 57 raw ones.

    Raw columns 0-21, 31-32 and 35-56 are kept, in that order.
    """
    selected = np.zeros((len(seqs), 700, 46))
    selected[:, :, :22] = seqs[:, :, :22]
    selected[:, :, 22:24] = seqs[:, :, 31:33]
    selected[:, :, 24:] = seqs[:, :, 35:]
    return selected


def _fofe_batch(onehot_batch):
    """Apply `encode_FOFE` to every sequence of a batch and stack the results."""
    return np.array([encode_FOFE(seq, alpha, maxlen_seq) for seq in onehot_batch])


# View the flat dataset as (protein, position, feature).
input_seqs = cb6133.reshape(6133, 700, 57)

# --- training inputs: base features + FOFE codes + team predictions ---
train_input_seqs = input_seqs[0:5600]
train_input_data = _select_base_features(train_input_seqs)
train_input_onehot = train_input_data[:, :, 0:22]
train_input_fofe = _fofe_batch(train_input_onehot)
train_input_data = np.concatenate(
    (train_input_data, train_input_fofe, train_input_teams), axis=2
)

# --- held-out inputs, assembled the same way ---
test_input_seqs = input_seqs[5605:5877]
test_input_data = _select_base_features(test_input_seqs)
test_input_onehot = test_input_data[:, :, 0:22]
test_input_fofe = _fofe_batch(test_input_onehot)
test_input_data = np.concatenate(
    (test_input_data, test_input_fofe, cb6133_input_teams), axis=2
)

# --- targets: raw columns 22-30 (9 columns, same length as q8_list) ---
train_target_data = train_input_seqs[:, :, 22:31]
test_target_data = test_input_seqs[:, :, 22:31]

# Per-position width of the model input and of the target vector.
n_words = train_input_data.shape[2]
n_tags = train_target_data.shape[2]

print(n_words, n_tags)
print(train_input_data.shape, train_target_data.shape, test_input_data.shape, test_target_data.shape)

In [None]:
def _conv_branch(features, kernel_len):
    """One convolutional branch spanning `kernel_len` positions.

    `features` has shape (maxlen_seq, 128, 1). The first axis is zero-padded by
    `kernel_len // 2` on each side and convolved with a (kernel_len, 128)
    'valid' kernel, so for odd `kernel_len` the output keeps maxlen_seq
    positions: shape (maxlen_seq, 1, 64). Batch normalisation follows.
    """
    padded = ZeroPadding2D((kernel_len // 2, 0), data_format='channels_last')(features)
    branch = Conv2D(filters=64,
                    kernel_size=(kernel_len, 128),
                    data_format='channels_last',
                    strides=(1, 1),
                    padding='valid',
                    dilation_rate=(1, 1),
                    activation='relu',
                    use_bias=True,
                    kernel_initializer='glorot_uniform',
                    bias_initializer='zeros')(padded)
    return BatchNormalization(axis=-1)(branch)


def _bigru(sequence):
    """Bidirectional GRU (32 units per direction) returning the full sequence."""
    return Bidirectional(GRU(32,
                             # Must be the boolean: the string 'True' only
                             # worked because non-empty strings are truthy.
                             return_sequences=True,
                             activation='tanh',
                             recurrent_activation='hard_sigmoid',
                             use_bias=True,
                             kernel_initializer='glorot_uniform',
                             recurrent_initializer='orthogonal',
                             bias_initializer='zeros',
                             dropout=0.0,
                             recurrent_dropout=0.5,
                             implementation=1))(sequence)


# NOTE: `input` shadows the Python builtin; the name is kept so any other cell
# that refers to it keeps working.
input = Input(shape=(maxlen_seq, n_words,))

# one dense layer to remove sparsity
x = GaussianNoise(.75)(input)
x = Dense(128, activation='relu', use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros')(x)
# Add a trailing channel axis so the 2-D convolutions can be applied.
x = Reshape([maxlen_seq, 128, 1])(x)

# Three convolutional branches with kernel sizes 3, 7 and 11
conv1 = _conv_branch(x, 3)
conv2 = _conv_branch(x, 7)
conv3 = _conv_branch(x, 11)
conv = concatenate([conv1, conv2, conv3])
# Drop the singleton axis left by the full-width kernels.
conv = Reshape([maxlen_seq, 3*64])(conv)
conv = GaussianNoise(.25)(conv)

# Three stacked bidirectional GRU layers; every level's output is kept and
# concatenated with the convolutional features.
gru1 = _bigru(conv)
gru2 = _bigru(gru1)
gru3 = _bigru(gru2)

comb = concatenate([gru1, gru2, gru3, conv])
comb = GaussianNoise(.25)(comb)


# Two fully-connected layers with dropout
x = TimeDistributed(Dense(256,
                          activation='relu',
                          use_bias=True,
                          kernel_initializer='glorot_uniform',
                          bias_initializer='zeros'))(comb)
x = Dropout(0.5)(x)

x = TimeDistributed(Dense(128,
                          activation='relu',
                          use_bias=True,
                          kernel_initializer='glorot_uniform',
                          bias_initializer='zeros'))(x)
x = Dropout(0.5)(x)

# Output layer: per-position softmax over the n_tags classes
y = TimeDistributed(Dense(n_tags,
                          activation='softmax',
                          use_bias=False,
                          kernel_initializer='glorot_uniform'))(x)

# Defining the model as a whole and printing the summary
model = Model(input, y)
model.summary()

In [None]:
# Report both the built-in accuracy and the custom `accuracy` metric defined
# in this notebook.
model.compile(
    loss="categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy", accuracy],
)

# Train for 10 epochs, evaluating on the held-out arrays after every epoch.
model.fit(
    x=train_input_data,
    y=train_target_data,
    validation_data=(test_input_data, test_target_data),
    epochs=10,
    batch_size=64,
    verbose=1,
)

In [None]:
# Predict class scores for the held-out inputs and decode them to label strings.
y_ = model.predict(test_input_data)
q8_structures = to_seq(y_)

path = 'predictions/cb6133test_stacker_model5_4.csv'
# Ids are 1-based and derived from the number of predictions, so the export
# stays consistent if the size of the held-out set changes.
file_output = pd.DataFrame(
    {'id': np.arange(1, len(q8_structures) + 1), 'expected': q8_structures},
    columns=['id', 'expected'],
)
file_output.to_csv(path, index=False)