In [None]:
# Auth for GDrive
# Authenticates the Colab user and builds a Drive v3 API client. `drive_service`
# is what the download cell uses to fetch the dataset files.
from google.colab import drive  # NOTE(review): `drive` is not referenced in the visible notebook
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [None]:
# Install Tensorboard
!pip install -q tensorboardcolab
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback

In [None]:
# Download files from GDrive
import io

import numpy as np
from googleapiclient.http import MediaIoBaseDownload

# Drive file id of each dataset, keyed by the name it is stored under in `cb_data`.
file_ids = {
    'cb6133filtered':'1yYe9xfC9g8lYEsmDwC7_fP7l3CWQBKNp',
    'cb6133':'1SS6hbyAXJV3oBMSrDxnaHa_dSV7Ijq7M',
    'cb513':'1liYeocK2OQTn0mnBgNzl1dsTANHpP-pp'
}

# Loaded arrays, keyed like `file_ids`.
cb_data = {}

for file_name, file_id in file_ids.items():
    # Stream the Drive file chunk by chunk into an in-memory buffer.
    request = drive_service.files().get_media(fileId=file_id)
    downloaded = io.BytesIO()
    downloader = MediaIoBaseDownload(downloaded, request)

    done = False
    while True:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
        if done is not False:
            break

    # Rewind so np.load reads the buffer from its first byte.
    downloaded.seek(0)
    cb_data[file_name] = np.load(downloaded)

In [None]:
# for uploading meta-features
from google.colab import files

# `data` is keyed by uploaded file name; the length of each value is reported below.
data = files.upload()
for fn, content in data.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content))
    print(message)

In [None]:
# Pull the three datasets out of the download dictionary.
cb513 = cb_data['cb513']
cb6133 = cb_data['cb6133']
cb6133filtered = cb_data['cb6133filtered']

# Show the shape of each dataset, in the order they were unpacked above.
for dataset in (cb513, cb6133, cb6133filtered):
    print(dataset.shape)

# Symbol tables that get_data indexes with the argmax of the one-hot fields.
residue_list = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X','NoSeq']
q8_list = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T','NoSeq']

r = 700 # protein residues padded to 700
f = 57 # number of features for each residue

In [None]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Concatenate, BatchNormalization
from keras.layers import Bidirectional, Activation, Dropout, CuDNNGRU, CuDNNLSTM, Conv1D, GRU
from sklearn.model_selection import train_test_split
from keras.metrics import categorical_accuracy
from keras import backend as K
from keras.regularizers import l1, l2
import tensorflow as tf

# Abort early unless TensorFlow reports a GPU under the expected device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
# Last expression of the cell, so its value is rendered as the cell output.
K.tensorflow_backend._get_available_gpus()

In [None]:
def get_data(i, arr):
    """Decode row ``i`` of ``arr`` into a sequence, its Q8 labels and its profiles.

    The row is read as ``r`` consecutive residue records of ``f`` values each
    (``r``, ``f``, ``residue_list`` and ``q8_list`` are notebook-level globals).
    Within one record the following slices are used:

    * ``[0, 22)``  -- one-hot residue, decoded with ``residue_list``
    * ``[22, 31)`` -- one-hot Q8 label, decoded with ``q8_list``
    * ``[31, 35)`` -- labelled ``nc_terminals`` / ``sa`` in the original code;
      not used
    * ``[35, 57)`` -- profile features, kept unchanged

    Decoding stops at the first residue that decodes to ``'NoSeq'``.

    Args:
        i: Row index into ``arr``.
        arr: Array indexable as ``arr[i, start:stop]``.

    Returns:
        ``(id, length, seq, profiles, q8)`` where ``id`` is ``str(i + 1)``,
        ``length`` is ``str(len(seq))``, ``seq`` and ``q8`` are strings and
        ``profiles`` is ``np.array`` of the collected 22-wide profile slices.
        Returns ``None`` (after printing a message) when ``seq`` and ``q8`` end
        up with different lengths.
    """
    seq, q8, profiles = '', '', []

    for j in range(r):
        jf = j * f  # offset of residue j inside the flattened row

        residue = residue_list[np.argmax(arr[i, jf:jf + 22])]
        if residue == 'NoSeq':  # terminating sequence symbol
            break

        seq += residue
        q8 += q8_list[np.argmax(arr[i, jf + 22:jf + 31])]
        profiles.append(arr[i, jf + 35:jf + 57])

    # q8 can only differ in length from seq when a Q8 field decodes to the
    # multi-character 'NoSeq' symbol while the residue itself is valid.
    if len(seq) != len(q8):
        print('length mismatch', str(len(seq)), str(len(q8)))
        return None

    return (str(i+1), str(len(seq)), seq, np.array(profiles), q8)

# The custom accuracy metric used for this task
def accuracy(y_true, y_pred):
    """Element-wise correctness of predictions at positions whose true class is not 0.

    Both tensors are reduced with argmax over their last axis. Positions where
    the true class index is 0 are removed with a boolean mask before comparing.

    Returns:
        A tensor of match indicators cast to ``K.floatx()``, one entry per kept
        position. It is not reduced to a scalar inside this function.
    """
    true_labels = tf.argmax(y_true, axis=-1)
    pred_labels = tf.argmax(y_pred, axis=-1)
    keep = tf.greater(true_labels, 0)
    kept_true = tf.boolean_mask(true_labels, keep)
    kept_pred = tf.boolean_mask(pred_labels, keep)
    return K.cast(K.equal(kept_true, kept_pred), K.floatx())

# eager execution accuracy
def ex_accuracy(y_true, y_pred):
    """NumPy version of the masked accuracy: fraction of correct non-zero-class positions.

    Args:
        y_true: One-hot (or score) array; the class axis is the last one.
        y_pred: Array of the same shape holding predicted scores.

    Returns:
        Fraction of positions with a true class index greater than 0 whose
        predicted class index matches, as a Python float. Returns ``0.0`` when
        no position has a true class index greater than 0 (previously this
        raised ``ZeroDivisionError``).
    """
    true_labels = np.argmax(y_true, axis=-1)
    pred_labels = np.argmax(y_pred, axis=-1)
    keep = np.greater(true_labels, 0)

    # Nothing to score: avoid dividing by zero on an all-padding batch.
    if not np.any(keep):
        return 0.0

    matches = np.equal(true_labels[keep], pred_labels[keep])
    return float(np.mean(matches))

# Decodes a one-hot (or per-class score) sequence back into a string
def onehot_to_seq(oh_seq, index):
    """Turn a sequence of class-score vectors into a string of symbols.

    Each vector is reduced with argmax and looked up in ``index``. Decoding
    stops at the first vector whose argmax is 0.

    Args:
        oh_seq: Iterable of per-position score vectors.
        index: Mapping from class id to its string symbol.

    Returns:
        The concatenated symbols of the positions before the first class-0 one.
    """
    symbols = []
    for scores in oh_seq:
        label = np.argmax(scores)
        if label == 0:
            break
        symbols.append(index[label])
    return ''.join(symbols)

def seq2onehot(seq, n, maxlen=None):
    """One-hot encode a 2-D array of integer class ids.

    Args:
        seq: 2-D integer array, one row per sequence, with at least ``maxlen``
            columns.
        n: Number of classes (size of the last output axis).
        maxlen: Number of leading positions to encode per row. Defaults to the
            notebook-level ``maxlen_seq``, which is looked up at call time.

    Returns:
        Float array of shape ``(len(seq), maxlen, n)`` with a single 1 per
        position, at the index given by ``seq``.

    Raises:
        IndexError: If ``seq`` has fewer than ``maxlen`` columns or holds an id
            outside ``[-n, n)``.
    """
    if maxlen is None:
        maxlen = maxlen_seq

    out = np.zeros((len(seq), maxlen, n))
    if len(seq) == 0:
        return out

    seq = np.asarray(seq)
    if seq.shape[1] < maxlen:
        # Explicit check: a single-column input would otherwise broadcast silently.
        raise IndexError('seq has %d columns, expected at least %d' % (seq.shape[1], maxlen))

    # One vectorised assignment instead of a Python loop over every position.
    rows = np.arange(len(seq))[:, None]
    cols = np.arange(maxlen)[None, :]
    out[rows, cols, seq[:, :maxlen]] = 1
    return out

# prints the results
def print_results(x, y_, revsere_decoder_index):
    """Print the upper-cased decoding of ``y_``.

    Args:
        x: Accepted for call compatibility; not used.
        y_: Sequence of per-position score vectors passed to ``onehot_to_seq``.
        revsere_decoder_index: Mapping from class id to symbol.
    """
    decoded = onehot_to_seq(y_, revsere_decoder_index)
    print(str(decoded.upper()))

# Computes and returns the n-grams of a particular sequence, defaults to unigrams (n = 1)
def seq2ngrams(seqs, n = 1):
    """Split every sequence into its overlapping n-grams.

    For each sequence one n-gram is produced per start position, so the last
    ``n - 1`` entries are shorter than ``n``.

    Args:
        seqs: Iterable of sliceable sequences (e.g. strings).
        n: Width of each n-gram.

    Returns:
        ``np.array`` of the per-sequence n-gram lists when all sequences have
        the same length. Otherwise a 1-D object array whose elements are the
        Python lists of n-grams; newer NumPy versions refuse to build a ragged
        array implicitly, so it is constructed explicitly here.
    """
    grams = [[seq[i : i + n] for i in range(len(seq))] for seq in seqs]

    # Rectangular (or empty) input: same result as the plain np.array call.
    if len({len(g) for g in grams}) <= 1:
        return np.array(grams)

    ragged = np.empty(len(grams), dtype=object)
    for row, g in enumerate(grams):
        ragged[row] = g
    return ragged

# Row-index ranges into cb6133 for the training / test / validation parts.
training_idx = range(5600)
test_idx = range(5605,5877)
validation_idx = range(5877,6133)

# Index ranges for the alternative cb6133filtered / cb513 split; they are only
# referenced by the commented-out lines below.
training_idx2 = range(5534)
test_idx2 = range(514)

# Decode every selected row with get_data (one tuple per protein).
training_data = [get_data(i, cb6133) for i in training_idx]
test_data = [get_data(i, cb6133) for i in test_idx]
# training_data = [get_data(i, cb6133filtered) for i in training_idx2]
# test_data = [get_data(i, cb513) for i in test_idx2]
# NOTE(review): validation_data is never built here, so validation_idx is unused.
# validation_data = [get_data(i, cb6133) for i in validation_idx]

In [None]:
# Column labels, in the order of the tuple returned by get_data.
columns = ["id", "len", "input", "profiles", "expected"]

training_data_df = pd.DataFrame(training_data)
test_data_df = pd.DataFrame(test_data)
# NOTE(review): the validation frame is built from test_data, not from a
# separate validation set.
validation_data_df = pd.DataFrame(test_data)

for frame in (training_data_df, test_data_df, validation_data_df):
    frame.columns = columns

In [None]:
# Short aliases used by the following cells (same objects, no copies).
train_df, test_df, val_df = training_data_df, test_data_df, validation_data_df
# Cell output: number of rows in each split.
len(train_df), len(test_df), len(val_df)

In [None]:
maxlen_seq = 700


def _load_split(df):
    """Return ``(input_seqs, target_seqs, input_grams)`` for rows of ``df`` with len <= maxlen_seq."""
    keep = df.len.astype(int) <= maxlen_seq
    input_seqs, target_seqs = df[['input', 'expected']][keep].values.T
    return input_seqs, target_seqs, seq2ngrams(input_seqs)


def _tokenize_and_pad(tokenizer, texts):
    """Convert ``texts`` to integer ids with ``tokenizer`` and post-pad every row to maxlen_seq."""
    ids = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(ids, maxlen = maxlen_seq, padding = 'post')


# Load train / test / val inputs
train_input_seqs, train_target_seqs, train_input_grams = _load_split(train_df)
test_input_seqs, test_target_seqs, test_input_grams = _load_split(test_df)
val_input_seqs, val_target_seqs, val_input_grams = _load_split(val_df)

# Initializing and defining the tokenizer encoders and decoders based on the train set
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(train_input_grams)
tokenizer_decoder = Tokenizer(char_level = True)
tokenizer_decoder.fit_on_texts(train_target_seqs)

# Tokenize inputs (encoder) and targets (decoder) of every split
train_input_data = _tokenize_and_pad(tokenizer_encoder, train_input_grams)
train_target_data = _tokenize_and_pad(tokenizer_decoder, train_target_seqs)

val_input_data = _tokenize_and_pad(tokenizer_encoder, val_input_grams)
val_target_data = _tokenize_and_pad(tokenizer_decoder, val_target_seqs)

test_input_data = _tokenize_and_pad(tokenizer_encoder, test_input_grams)
test_target_data = _tokenize_and_pad(tokenizer_decoder, test_target_seqs)

# Resolving dimension issue in target data: one-hot encode all targets in a
# single call so every split gets the same number of classes, then cut the
# result back into the splits using their actual sizes (not hard-coded row
# counts, which silently mis-assign rows when the split sizes change).
n_train, n_val = len(train_target_data), len(val_target_data)
targets = to_categorical(np.r_[train_target_data, val_target_data, test_target_data])
train_target_data = targets[:n_train]
val_target_data = targets[n_train:n_train + n_val]
test_target_data = targets[n_train + n_val:]

print(train_target_data.shape, val_target_data.shape, test_target_data.shape)

# Computing the number of words and number of tags to be passed as parameters to the keras model
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1

print(n_words, n_tags)

In [None]:
# For every split: keep the integer-encoded inputs under *_alt, replace
# *_input_data with its one-hot encoding, and pull the per-protein profile
# arrays out of the DataFrame.
# NOTE(review): this cell overwrites train/test/val_input_data, so running it a
# second time without re-running the tokenization cell passes the already
# one-hot-encoded arrays to seq2onehot.
train_input_data_alt = train_input_data
train_input_data = seq2onehot(train_input_data, n_words)
train_profiles = train_df.profiles.values

test_input_data_alt = test_input_data
test_input_data = seq2onehot(test_input_data, n_words)
test_profiles = test_df.profiles.values

val_input_data_alt = val_input_data
val_input_data = seq2onehot(val_input_data, n_words)
val_profiles = val_df.profiles.values

print(train_input_data_alt.shape, train_input_data.shape, train_profiles.shape)

In [None]:
def _pad_profiles(profiles, maxlen, n_features = 22):
    """Stack variable-length profile matrices into one zero-padded 3-D array.

    Args:
        profiles: Sequence of 2-D arrays, one per protein, each with at most
            ``maxlen`` rows and at most ``n_features`` columns.
        maxlen: Size of the padded position axis.
        n_features: Size of the feature axis.

    Returns:
        Float array of shape ``(len(profiles), maxlen, n_features)``; entries
        not covered by a profile stay 0.
    """
    padded = np.zeros((len(profiles), maxlen, n_features))
    for i, profile in enumerate(profiles):
        if profile.shape[0] == 0:
            continue  # nothing to copy (and an empty array may not be 2-D)
        # Slice assignment replaces the element-by-element Python loops.
        padded[i, :profile.shape[0], :profile.shape[1]] = profile
    return padded


train_profiles_np = _pad_profiles(train_profiles, maxlen_seq)
test_profiles_np = _pad_profiles(test_profiles, maxlen_seq)
val_profiles_np = _pad_profiles(val_profiles, maxlen_seq)

print(train_profiles_np.shape, test_profiles_np.shape, val_profiles_np.shape)

In [None]:
num_teams = 4


def _encode_team_preds(rows, n_teams, maxlen = 700):
    """One-hot encode the per-residue Q8 predictions of several teams.

    Each row is a comma-separated string. The first field is dropped; every
    remaining field holds one character per team for one residue
    (``field[k]`` is looked up in ``q8_list`` for team ``k``).

    Args:
        rows: Iterable of such strings with a ``len()``.
        n_teams: Number of characters read from every field.
        maxlen: Number of positions in the output; longer rows are truncated,
            shorter ones are padded with the ``'NoSeq'`` class for every team.

    Returns:
        Float array of shape ``(len(rows), maxlen, n_teams * len(q8_list))``
        where team ``k`` occupies the ``k``-th block of ``len(q8_list)`` columns.
    """
    n_classes = len(q8_list)
    pad_columns = np.arange(n_teams) * n_classes + q8_list.index('NoSeq')

    encoded = np.zeros((len(rows), maxlen, n_teams * n_classes))
    for i, row in enumerate(rows):
        preds = row.split(',')[1:]

        # Positions past the end of the prediction get the padding class.
        padding_view = encoded[i, len(preds):]
        padding_view[:, pad_columns] = 1

        for j, residue_preds in enumerate(preds[:maxlen]):
            for k in range(n_teams):
                onehot_idx = k * n_classes + q8_list.index(residue_preds[k])
                encoded[i, j, onehot_idx] = 1
    return encoded


# The files are read with pd.read_table, so each CSV line lands in a single
# column named after the header line 'id,expected'.
teams_fold_preds_df = pd.read_table('ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_fold_preds = teams_fold_preds_df['id,expected']

teams_cb6133_preds_df = pd.read_table('cb6133test_ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_cb6133_preds = teams_cb6133_preds_df['id,expected']

# NOTE(review): the cb513 predictions are loaded but not encoded in this cell.
teams_cb513_preds_df = pd.read_table('cb513test_ln2401_kat2193-ddl2133_ks3311-ps2958_jw3468-yw3169_yh3050.csv')
teams_cb513_preds = teams_cb513_preds_df['id,expected']

train_input_teams = _encode_team_preds(teams_fold_preds, num_teams, maxlen_seq)
cb6133_input_teams = _encode_team_preds(teams_cb6133_preds, num_teams, maxlen_seq)

print(train_input_teams.shape)
print(cb6133_input_teams.shape)

In [None]:
# One-hot encoded sequences and their one-hot targets.
X_train, y_train = train_input_data, train_target_data
X_val, y_val = val_input_data, val_target_data
# Integer-encoded sequences (input of the Embedding layer).
X_train_alt = train_input_data_alt
X_val_alt = val_input_data_alt
# Zero-padded profile features.
X_train_profile = train_profiles_np
X_val_profile = val_profiles_np
# Team predictions; validation and test share the same cb6133 array.
X_train_teams = train_input_teams
X_val_teams = X_test_teams = cb6133_input_teams

In [None]:
def conv_block(x, activation=True, batch_norm=True, drop_out=True, res=True):
    """Apply a 64-filter, width-11 'same' Conv1D followed by optional post-processing.

    Args:
        x: Input tensor.
        activation: If true, apply a time-distributed ReLU.
        batch_norm: If true, apply a time-distributed BatchNormalization.
        drop_out: If true, apply a time-distributed Dropout(0.5).
        res: If true, return the input concatenated with the block output on
            the last axis; otherwise return only the block output.

    Returns:
        The resulting tensor.
    """
    features = Conv1D(64, 11, padding="same")(x)

    # Optional stages, applied in this fixed order when enabled.
    stages = (
        (activation, lambda: Activation("relu")),
        (batch_norm, lambda: BatchNormalization()),
        (drop_out, lambda: Dropout(0.5)),
    )
    for enabled, make_layer in stages:
        if enabled:
            features = TimeDistributed(make_layer())(features)

    if not res:
        return features
    return Concatenate(axis=-1)([x, features])

def super_conv_block(x):
    """Run three parallel conv branches on ``x`` and concatenate them with ``x``.

    The branches use (filters, kernel width) of (32, 3), (64, 7) and (128, 11).
    Each is followed by time-distributed ReLU, BatchNormalization and
    Dropout(0.5). The output is ``[x, c3, c7, c11]`` concatenated on the last
    axis.
    """
    branches = [x]
    for filters, kernel_size in ((32, 3), (64, 7), (128, 11)):
        branch = Conv1D(filters, kernel_size, padding="same")(x)
        branch = TimeDistributed(Activation("relu"))(branch)
        branch = TimeDistributed(BatchNormalization())(branch)
        branch = TimeDistributed(Dropout(0.5))(branch)
        branches.append(branch)

    return Concatenate(axis=-1)(branches)

def CNN_BIGRU():
    """Build the four-input CNN + bidirectional GRU tagging model.

    Inputs (all of length ``maxlen_seq``): the one-hot sequence, the same
    sequence as integer ids (needed by the Embedding layer), the 22-wide
    profiles and the team predictions. The integer ids are embedded into 64
    dimensions and everything is concatenated on the last axis. The result goes
    through ``super_conv_block``, three ``conv_block`` calls, a bidirectional
    CuDNNGRU and time-distributed dense layers ending in a softmax over
    ``n_tags`` classes.

    Returns:
        The uncompiled Keras ``Model``.
    """
    # NOTE(review): the team input width is num_teams * n_tags, while the team
    # arrays are built with 9 columns per team -- confirm n_tags == 9.
    inp = Input(shape = (maxlen_seq, n_words))
    inp_alt = Input(shape=(maxlen_seq,))
    inp_profiles = Input(shape=(maxlen_seq, 22))
    inp_teams = Input(shape=(maxlen_seq, num_teams * n_tags))
    model_inputs = [inp, inp_alt, inp_profiles, inp_teams]

    # Concatenate embedded and unembedded input
    embedded = Embedding(input_dim = n_words, output_dim = 64, input_length = maxlen_seq)(inp_alt)
    x = Concatenate(axis=-1)([inp, embedded, inp_profiles, inp_teams])

    x = super_conv_block(x)
    for _ in range(3):
        x = conv_block(x)

    x = Bidirectional(CuDNNGRU(units = 256, return_sequences = True, recurrent_regularizer=l2(0.2)))(x)
    for width in (128, 64):
        x = TimeDistributed(Dense(width, activation = "relu"))(x)
    y = TimeDistributed(Dense(n_tags, activation = "softmax"))(x)

    return Model(model_inputs, y)

# Build the network and compile it with two metrics: the built-in "accuracy"
# and the custom masked `accuracy` function defined earlier in this notebook.
model = CNN_BIGRU()

model.compile(optimizer = "Nadam", loss = "categorical_crossentropy", metrics = ["accuracy", accuracy])
model.summary()

In [None]:
USE_TENSORBOARD = True # Visualize training with Tensorboard

# Extra keyword arguments for model.fit; the callback is added only on request.
props = dict(verbose=1)
if USE_TENSORBOARD:
    tbc = TensorBoardColab()
    props['callbacks'] = [TensorBoardColabCallback(tbc)]

# Input order must match the order of the Model inputs.
train_inputs = [X_train, X_train_alt, X_train_profile, X_train_teams]
val_inputs = [X_val, X_val_alt, X_val_profile, X_val_teams]

history = model.fit(
    train_inputs,
    y_train,
    batch_size = 128,
    epochs = 100,
    validation_data = (val_inputs, y_val),
    **props
)

In [None]:
# Print out graph of val acc if history is saved
import matplotlib.pyplot as plt

# Values stored by model.fit under the 'val_accuracy' history key.
val_accuracy_curve = history.history['val_accuracy']
plt.plot(val_accuracy_curve)
plt.show()

In [None]:
# Save models
from google.colab import files

json_path = "model.json"
weights_path = "model.h5"

# serialize model to JSON
model_json = model.to_json()
with open(json_path, "w") as json_file:
    json_file.write(model_json)
files.download(json_path)

# serialize weights to HDF5
model.save_weights(weights_path)
files.download(weights_path)

print("Saved model to disk")

In [None]:
# Inverse vocabularies: integer id -> token.
revsere_decoder_index = {value:key for key,value in tokenizer_decoder.word_index.items()}
revsere_encoder_index = {value:key for key,value in tokenizer_encoder.word_index.items()}

# Input order must match the order of the Model inputs.
y_test_pred = model.predict([test_input_data, test_input_data_alt, test_profiles_np, X_test_teams])
y_test_true = test_target_data

print(ex_accuracy(y_test_true, y_test_pred))


def decode_results(x, y_, revsere_decoder_index):
    """Return the upper-cased decoding of ``y_``.

    Args:
        x: Accepted for call compatibility; not used.
        y_: Sequence of per-position score vectors passed to ``onehot_to_seq``.
        revsere_decoder_index: Mapping from class id to symbol.
    """
    return str(onehot_to_seq(y_, revsere_decoder_index).upper())


# One decoded prediction string per test protein.
decoded_y_pred = [
    decode_results(input_seq, pred, revsere_decoder_index)
    for input_seq, pred in zip(test_input_seqs, y_test_pred)
]

# Build the submission frame in one step; column order is id, expected.
out_df = pd.DataFrame({"id": test_df.id.values, "expected": decoded_y_pred})

In [None]:
from google.colab import files

# One name for both steps, so the file that is written is the file that is
# downloaded (previously the download asked for a different, non-existent name).
submission_path = 'cb6133test_stacker_model1_9.csv'

# The handle is deliberately not called `f`: that would overwrite the
# notebook-level `f` (features per residue) that get_data reads.
with open(submission_path, 'w') as csv_file:
    out_df.to_csv(csv_file, index=False)

files.download(submission_path)