In [13]:
import os
os.environ['KERAS_BACKEND'] = 'theano'
import keras
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, RepeatVector
from keras.optimizers import RMSprop, SGD
import numpy as np

In [None]:
mem_depth = 20
proteins = ['*']*int(mem_depth/2)
structures = ['*']*int(mem_depth/2)


path = "output102361.out"
with open(path) as f:
    for line in f:
        splited = line.strip().split(' ')
        if len(list(splited[1])) == len(list(splited[2])):
            proteins.extend(list(splited[1].upper()))
            proteins.extend(['*']*int(mem_depth/2))
            structures.extend(list(splited[2].upper()))
            proteins.extend(['*']*int(mem_depth/2))
        
            
            
alphabet_structures = ['C', 'H', 'E', 'T', "*"]
alphabet_proteins = ['A','R','N','D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*']

known_proteins = []
known_structures = []
for i in range(len(proteins)):
    if proteins[i] in alphabet_proteins and structures[i] in alphabet_structures:
        known_proteins.append(proteins[i])
        known_structures.append(structures[i])
proteins = known_proteins
structures = known_structures

structures_indices = dict((c, i) for i, c in enumerate(alphabet_structures))
indices_structures = dict((i, c) for i, c in enumerate(alphabet_structures))

proteins_indices = dict((c, i) for i, c in enumerate(alphabet_proteins))
indices_proteins = dict((i, c) for i, c in enumerate(alphabet_proteins))


#Get time series
protein_blocks = []
structure = []
for i in range(0, len(proteins) - mem_depth + 1):
    protein_blocks.append(proteins[i: i + mem_depth])
    structure.append(structures[i + int(mem_depth/2) - 1])
    

#Vectorisation
X = np.zeros((len(protein_blocks), mem_depth, len(alphabet_proteins)), dtype=np.bool)
y = np.zeros((len(structure), len(alphabet_structures)), dtype=np.bool)
for i, block in enumerate(protein_blocks):
    for t, protein in enumerate(block):
        X[i, t, proteins_indices[protein]] = 1
    y[i, structures_indices[structure[i]]] = 1
    
print(np.shape(X))
    

In [17]:
print('Build model...')
model = Sequential()
model.add(Bidirectional(GRU(128), input_shape=(mem_depth, len(alphabet_proteins))))
model.add(Dense(len(alphabet_structures)))
model.add(Activation('softmax'))

optimizer = SGD(lr=0.01, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


Build model...




Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4d2127f470>

In [35]:
#The following line has been executed twice

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, train_size=0.66)

train_filtered_x = np.array([X_test[i] for i in range(len(X_train)) if alphabet_structures[np.argmax(y_train[i])] != '*'])
train_filtered_y = np.array([y_test[i] for i in range(len(y_train)) if alphabet_structures[np.argmax(y_train[i])] != '*'])

test_filtered_x = np.array([X_test[i] for i in range(len(X_test)) if alphabet_structures[np.argmax(y_test[i])] != '*'])
test_filtered_y = np.array([y_test[i] for i in range(len(y_test)) if alphabet_structures[np.argmax(y_test[i])] != '*'])

model.fit(train_filtered_x, train_filtered_y, batch_size=128, epochs=10)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4d20e53c88>

In [55]:
#Saving trained model
model_yaml = model.to_yaml()
with open("gru_model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
    
model.save_weights("gru_model.h5")
print("Saved model to disk")

Saved model to disk


In [60]:
#Predicting
predictions = (model.predict(test_filtered_x))
#Getting structure codes
predictions = np.array([alphabet_structures[np.argmax(prediction)] for prediction in predictions])
#Getting structure codes
structures = np.array([alphabet_structures[np.argmax(y)] for y in test_filtered_y])

'C'

In [69]:
#Getting accuracy
acc_arr = predictions == structures
np.mean(acc_arr)

In [78]:
C_arr_pred = predictions == "C"
C_arr_struct = structures == "C"
C_precision = sum(C_arr_pred * acc_arr)/sum(C_arr_pred)
C_recall = sum(C_arr_pred * acc_arr)/sum(C_arr_struct)
print("C Precision: {0}".format(C_precision))
print("C Recall: {0}".format(C_recall))

C Precision: 0.5799530987033479
C Recall: 0.6301510009259244


In [79]:
H_arr_pred = predictions == "H"
H_arr_struct = np.array(structure[12000000:]) == "H"
H_precision = sum(H_arr_pred * acc_arr)/sum(H_arr_pred)
H_recall = sum(H_arr_pred * acc_arr)/sum(H_arr_struct)
print("H Precision: {0}".format(H_precision))
print("H Recall: {0}".format(H_recall))

H Precision: 0.704974945101719
H Recall: 0.7984023817740601


In [80]:
E_arr_pred = predictions == "E"
E_arr_struct = structures == "E"
E_precision = sum(E_arr_pred * acc_arr)/sum(E_arr_pred)
E_recall = sum(E_arr_pred * acc_arr)/sum(E_arr_struct)
print("E Precision: {0}".format(E_precision))
print("E Recall: {0}".format(E_recall))

E Precision: 0.6200789875475623
E Recall: 0.5607686359951419


In [81]:
T_arr_pred = predictions == "T"
T_arr_struct = structures == "T"
T_precision = sum(T_arr_pred * acc_arr)/sum(T_arr_pred)
T_recall = sum(T_arr_pred * acc_arr)/sum(T_arr_struct)
print("T Precision: {0}".format(T_precision))
print("T Recall: {0}".format(T_recall))

T Precision: 0.5232188465061437
T Recall: 0.2836822605938521
