In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed

In [13]:
# Load dataset
data=pd.read_csv('https://raw.githubusercontent.com/KirkDCO/pdb-secondary-structure-2022/master/raw_data/2022-12-17-pdb-intersect-pisces_pc30_r2.5.csv')
data.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len_x,has_nonstd_aa,len_y,method,resol,rfac,freerfac
0,5D8V,A,AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...,CCCTTBCCTTCHHHHHHTCBSSGGGSCHHHHCCTTSCGGGCCGGGB...,CCCCCECCCCCHHHHHHCCECCHHHCCHHHHCCCCCCHHHCCHHHE...,83,False,83,XRAY,0.48,0.072,0.078
1,3NIR,A,TTCCPSIVARSNFNVCRLPGTPEALCATYTGCIIIPGATCPGDYAN,CEECSSHHHHHHHHHHHTTTCCHHHHHHHHSCEECSSSCCCTTSCC,CEECCCHHHHHHHHHHHCCCCCHHHHHHHHCCEECCCCCCCCCCCC,46,False,46,XRAY,0.48,0.127,
2,5NW3,A,MAKWVCKICGYIYDEDAGDPDNGISPGTKFEELPDDWVCPICGAPK...,CCEEEETTTCCEEETTTCBGGGTBCTTCCGGGSCTTCBCTTTCCBG...,CCEEEECCCCCEEECCCCEHHHCECCCCCHHHCCCCCECCCCCCEH...,54,False,54,XRAY,0.59,0.135,0.146
3,1UCS,A,NKASVVANQLIPINTALTLIMMKAEVVTPMGIPAEEIPKLVGMQVN...,CCCEEEESSCBCTTCBCCGGGEEEECCSSCCSBGGGHHHHTTCBBS...,CCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEEC...,64,False,64,XRAY,0.62,0.139,0.155
4,3X2M,A,ATGGYVQQATGQASFTMYSGCGSPACGKAASGFTAAINQLAFGSAP...,CGGGCCCCSEEEEEEEEESCCSSCTTSCCBSSCEEEEEHHHHTSCT...,CHHHCCCCCEEEEEEEEECCCCCCCCCCCECCCEEEEEHHHHCCCC...,180,False,180,XRAY,0.64,0.122,0.129


In [14]:
print(data.shape[0])
data

15079


Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len_x,has_nonstd_aa,len_y,method,resol,rfac,freerfac
0,5D8V,A,AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...,CCCTTBCCTTCHHHHHHTCBSSGGGSCHHHHCCTTSCGGGCCGGGB...,CCCCCECCCCCHHHHHHCCECCHHHCCHHHHCCCCCCHHHCCHHHE...,83,False,83,XRAY,0.48,0.072,0.078
1,3NIR,A,TTCCPSIVARSNFNVCRLPGTPEALCATYTGCIIIPGATCPGDYAN,CEECSSHHHHHHHHHHHTTTCCHHHHHHHHSCEECSSSCCCTTSCC,CEECCCHHHHHHHHHHHCCCCCHHHHHHHHCCEECCCCCCCCCCCC,46,False,46,XRAY,0.48,0.127,
2,5NW3,A,MAKWVCKICGYIYDEDAGDPDNGISPGTKFEELPDDWVCPICGAPK...,CCEEEETTTCCEEETTTCBGGGTBCTTCCGGGSCTTCBCTTTCCBG...,CCEEEECCCCCEEECCCCEHHHCECCCCCHHHCCCCCECCCCCCEH...,54,False,54,XRAY,0.59,0.135,0.146
3,1UCS,A,NKASVVANQLIPINTALTLIMMKAEVVTPMGIPAEEIPKLVGMQVN...,CCCEEEESSCBCTTCBCCGGGEEEECCSSCCSBGGGHHHHTTCBBS...,CCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEEC...,64,False,64,XRAY,0.62,0.139,0.155
4,3X2M,A,ATGGYVQQATGQASFTMYSGCGSPACGKAASGFTAAINQLAFGSAP...,CGGGCCCCSEEEEEEEEESCCSSCTTSCCBSSCEEEEEHHHHTSCT...,CHHHCCCCCEEEEEEEEECCCCCCCCCCCECCCEEEEEHHHHCCCC...,180,False,180,XRAY,0.64,0.122,0.129
...,...,...,...,...,...,...,...,...,...,...,...,...
15074,6CSV,A,AHMTRFLEEEELRSHHILERLDAHIEELKRESEKTVRQFTALKGSE...,CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHTCCHH...,CCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHH...,90,False,90,XRAY,2.50,0.265,0.321
15075,2FRH,A,GSHMAITKINDCFELLSMVTYADKLKSLIKKEFSISFEEFAVLTYI...,CCSCCCCCCCSHHHHHHHHHHHHHHHHHHHHTTCCCHHHHHHHHHH...,CCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHH...,127,False,127,XRAY,2.50,0.266,0.302
15076,6RP4,A,GFLGIFFTGADKNIEKATLYKNLIAKYQNNHFISLIILSALVSDSK...,CCSCCCCCCSSCCSHHHHHHHHHHHHTSSCHHHHHHHHHHHHHCTT...,CCCCCCCCCCCCCCHHHHHHHHHHHHCCCCHHHHHHHHHHHHHCCC...,125,False,125,XRAY,2.50,0.272,0.296
15077,2EUC,A,XQYFSPEQQYNAWIVSDLVKQIFHKRAGCSPGIHELAVFAEEHFHI...,CCCCCHHHHHHHHHHHHHHHHHHHHHBSSCCCHHHHHHHHHHTTCC...,CCCCCHHHHHHHHHHHHHHHHHHHHHECCCCCHHHHHHHHHHCCCC...,108,False,108,XRAY,2.50,0.274,0.328


In [15]:
# Function to encode sequences
def encode_sequences(sequences):
    label_encoder = LabelEncoder()
    unique_chars = set(''.join(sequences))
    label_encoder.fit(list(unique_chars))
    encoded_sequences = [label_encoder.transform(list(seq)) for seq in sequences]
    return encoded_sequences, label_encoder

# Encode seq and sst8:
encoded_seq, seq_encoder = encode_sequences(data['seq'])
encoded_sst8, sst8_encoder = encode_sequences(data['sst8'])

In [16]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(encoded_seq, encoded_sst8, test_size=0.2, random_state=42)

In [29]:
max_seq_length = max(max(len(s) for s in X_train), max(len(s) for s in y_train))
#max_seq_val_length = max(max(len(s) for s in X_val), max(len(s) for s in y_val))
print(max_seq_length)

X_train_padded = pad_sequences(X_train, maxlen=max_seq_length, padding='post')
X_val_padded = pad_sequences(X_val, maxlen=max_seq_length, padding='post')
y_train_padded = pad_sequences(y_train, maxlen=max_seq_length, padding='post')
y_val_padded = pad_sequences(y_val, maxlen=max_seq_length, padding='post')

for index, element in enumerate(X_train):
    if len(element) >= 1400:
        print(index)


#print(X_train_padded[3])


2128
118
2520
4059
7537


In [18]:
num_amino_acids = len(seq_encoder.classes_)  
num_classes = len(sst8_encoder.classes_) 
print (num_amino_acids, num_classes)
print(seq_encoder.classes_, sst8_encoder.classes_)

21 8
['A' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'K' 'L' 'M' 'N' 'P' 'Q' 'R' 'S' 'T' 'V'
 'W' 'X' 'Y'] ['B' 'C' 'E' 'G' 'H' 'I' 'S' 'T']


In [236]:
#RNN MODEl
model = Sequential()
model.add(Embedding(input_dim=num_amino_acids, output_dim=128, input_length=max_seq_length))
model.add(LSTM(64, return_sequences=True))
model.add(TimeDistributed(Dense(num_classes, activation='softmax')))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [237]:
model.fit(X_train_padded, y_train_padded, batch_size=64, epochs=50, validation_data=(X_val_padded, y_val_padded))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1c6318de070>

In [238]:
# model summary:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 2128, 128)         2688      
                                                                 
 lstm_3 (LSTM)               (None, 2128, 64)          49408     
                                                                 
 time_distributed_3 (TimeDi  (None, 2128, 8)           520       
 stributed)                                                      
                                                                 
Total params: 52616 (205.53 KB)
Trainable params: 52616 (205.53 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [239]:
loss, accuracy = model.evaluate(X_val_padded, y_val_padded)
print('Validation accuracy: {:5.2f}%'.format(100 * accuracy))

Validation accuracy: 93.70%


In [19]:
model = load_model('pssp_model3.h5')




In [30]:
import numpy as np

#Top 5 indexes with the most sequences
#125
#1981
#4079
#5064
#8521

#Indexes with 100% accuracy
#29

sample_index = 4059

# Retrieve the actual label for the sample
actual_class = y_train_padded[sample_index]

# Retrieve the sequence for the sample
single_sequence = X_train_padded[sample_index]
single_sequence = np.expand_dims(single_sequence, axis=0)

# Use the model to predict the outcome
prediction = model.predict(single_sequence)
predicted_class = np.argmax(prediction, axis=-1)
shape = prediction.shape

# Print the predicted class
print("Predicted sst8 of", sample_index, ": ", predicted_class[0])
print("Actual sst8 of", sample_index, ": ", actual_class)


predicted_sst8 = sst8_encoder.inverse_transform(predicted_class[0])
actual_sst8 = sst8_encoder.inverse_transform(actual_class)

print("Predicted sst8 of", sample_index, ": ",predicted_sst8)
print("Actual sst8 of", sample_index, ": ",actual_sst8)

print(len(predicted_class[0]))
print(len(actual_class))

Predicted sst8 of 4059 :  [1 1 1 ... 0 0 0]
Actual sst8 of 4059 :  [1 2 2 ... 0 0 0]
Predicted sst8 of 4059 :  ['C' 'C' 'C' ... 'B' 'B' 'B']
Actual sst8 of 4059 :  ['C' 'E' 'E' ... 'B' 'B' 'B']
2128
2128


In [31]:
# Check the predicted class accuracy

from sklearn.metrics import mean_squared_error, accuracy_score
mse = mean_squared_error(actual_class, predicted_class[0])
print("Mean squared error: ", mse)

correct = 0

total = 0
for i in y_train[sample_index]:
    total += 1
    if actual_class[i] == predicted_class[0][i]:
        correct += 1
        
acc = correct / total
print("Accuracy score: {:5.2f}%".format(100 * acc))

print("Length prediction array of", sample_index, ":", len(y_train[sample_index]))

Mean squared error:  4.021616541353383
Accuracy score: 44.06%
Length prediction array of 4059 : 1491


In [262]:
model.save('pssp_model3.h5')

  saving_api.save_model(
