In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

The final test with PyTorch neural networks involved combining the predictions of the LSTM with the classification model. The idea was to process the sequence information with the LSTM, make predictions, and then add the output as a feature to the dataset, which would later be used by the classifier. This is similar to hybrid models, but with a different implementation.

The classifier input includes the LSTM prediction, which is likely to be the most important feature for the final prediction, but it can also be enhanced with additional protein characteristics.

For dataset preparation, the same steps from previous notebooks were followed, and the pre-trained LSTM and classifier were imported. The output from the LSTM was then added to the training dataset and used as an additional feature (X).

## Data preparation

In [25]:
X_lstm = np.load('./Data/x_lstm.npy')

X = np.load('./Data/x.npy')
y = np.load('./Data/y.npy')

In [26]:
train_size = int(len(X)*0.6)
test_size = int(len(X)*0.2)

In [27]:
X_train = X[:train_size]
X_test = X[train_size:(train_size+test_size)]
X_val = X[(train_size+test_size):]

y_train = y[:train_size]
y_test = y[train_size:(train_size+test_size)]
y_val = y[(train_size+test_size):]

X_train_lstm = X_lstm[:train_size]
X_test_lstm = X_lstm[train_size:(train_size+test_size)]
X_val_lstm = X_lstm[(train_size+test_size):]

In [28]:
weights_array = np.load('./Data/weights.npy')

In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)
X_scaled_val = scaler.transform(X_val)

In [30]:
X_train_tensor = torch.tensor(X_scaled_train,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_test_tensor = torch.tensor(X_scaled_test,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

X_val_tensor = torch.tensor(X_scaled_val,dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)


X_train_tensor_lstm = torch.tensor(X_train_lstm,dtype=torch.float32)

X_test_tensor_lstm = torch.tensor(X_test_lstm,dtype=torch.float32)

X_val_tensor_lstm = torch.tensor(X_val_lstm,dtype=torch.float32)

## Models

In [31]:
from Model.model import LSTMClassifier, NNClassifier

In [32]:
input_size = 25
hidden_size = 128
num_layers = 3
output_size = 33

model_lstm = LSTMClassifier(input_size, hidden_size, num_layers, output_size)
model_lstm.load_state_dict(torch.load("./lstm.pth"))

<All keys matched successfully>

In [33]:
model_lstm.eval()
with torch.no_grad():
    predictions_train = model_lstm(X_train_tensor_lstm)
    predicted_labels_train = torch.argmax(predictions_train, dim=1)
    X_train_tensor_combined = torch.concat((X_train_tensor, predicted_labels_train.reshape([70507,1])), dim = 1)

    predictions_test = model_lstm(X_test_tensor_lstm)
    predicted_labels_test = torch.argmax(predictions_test, dim=1)
    X_test_tensor_combined = torch.concat((X_test_tensor, predicted_labels_test.reshape([23502,1])), dim = 1)

    predictions_val = model_lstm(X_val_tensor_lstm)
    predicted_labels_val = torch.argmax(predictions_val, dim=1)
    X_val_tensor_combined = torch.concat((X_val_tensor, predicted_labels_val.reshape([23504,1])), dim = 1)

### Classifier training loop

In [None]:
input_size = 29
hidden_size = 128
output_size = 33

model= NNClassifier(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(weights_array,dtype=torch.float32))
optimizer = optim.Adam(model.parameters(), lr=0.005)

n_epochs = 30
batch_size = 64

test_loss_array = []
patience = 3
best_result = np.inf

for epoch in range(n_epochs):

    model.train()
    batches = len(X_train_tensor_combined) // batch_size
    for batch in range(batches):
        
        optimizer.zero_grad()
        i = batch * batch_size

        X_batch = X_train_tensor_combined[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        y_pred = model(X_batch)

        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    batches = len(X_test_tensor_combined) // batch_size
    test_loss = 0
    correct = 0
    with torch.no_grad():  

        for batch in range(batches):
            i = batch * batch_size

            X_batch = X_test_tensor_combined[i:i+batch_size]
            y_batch = y_test_tensor[i:i+batch_size]

            output = model(X_batch)
            test_loss += criterion(output, y_batch).item()


    test_loss /= len(y_test_tensor)
    test_loss_array.append(test_loss)

    if test_loss < best_result:
        torch.save(model.state_dict(), "./combined.pth")

    print(f"Epoch: {epoch}, Train loss: {loss}, Test loss: {test_loss}")

    if len(test_loss_array)>patience+1:
        if not (any(x > test_loss_array[-1] for x in test_loss_array[len(test_loss_array)-patience-1:-1])):
            break

Epoch: 0, Train loss: 1.3869458436965942, Test loss: 0.026206301025771737
Epoch: 1, Train loss: 1.2308436632156372, Test loss: 0.024847129326395843
Epoch: 2, Train loss: 1.2106237411499023, Test loss: 0.024442050012869792
Epoch: 3, Train loss: 1.1160680055618286, Test loss: 0.023857157820387602
Epoch: 4, Train loss: 1.0490655899047852, Test loss: 0.023551149192175738
Epoch: 5, Train loss: 1.11536705493927, Test loss: 0.023502278293450227
Epoch: 6, Train loss: 1.0452492237091064, Test loss: 0.02330130801262952
Epoch: 7, Train loss: 0.9038224220275879, Test loss: 0.022985174753606882
Epoch: 8, Train loss: 1.1124333143234253, Test loss: 0.023039482434387765
Epoch: 9, Train loss: 1.1002899408340454, Test loss: 0.02273804005912269
Epoch: 10, Train loss: 1.0974364280700684, Test loss: 0.023072959100852143


In [35]:
model.load_state_dict(torch.load("./combined.pth"))

model.eval()
with torch.no_grad():
    predictions = model(X_val_tensor_combined)
    predicted_labels = torch.argmax(predictions, dim=1)
    print("Predicted Labels:", predicted_labels)

Predicted Labels: tensor([17, 13,  1,  ...,  3,  7,  3])


With this method the highest results were achieved among Neural Networks, the sequence data was enriched with additional features so that way full potential of the dataset was used. 

I'm suppose that the results could be even better if LSTM performance would be improved - for example by using full sequences and more hidden layers in the model, but then the training would be more time-consuming. 

In [36]:
results = pd.DataFrame({"pred":predicted_labels,"true":y_val_tensor})

y_test = results["true"]
y_pred = results["pred"]

print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='weighted')}")
print(f"precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred, average='weighted')}")

accuracy: 0.6211708645336964
F1: 0.6167632074145097
precision: 0.6283539320980643
recall: 0.6211708645336964
