In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

Because the sequence itself did not provide enough information for accurate predictions, a neural network classifier was tested.

The data was preprocessed as in the base model and then converted to a tensor and used with the model. Data was also scaled before feeding it to Neural network

## Data preparation

In [28]:
proteins_cleaned = pd.read_csv("./Data/joined_tables.csv")

In [29]:
proteins_cleaned

Unnamed: 0,structureId,chainId,sequence,residueCount,len,A,C,D,E,F,...,T,U,V,W,Y,X,B,Z,classification,chainCount
0,4LZV,A,LIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSAPLRVYVEEL...,162,162,0.092593,0.030864,0.061728,0.098765,0.024691,...,0.049383,0.0,0.055556,0.012346,0.024691,0.0,0.0,0.0,TRANSPORT PROTEIN,1
1,4FK5,A,GAAAAMSICPHIQQVFQNEKSKDGVLKTCNAARYILNHSVPKEKFL...,767,476,0.042017,0.054622,0.048319,0.048319,0.052521,...,0.042017,0.0,0.042017,0.010504,0.033613,0.0,0.0,0.0,HYDROLASE,4
2,4DOY,A,MGSSHHHHHHSSGLVPRGSHMTLSPEKQHVRPRDAADNDPVAVARG...,3496,437,0.125858,0.002288,0.059497,0.054920,0.032037,...,0.061785,0.0,0.066362,0.022883,0.025172,0.0,0.0,0.0,OXIDOREDUCTASE,8
3,2XZK,A,INDPAKSAAPYHDEFPLFRSANMASPDKLSTGIGFHSFRIPAVVRT...,772,386,0.085492,0.002591,0.072539,0.033679,0.036269,...,0.077720,0.0,0.049223,0.025907,0.033679,0.0,0.0,0.0,HYDROLASE,2
4,4WR9,A,AYLDEELQTELYEIKHQILQTMGVLSLQGSMLSVGDKVFSTNGQSV...,148,148,0.054054,0.027027,0.067568,0.087838,0.027027,...,0.067568,0.0,0.060811,0.013514,0.067568,0.0,0.0,0.0,BINDING PROTEIN,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117508,3QAH,A,MGSHHHHHHHHGSDYDIPTTENLYFQGSTKVKYVDKIHIGNYEIDA...,304,304,0.023026,0.026316,0.052632,0.055921,0.039474,...,0.042763,0.0,0.055921,0.023026,0.075658,0.0,0.0,0.0,TRANSFERASE,1
117509,4M0P,A,MPNIKIFSGSSHQDLSQKIADRLGLELGKVVTKKFSNQETCVEIGE...,652,326,0.095092,0.027607,0.067485,0.052147,0.027607,...,0.042945,0.0,0.085890,0.006135,0.015337,0.0,0.0,0.0,TRANSFERASE,2
117510,4NPM,A,DESEYEERRDAEARRVKSGIKQASIFTLEECARIEAKIDEVVAKAD...,500,250,0.072000,0.020000,0.068000,0.076000,0.040000,...,0.028000,0.0,0.088000,0.004000,0.032000,0.0,0.0,0.0,OXIDOREDUCTASE,2
117511,2AF0,A,ADLGTENLYFQSMKPSPEEAQLWSEAFDELLASKYGLAAFRAFLKS...,146,146,0.089041,0.027397,0.041096,0.116438,0.082192,...,0.054795,0.0,0.006849,0.013699,0.041096,0.0,0.0,0.0,SIGNALING PROTEIN,1


In [30]:
proteins_dict = dict(zip(list(proteins_cleaned["classification"].value_counts().index), range(len(proteins_cleaned["classification"].value_counts().index.tolist()))))

In [31]:
X=proteins_cleaned[['residueCount', 'len', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
       'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'X',
       'B', 'Z','chainCount']]
y=np.array(proteins_cleaned["classification"].map(proteins_dict))

In [None]:
# np.save('./Data/x.npy', np.array(X))
# np.save('./Data/y.npy', y)

In [33]:
train_size = int(len(proteins_cleaned)*0.6)
test_size = int(len(proteins_cleaned)*0.2)

In [34]:
X_train = X[:train_size]
X_test = X[train_size:(train_size+test_size)]
X_val = X[(train_size+test_size):]

y_train = y[:train_size]
y_test = y[train_size:(train_size+test_size)]
y_val = y[(train_size+test_size):]

In [None]:
weights_array = np.load('./Data/weights.npy')

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)
X_scaled_val = scaler.transform(X_val)

In [12]:
X_train_tensor = torch.tensor(X_scaled_train,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_test_tensor = torch.tensor(X_scaled_test,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

X_val_tensor = torch.tensor(X_scaled_val,dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

## Defining NN

The development began with a simple single-layer network, which was gradually expanded by adding more layers, followed by ReLU activations and a LogSoftmax layer at the end for classification. Sigmoid activation was also tested in place of ReLU, but the results were similar, so ReLU was chosen for its simplicity. Additionally, different dropout values were experimented with, and the placement of the dropout layer was adjusted.

In [13]:
class NNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x = torch.relu(x)
        x = self.fc4(x)
        out = self.softmax(x)
        return out

In [None]:
input_size = 28
hidden_size = 128
output_size = 33

n_epochs = 100
batch_size = 64
patience = 5

In [None]:
model = NNClassifier(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(weights_array,dtype=torch.float32))
optimizer = optim.Adam(model.parameters(), lr=0.001)

The same techniques used in the LSTM training were applied in the training loop, including early stopping and saving the best model based on validation performance

In [None]:
test_loss_array = []
best_result = np.inf

for epoch in range(n_epochs):

    model.train()
    batches = len(X_train_tensor) // batch_size
    for batch in range(batches):
        
        optimizer.zero_grad()
        i = batch * batch_size

        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        y_pred = model(X_batch)

        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    batches = len(X_test_tensor) // batch_size
    test_loss = 0
    correct = 0
    with torch.no_grad():  

        for batch in range(batches):
            i = batch * batch_size

            X_batch = X_test_tensor[i:i+batch_size]
            y_batch = y_test_tensor[i:i+batch_size]

            output = model(X_batch)
            test_loss += criterion(output, y_batch).item()


    test_loss /= len(y_test_tensor)
    test_loss_array.append(test_loss)

    if test_loss < best_result:
        torch.save(model.state_dict(), "./classifier.pth")

    print(f"Epoch: {epoch}, Train loss: {loss}, Test loss: {test_loss}")

    if len(test_loss_array)>patience+1:
        if not (any(x > test_loss_array[-1] for x in test_loss_array[len(test_loss_array)-patience-1:-1])):
            break

Epoch: 0, Train loss: 1.7337801456451416, Test loss: 0.03462577888219166
Epoch: 1, Train loss: 1.8514653444290161, Test loss: 0.03282576407303598
Epoch: 2, Train loss: 1.789962649345398, Test loss: 0.0314701911860573
Epoch: 3, Train loss: 1.7612991333007812, Test loss: 0.030352098519685837
Epoch: 4, Train loss: 1.6025117635726929, Test loss: 0.029537602768341192
Epoch: 5, Train loss: 1.4969170093536377, Test loss: 0.0292267415327941
Epoch: 6, Train loss: 1.5037764310836792, Test loss: 0.029055897387714854
Epoch: 7, Train loss: 1.460148572921753, Test loss: 0.028660190287026717
Epoch: 8, Train loss: 1.3525357246398926, Test loss: 0.02822093901081538
Epoch: 9, Train loss: 1.354172945022583, Test loss: 0.02831905893331568
Epoch: 10, Train loss: 1.3626848459243774, Test loss: 0.027986412981238003
Epoch: 11, Train loss: 1.284071445465088, Test loss: 0.02785644275301538
Epoch: 12, Train loss: 1.1847188472747803, Test loss: 0.027795278762778916
Epoch: 13, Train loss: 1.2072335481643677, Test 

In [31]:
model.load_state_dict(torch.load("./classifier.pth"))

model.eval()
with torch.no_grad():
    predictions = model(X_val_tensor)
    predicted_labels = torch.argmax(predictions, dim=1)
    print("Predicted Labels:", predicted_labels)

Predicted Labels: tensor([2, 0, 4,  ..., 0, 9, 9])


In [32]:
results = pd.DataFrame({"pred":predicted_labels,"true":y_val_tensor})

In [33]:
results

Unnamed: 0,pred,true
0,2,4
1,0,1
2,4,4
3,4,4
4,2,4
...,...,...
23499,0,0
23500,0,0
23501,0,5
23502,9,9


The results haven't improved comparing to LSTM or the base model. I believe the data patterns are quite complex, and a larger dataset would be needed to capture them accurately. That may be the reason that, an ensemble method like Random Forest performs better with this dataset.

In [34]:
y_test = results["true"]
y_pred = results["pred"]

print(f"accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average='weighted')}")
print(f"precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall: {recall_score(y_test, y_pred, average='weighted')}")

accuracy: 0.5054033356024507
F1: 0.4859439843573175
precision: 0.49377457930436514
recall: 0.5054033356024507
