In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier

import random
import numpy as np
import torch

def set_seed(seed_value=42):
    """Set seed for reproducibility across numpy, torch, and random."""
    random.seed(seed_value)  # Python's built-in random module
    np.random.seed(seed_value)  # NumPy random seed

    torch.manual_seed(seed_value)  # PyTorch random seed for CPU
    torch.cuda.manual_seed(seed_value)  # PyTorch random seed for CUDA
    torch.cuda.manual_seed_all(seed_value)  # PyTorch seed for all GPUs

    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior in cuDNN
    torch.backends.cudnn.benchmark = False  # Disables cuDNN auto-optimization

# Usage
set_seed(42)  # Call this function before running models or data splits to ensure reproducibility

device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [55]:
df = pd.read_csv('all_seq722.csv')

    
df = df[~df["Sequences"].str.contains('-')]
df['Sequences'] = df['Sequences'].str.upper()
max_length = df['Sequences'].str.len().max()
print(max_length)
df['Sequences'] = df['Sequences'].apply(lambda x: x.ljust(max_length, 'X'))

128


In [35]:

unique_letters = set(''.join(df["Sequences"]))
print(unique_letters)
print(len(unique_letters))
amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
non_standard_amino_acids = unique_letters - amino_acids
print(non_standard_amino_acids)

{'E', 'K', 'R', 'P', 'W', 'S', 'I', 'C', 'V', 'Q', 'X', 'Y', 'M', 'F', 'G', 'N', 'T', 'L', 'A', 'D', 'H'}
21
{'X'}


In [57]:
# Re-import necessary libraries after execution state reset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split



# Define One-Hot Encoding Function for DNA Sequences in PyTorch
def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

# Define custom dataset class with transformation

# Updating the Dataset class with the OneHotEncoder function at the end
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels, one_hot_dtype=torch.float32):
        self.sequences = sequences  # Raw sequences
        self.labels = labels  # Labels
        self.one_hot_dtype = one_hot_dtype  # Data type for one-hot encoding

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seqs_comb = self.sequences.iloc[idx]  # Get sequence
        amp_label = self.labels.iloc[idx]    # Get corresponding label
        # Apply one-hot encoding transformation at the end
        return one_hot_torch(seqs_comb, dtype=self.one_hot_dtype), torch.tensor(amp_label, dtype=torch.float32)


# Convert dataset into PyTorch Dataset

X = df["Sequences"]
y = df["AMP"]
# Split into train (70%), validation (15%), test (15%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Step 2: Split train+val into train and val (stratified)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)  # 0.1765 to maintain 15% of original dataset

# Convert back to PyTorch datasets
train_dataset = SequenceDataset(X_train, y_train)
val_dataset = SequenceDataset(X_val, y_val)
test_dataset = SequenceDataset(X_test, y_test)

# Define DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display dataset sizes
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
# for x,y in train_loader:
#     print(x)
#     print(y)
#     break

In [52]:
for x,y in train_loader:
    print(x.shape)
    print(y)
    break

torch.Size([16, 20, 128])
tensor([1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0.],
       dtype=torch.float64)


# Classification

## basic lstm

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np
import time

ic.disable()
# Define the Basic LSTM Model
class BasicLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BasicLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        ic(h_n.shape)
        ic(h_n[-1].shape)
        ic(h_n[-1])
        out = self.fc(h_n[-1])
        return self.sigmoid(out)

# Function to calculate specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

# Training and Evaluation Function
def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_,num_epochs=100):
    writer = SummaryWriter(tensorboard_)
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            ic(sequences.shape)
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Evaluation
        model.eval()
        all_labels = []
        all_preds = []
        val_losses = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                loss = criterion(outputs, labels.unsqueeze(1))
                val_losses.append(loss.item())
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())
                
            avg_val_loss = np.mean(val_losses)
            writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)
        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')

    writer.close()

# Hyperparameters
input_size = 128       
hidden_size = 128    # Number of features in the hidden state
num_layers = 2       # Number of stacked LSTM layers
output_size = 1      # Binary classification (AMP or not)
batch_size = 16
num_epochs = 21
learning_rate = 0.001
tensorboard_ = f"runs/basic_lstm/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"


# Model, Loss, Optimizer
model = BasicLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_,num_epochs)



Epoch 1/21, TrainLoss: 0.6935, ValLoss: 0.6904, Accuracy: 0.5495, AUC: 0.5444, Sensitivity: 0.0889, Specificity: 1.0000
Epoch 6/21, TrainLoss: 0.5593, ValLoss: 0.4415, Accuracy: 0.8132, AUC: 0.8126, Sensitivity: 0.7556, Specificity: 0.8696
Epoch 11/21, TrainLoss: 0.5188, ValLoss: 0.4786, Accuracy: 0.7473, AUC: 0.7488, Sensitivity: 0.8889, Specificity: 0.6087
Epoch 16/21, TrainLoss: 0.4357, ValLoss: 0.3672, Accuracy: 0.8462, AUC: 0.8457, Sensitivity: 0.8000, Specificity: 0.8913
Epoch 21/21, TrainLoss: 0.2891, ValLoss: 0.3293, Accuracy: 0.8791, AUC: 0.8795, Sensitivity: 0.9111, Specificity: 0.8478


In [90]:
import torch
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

def evaluate_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_preds = []
    
    with torch.no_grad():  # Disable gradient computation
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs > 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())
    
    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)
    
    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)
    sensitivity = recall_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)
    
    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test AUC: {auc:.4f}')
    print(f'Test Sensitivity (Recall): {sensitivity:.4f}')
    print(f'Test Specificity: {specificity:.4f}')
    
    return accuracy, auc, sensitivity, specificity
# After training is complete
evaluate_model(model, test_loader)


Test Accuracy: 0.8505
Test AUC: 0.8506
Test Sensitivity (Recall): 0.8679
Test Specificity: 0.8333


(0.8504672897196262, 0.85062893081761, 0.8679245283018868, 0.8333333333333334)

## lstm with attention

In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, hidden_size):
        super(ScaledDotProductAttention, self).__init__()
        self.scale_factor = torch.sqrt(torch.tensor(hidden_size, dtype=torch.float32))
    
    def forward(self, query, key, value):
        attention_scores = torch.bmm(query, key.transpose(1, 2))  # [batch, seq_len, seq_len]
        attention_scores = attention_scores / self.scale_factor
        attention_weights = F.softmax(attention_scores, dim=-1)  # Normalize scores
        context_vector = torch.bmm(attention_weights, value)  # Weighted sum
        return context_vector, attention_weights
    
class LSTM_ScaledDotAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM_ScaledDotAttention, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attention = ScaledDotProductAttention(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # LSTM output shape: [batch_size, seq_len, hidden_size]
        
        # Using last hidden state as Query
        query = lstm_out[:, -1, :].unsqueeze(1)  # [batch_size, 1, hidden_size]

        # Key and Value are the full sequence outputs
        key = lstm_out  # [batch_size, seq_len, hidden_size]
        value = lstm_out

        context_vector, attention_weights = self.attention(query, key, value)
        context_vector = context_vector.squeeze(1)  # [batch_size, hidden_size]

        out = self.fc(context_vector)  # [batch_size, output_size]
        return self.sigmoid(out)


In [94]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs=25):
    writer = SummaryWriter(tensorboard_log_dir)

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Validation
        model.eval()
        all_labels = []
        all_preds = []
        val_losses = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                loss = criterion(outputs, labels.unsqueeze(1))
                val_losses.append(loss.item())
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())

            avg_val_loss = np.mean(val_losses)
            writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)

        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')

    writer.close()
import torch.optim as optim
import time

# Hyperparameters
input_size = 128  # One-hot encoding size or embedding dimension
hidden_size = 128  # LSTM hidden state size
num_layers = 2  # Stacked LSTM layers
output_size = 1  # Binary classification
batch_size = 16
num_epochs = 26
learning_rate = 0.001
tensorboard_log_dir = f"runs/lstm_attention/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = LSTM_ScaledDotAttention(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)


Epoch 1/26, TrainLoss: 0.6951, ValLoss: 0.6895, Accuracy: 0.5275, AUC: 0.5222, Sensitivity: 0.0444, Specificity: 1.0000
Epoch 6/26, TrainLoss: 0.5163, ValLoss: 0.4518, Accuracy: 0.8022, AUC: 0.8012, Sensitivity: 0.7111, Specificity: 0.8913
Epoch 11/26, TrainLoss: 0.4682, ValLoss: 0.4259, Accuracy: 0.8352, AUC: 0.8350, Sensitivity: 0.8222, Specificity: 0.8478
Epoch 16/26, TrainLoss: 0.4533, ValLoss: 0.3931, Accuracy: 0.8352, AUC: 0.8341, Sensitivity: 0.7333, Specificity: 0.9348
Epoch 21/26, TrainLoss: 0.4512, ValLoss: 0.3829, Accuracy: 0.8571, AUC: 0.8570, Sensitivity: 0.8444, Specificity: 0.8696
Epoch 26/26, TrainLoss: 0.4153, ValLoss: 0.4528, Accuracy: 0.8352, AUC: 0.8353, Sensitivity: 0.8444, Specificity: 0.8261


In [93]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs > 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())

    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)

    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)
    sensitivity = recall_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test AUC: {auc:.4f}')
    print(f'Test Sensitivity (Recall): {sensitivity:.4f}')
    print(f'Test Specificity: {specificity:.4f}')

    return accuracy, auc, sensitivity, specificity

# Evaluate
evaluate_model(model, test_loader)


Test Accuracy: 0.8505
Test AUC: 0.8506
Test Sensitivity (Recall): 0.8679
Test Specificity: 0.8333


(0.8504672897196262, 0.85062893081761, 0.8679245283018868, 0.8333333333333334)

## CNN-LSTM with attention

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
ic.disable()
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attention_weights = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_output):
        # lstm_output: [batch_size, seq_len, hidden_dim]
        attention_scores = self.attention_weights(lstm_output)  # [batch_size, seq_len, 1]
        attention_scores = attention_scores.squeeze(-1)  # [batch_size, seq_len]
        attention_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len]
        context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_output)  # [batch_size, 1, hidden_dim]
        context_vector = context_vector.squeeze(1)  # [batch_size, hidden_dim]
        return context_vector, attention_weights

class CNN_LSTM_Attention(nn.Module):
    def __init__(self, input_channels, cnn_output_dim, lstm_hidden_dim, lstm_layers, output_dim):
        super(CNN_LSTM_Attention, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(input_channels, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.lstm = nn.LSTM(cnn_output_dim, lstm_hidden_dim, lstm_layers, batch_first=True)
        self.attention = Attention(lstm_hidden_dim)
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: [batch_size, seq_len, input_channels]
        batch_size, seq_len, _ = x.size()
        ic(x.shape)
        x = x.view(batch_size * seq_len, -1, x.size(2))  # [batch_size * seq_len, input_channels, feature_dim]
        ic(x.shape)
        cnn_out = self.cnn(x)  # [batch_size * seq_len, 128, feature_dim/4]
        cnn_out = cnn_out.view(batch_size, seq_len, -1)  # [batch_size, seq_len, cnn_output_dim]
        lstm_out, _ = self.lstm(cnn_out)  # [batch_size, seq_len, lstm_hidden_dim]
        context_vector, attention_weights = self.attention(lstm_out)  # [batch_size, lstm_hidden_dim]
        out = self.fc(context_vector)  # [batch_size, output_dim]
        return self.sigmoid(out)



In [None]:
# Hyperparameters
input_channels = 1  # Number of input channels
cnn_output_dim = 128 * (input_size // 4)  # Adjust based on CNN architecture
lstm_hidden_dim = 128
lstm_layers = 2
output_dim = 1  # Binary classification
batch_size = 16
num_epochs = 26
learning_rate = 0.001
tensorboard_log_dir = f"runs/cnn_lstm_attention/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = CNN_LSTM_Attention(input_channels, cnn_output_dim, lstm_hidden_dim, lstm_layers, output_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)

# Evaluate on Test Set
evaluate_model(model, test_loader)


## bidirectional LSTM

In [68]:
import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # Multiply hidden_size by 2 for bidirectional
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)  # LSTM output: (batch_size, seq_len, hidden_size * 2)
        # Concatenate the last hidden states from both directions
        out = self.fc(torch.cat((h_n[-2], h_n[-1]), dim=1))  
        return self.sigmoid(out)


In [69]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

# Function to calculate specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs=25):
    writer = SummaryWriter(tensorboard_log_dir)

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Evaluation
        model.eval()
        all_labels = []
        all_preds = []
        val_losses = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                loss = criterion(outputs, labels.unsqueeze(1))
                val_losses.append(loss.item())
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())

            avg_val_loss = np.mean(val_losses)
            writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)

        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')

    writer.close()


In [70]:
import torch.optim as optim
import time

# Hyperparameters
input_size = 128  # Number of features per time step (e.g., One-hot encoding size)
hidden_size = 128  # Number of LSTM units
num_layers = 2  # Stacked LSTM layers
output_size = 1  # Binary classification (AMP or not)
batch_size = 16
num_epochs = 26
learning_rate = 0.001
tensorboard_log_dir = f"runs/bilstm/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = BiLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)


Epoch 1/26, TrainLoss: 0.6900, ValLoss: 0.6719, Accuracy: 0.7363, AUC: 0.7338, Sensitivity: 0.5111, Specificity: 0.9565
Epoch 6/26, TrainLoss: 0.4284, ValLoss: 0.4699, Accuracy: 0.7473, AUC: 0.7473, Sensitivity: 0.7556, Specificity: 0.7391
Epoch 11/26, TrainLoss: 0.3351, ValLoss: 0.3742, Accuracy: 0.8242, AUC: 0.8242, Sensitivity: 0.8222, Specificity: 0.8261
Epoch 16/26, TrainLoss: 0.2488, ValLoss: 0.3774, Accuracy: 0.8352, AUC: 0.8353, Sensitivity: 0.8444, Specificity: 0.8261
Epoch 21/26, TrainLoss: 0.2203, ValLoss: 0.3713, Accuracy: 0.8681, AUC: 0.8671, Sensitivity: 0.7778, Specificity: 0.9565
Epoch 26/26, TrainLoss: 0.1320, ValLoss: 0.3322, Accuracy: 0.8791, AUC: 0.8785, Sensitivity: 0.8222, Specificity: 0.9348


In [71]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs > 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())

    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)

    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)
    sensitivity = recall_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test AUC: {auc:.4f}')
    print(f'Test Sensitivity (Recall): {sensitivity:.4f}')
    print(f'Test Specificity: {specificity:.4f}')

    return accuracy, auc, sensitivity, specificity

# Evaluate on Test Set
evaluate_model(model, test_loader)


Test Accuracy: 0.9065
Test AUC: 0.9064
Test Sensitivity (Recall): 0.8868
Test Specificity: 0.9259


(0.9065420560747663,
 0.9063591893780573,
 0.8867924528301887,
 0.9259259259259259)

## biLSTM with attention

In [74]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, hidden_size):
        super(ScaledDotProductAttention, self).__init__()
        self.scale_factor = torch.sqrt(torch.tensor(hidden_size, dtype=torch.float32))
    
    def forward(self, query, key, value):
        attention_scores = torch.bmm(query, key.transpose(1, 2))  # [batch, seq_len, seq_len]
        attention_scores = attention_scores / self.scale_factor
        attention_weights = F.softmax(attention_scores, dim=-1)  # Normalize scores
        context_vector = torch.bmm(attention_weights, value)  # Weighted sum
        return context_vector, attention_weights
    
class BiLSTM_ScaledDotAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiLSTM_ScaledDotAttention, self).__init__()
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.attention = ScaledDotProductAttention(hidden_size * 2)  # Bidirectional -> 2x hidden size
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.bilstm(x)  # LSTM output shape: [batch_size, seq_len, hidden_size * 2]
        
        # Using last hidden state as Query
        query = lstm_out[:, -1, :].unsqueeze(1)  # [batch_size, 1, hidden_size * 2]

        # Key and Value are the full sequence outputs
        key = lstm_out  # [batch_size, seq_len, hidden_size * 2]
        value = lstm_out

        context_vector, attention_weights = self.attention(query, key, value)
        context_vector = context_vector.squeeze(1)  # [batch_size, hidden_size * 2]

        out = self.fc(context_vector)  # [batch_size, output_size]
        return self.sigmoid(out)


In [77]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs=25):
    writer = SummaryWriter(tensorboard_log_dir)

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Validation
        model.eval()
        all_labels = []
        all_preds = []
        val_losses = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                loss = criterion(outputs, labels.unsqueeze(1))
                val_losses.append(loss.item())
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())

            avg_val_loss = np.mean(val_losses)
            writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)

        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')

    writer.close()

import torch.optim as optim
import time

# Hyperparameters
input_size = 128  # One-hot encoding size or embedding dimension
hidden_size = 128  # LSTM hidden state size
num_layers = 2  # Stacked BiLSTM layers
output_size = 1  # Binary classification
batch_size = 16
num_epochs = 40
learning_rate = 0.0001
tensorboard_log_dir = f"runs/bilstm_attention/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = BiLSTM_ScaledDotAttention(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)




Epoch 1/40, TrainLoss: 0.6938, ValLoss: 0.6926, Accuracy: 0.5055, AUC: 0.5000, Sensitivity: 0.0000, Specificity: 1.0000
Epoch 6/40, TrainLoss: 0.6839, ValLoss: 0.6774, Accuracy: 0.5824, AUC: 0.5867, Sensitivity: 0.9778, Specificity: 0.1957
Epoch 11/40, TrainLoss: 0.4866, ValLoss: 0.3833, Accuracy: 0.8462, AUC: 0.8464, Sensitivity: 0.8667, Specificity: 0.8261
Epoch 16/40, TrainLoss: 0.4589, ValLoss: 0.3749, Accuracy: 0.8462, AUC: 0.8464, Sensitivity: 0.8667, Specificity: 0.8261
Epoch 21/40, TrainLoss: 0.4347, ValLoss: 0.3704, Accuracy: 0.8462, AUC: 0.8454, Sensitivity: 0.7778, Specificity: 0.9130
Epoch 26/40, TrainLoss: 0.4217, ValLoss: 0.3631, Accuracy: 0.8462, AUC: 0.8464, Sensitivity: 0.8667, Specificity: 0.8261
Epoch 31/40, TrainLoss: 0.4437, ValLoss: 0.3875, Accuracy: 0.8132, AUC: 0.8123, Sensitivity: 0.7333, Specificity: 0.8913
Epoch 36/40, TrainLoss: 0.4204, ValLoss: 0.3636, Accuracy: 0.8462, AUC: 0.8457, Sensitivity: 0.8000, Specificity: 0.8913


In [76]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs > 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())

    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)

    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)
    sensitivity = recall_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test AUC: {auc:.4f}')
    print(f'Test Sensitivity (Recall): {sensitivity:.4f}')
    print(f'Test Specificity: {specificity:.4f}')

    return accuracy, auc, sensitivity, specificity

# Evaluate
evaluate_model(model, test_loader)


Test Accuracy: 0.8318
Test AUC: 0.8326
Test Sensitivity (Recall): 0.9245
Test Specificity: 0.7407


(0.8317757009345794,
 0.8326345213137666,
 0.9245283018867925,
 0.7407407407407407)

## Stacked LSTM

In [78]:
import torch
import torch.nn as nn

class StackedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(StackedLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # LSTM output: (batch_size, seq_len, hidden_size)
        out = self.fc(lstm_out[:, -1, :])  # Take the last time step's output
        return self.sigmoid(out)


In [82]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

# Function to calculate specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs=25):
    writer = SummaryWriter(tensorboard_log_dir)

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Validation
        model.eval()
        all_labels = []
        all_preds = []
        val_losses = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                loss = criterion(outputs, labels.unsqueeze(1))
                val_losses.append(loss.item())
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())

            avg_val_loss = np.mean(val_losses)
            writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)

        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')

    writer.close()
    
import torch.optim as optim
import time

# Hyperparameters
input_size = 128  # Feature size (e.g., one-hot encoding or embedding)
hidden_size = 128  # LSTM hidden state size
num_layers = 3  # Stacked LSTM layers
output_size = 1  # Binary classification
batch_size = 16
num_epochs = 26
learning_rate = 0.0001
tensorboard_log_dir = f"runs/stacked_lstm/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = StackedLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)


Epoch 1/26, TrainLoss: 0.6936, ValLoss: 0.6927, Accuracy: 0.5055, AUC: 0.5000, Sensitivity: 0.0000, Specificity: 1.0000
Epoch 6/26, TrainLoss: 0.6873, ValLoss: 0.6814, Accuracy: 0.5495, AUC: 0.5541, Sensitivity: 0.9778, Specificity: 0.1304
Epoch 11/26, TrainLoss: 0.5466, ValLoss: 0.4706, Accuracy: 0.7802, AUC: 0.7802, Sensitivity: 0.7778, Specificity: 0.7826
Epoch 16/26, TrainLoss: 0.5107, ValLoss: 0.4543, Accuracy: 0.7912, AUC: 0.7923, Sensitivity: 0.8889, Specificity: 0.6957
Epoch 21/26, TrainLoss: 0.4844, ValLoss: 0.4219, Accuracy: 0.8132, AUC: 0.8135, Sensitivity: 0.8444, Specificity: 0.7826
Epoch 26/26, TrainLoss: 0.4759, ValLoss: 0.4417, Accuracy: 0.8022, AUC: 0.8002, Sensitivity: 0.6222, Specificity: 0.9783


In [81]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            preds = (outputs > 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())

    all_labels = np.array(all_labels)
    all_preds = np.array(all_preds)

    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)
    sensitivity = recall_score(all_labels, all_preds)
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test AUC: {auc:.4f}')
    print(f'Test Sensitivity (Recall): {sensitivity:.4f}')
    print(f'Test Specificity: {specificity:.4f}')

    return accuracy, auc, sensitivity, specificity

# Evaluate
evaluate_model(model, test_loader)


Test Accuracy: 0.8411
Test AUC: 0.8405
Test Sensitivity (Recall): 0.7736
Test Specificity: 0.9074


(0.8411214953271028,
 0.8404961565338924,
 0.7735849056603774,
 0.9074074074074074)

# Generative model

In [122]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split

# Load Dataset
df = pd.read_csv('all_seq722.csv')
df = df[df['Sequences'].str.len() >= 3]  # Remove rows with less than 3 letters in sequences
df = df[df['AMP'] != 0]  # Remove rows where AMP is 0

# Preprocess sequences
df = df[~df["Sequences"].str.contains('-')]
df['Sequences'] = df['Sequences'].str.upper()
max_length = df['Sequences'].str.len().max()
df['Sequences'] = df['Sequences'].apply(lambda x: x.ljust(max_length, 'X'))  # Pad sequences

# Define One-Hot Encoding Function for Sequences in PyTorch
def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard amino acids
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)  # One-hot encoded matrix
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

def preprocess_sequence(full_sequence, max_length):
    """
    Prepares input and target sequences while avoiding padding ('X') during training.
    
    Args:
        full_sequence (str): The original sequence including 'X' padding.
        max_length (int): Maximum sequence length (for padding).
    
    Returns:
        Tuple of (input sequence, target sequence) after removing padding.
    """
    # Remove trailing 'X' (padding) before slicing
    trimmed_sequence = full_sequence.rstrip('X')

    # Ensure there's at least 2 valid residues left for training
    if len(trimmed_sequence) < 2:
        return None, None  # Skip sequences that are too short to predict anything

    # Create input & target sequences (without padding)
    input_seq = trimmed_sequence[:-1]  # Exclude last valid residue
    target_seq = trimmed_sequence[1:]  # Shifted by one position

    # Pad back to maintain fixed-length format
    input_seq = input_seq.ljust(max_length - 1, 'X')  # Pad to max_length - 1
    target_seq = target_seq.ljust(max_length - 1, 'X')  # Target also gets 'X' padding

    return input_seq, target_seq

# Custom Dataset for Generative Model
class GenerativeSequenceDataset(Dataset):
    def __init__(self, sequences, max_length, one_hot_dtype=torch.float32):
        self.sequences = sequences
        self.max_length = max_length
        self.one_hot_dtype = one_hot_dtype

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        full_sequence = self.sequences.iloc[idx]  # Get sequence

        # Process input & target while handling padding correctly
        input_seq, target_seq = preprocess_sequence(full_sequence, self.max_length)
        if input_seq is None or target_seq is None:
            return None  # Skip sequences that are too short

        # Convert to one-hot encoding
        input_one_hot = one_hot_torch(input_seq, dtype=self.one_hot_dtype)
        target_one_hot = one_hot_torch(target_seq, dtype=self.one_hot_dtype)

        return input_one_hot, target_one_hot


# Split into train (70%), validation (15%), test (15%)
X_train_val, X_test = train_test_split(df["Sequences"], test_size=0.15, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.15, random_state=42)  

# Convert to PyTorch Dataset
train_dataset = GenerativeSequenceDataset(X_train, max_length)
val_dataset = GenerativeSequenceDataset(X_val, max_length)
test_dataset = GenerativeSequenceDataset(X_test, max_length)

# Define DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display dataset sizes
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
print(dataset_sizes)


{'Train': 255, 'Validation': 45, 'Test': 54}


In [123]:
def one_hot_to_sequence(one_hot_tensor):
    """
    Converts a one-hot encoded tensor back into an amino acid sequence,
    handling fully zero vectors (padding 'X').

    Args:
        one_hot_tensor (torch.Tensor): One-hot encoded tensor of shape (num_amino_acids, seq_length)

    Returns:
        str: Decoded amino acid sequence
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard amino acids
    seq_length = one_hot_tensor.shape[1]  # Get sequence length
    
    decoded_sequence = ""
    for i in range(seq_length):
        column = one_hot_tensor[:, i]  # Get one-hot vector for residue
        
        if torch.sum(column) == 0:  # If it's fully zero, assume it's padding
            decoded_sequence += "X"
        else:
            amino_acid_index = torch.argmax(column).item()  # Get highest probability index
            decoded_sequence += amino_acids[amino_acid_index]  # Convert index to amino acid
    
    return decoded_sequence

for x, y in train_loader:
    # print("One-Hot Encoded Input:\n", x)
    # print("One-Hot Encoded Target:\n", y)
    
    # Convert back to amino acid sequences
    input_seq_decoded = one_hot_to_sequence(x[0])  # Decode first sample in batch
    target_seq_decoded = one_hot_to_sequence(y[0])  # Decode first target sequence
    
    print("\nDecoded Input Sequence:", input_seq_decoded)
    print("Decoded Target Sequence:", target_seq_decoded)
    break  # Print one batch and exit



Decoded Input Sequence: RWRRKWWWXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Decoded Target Sequence: WRRKWWWWXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


### LSTM

In [124]:
import torch
import torch.nn as nn
import torch.optim as optim
ic.enable()
class LSTM_Generative(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM_Generative, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)  # Softmax for probability distribution

    def forward(self, x):
        ic(x.shape)
        lstm_out, _ = self.lstm(x)  # Shape: (batch_size, seq_length, hidden_size)
        out = self.fc(lstm_out)  # Shape: (batch_size, seq_length, output_size)
        return self.softmax(out)  # Probability distribution over amino acids


In [None]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np
ic.disable()
def train_lstm(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs=25):
    writer = SummaryWriter(tensorboard_log_dir)

    for epoch in range(num_epochs):
        model.train()
        train_losses = []

        for input_seq, target_seq in train_loader:
            optimizer.zero_grad()
            outputs = model(input_seq)  # Forward pass

            # Convert one-hot target to class indices (needed for CrossEntropyLoss)
            ic(target_seq.shape)
            target_indices = torch.argmax(target_seq, dim=-1)  # Shape: (batch_size, seq_length
            ic(target_indices.shape)
            ic(outputs.shape)
            # Compute loss (CrossEntropy expects target of shape (batch_size, seq_length))
            loss = criterion(outputs.view(-1, outputs.shape[-1]), target_indices.view(-1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Validation Step
        model.eval()
        val_losses = []
        with torch.no_grad():
            for input_seq, target_seq in val_loader:
                outputs = model(input_seq)

                # Convert one-hot target to class indices
                target_indices = torch.argmax(target_seq, dim=-1)

                loss = criterion(outputs.view(-1, outputs.shape[-1]), target_indices.view(-1))
                val_losses.append(loss.item())

        avg_val_loss = np.mean(val_losses)
        writer.add_scalar('Loss/Val', avg_val_loss, epoch)

        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, TrainLoss: {avg_train_loss:.4f}, ValLoss: {avg_val_loss:.4f}')

    writer.close()


import time

# Define Amino Acid Encoding
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
char_to_index = {aa: i for i, aa in enumerate(amino_acids)}
index_to_char = {i: aa for i, aa in enumerate(amino_acids)}

# Hyperparameters
input_size = 127 #len(amino_acids)  # Number of amino acids (20)
hidden_size = 128  # LSTM hidden state size
num_layers = 2  # Stacked LSTM layers
output_size = 127#len(amino_acids)  # Predicting one of 20 amino acids
batch_size = 16
num_epochs = 30
learning_rate = 0.001
tensorboard_log_dir = f"runs/generative_lstm/experiment_{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Model, Loss, Optimizer
model = LSTM_Generative(input_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss(ignore_index=20)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the Model
train_lstm(model, train_loader, val_loader, criterion, optimizer, tensorboard_log_dir, num_epochs)


ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])


ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: 

Epoch 1/30, TrainLoss: 4.8296, ValLoss: 4.6929


ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| output

Epoch 6/30, TrainLoss: 4.1966, ValLoss: 4.2045


ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| output

Epoch 11/30, TrainLoss: 4.1914, ValLoss: 4.1993


ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 127])
ic| target_indices.shape: torch.Size([16, 20])
ic| outputs.shape: torch.Size([16, 20, 127])
ic| x.shape: torch.Size([16, 20, 127])
ic| target_seq.shape: torch.Size([16, 20, 

In [116]:
outputs


NameError: name 'outputs' is not defined

In [None]:

, target_indices.view(-1))

In [98]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score

def evaluate_lstm(model, test_loader, criterion):
    model.eval()  # Set model to evaluation mode
    
    total_loss = 0
    total_accuracy = 0
    total_perplexity = 0
    num_samples = 0

    with torch.no_grad():
        for input_seq, target_seq in test_loader:
            outputs = model(input_seq)  # Predictions (batch_size, seq_length, num_classes)

            # Compute loss
            loss = criterion(outputs.view(-1, outputs.shape[-1]), target_seq.view(-1, target_seq.shape[-1]))
            total_loss += loss.item()

            # Compute accuracy (compare predicted residues with ground truth)
            predicted_indices = torch.argmax(outputs, dim=-1)  # Get highest probability residues
            target_indices = torch.argmax(target_seq, dim=-1)  # Get actual residues
            batch_accuracy = (predicted_indices == target_indices).float().mean().item()
            total_accuracy += batch_accuracy

            # Compute perplexity (exp of cross-entropy loss)
            batch_perplexity = torch.exp(loss).item()
            total_perplexity += batch_perplexity

            num_samples += 1

    # Calculate averages
    avg_loss = total_loss / num_samples
    avg_accuracy = total_accu
