In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier

import random
import numpy as np
import torch

from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np
import time

from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import optuna

import random
import numpy as np
from collections import Counter
import torch
import torch.nn.functional as F
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [20]:
import os
import random
import numpy as np

# If using PyTorch
import torch

# If using TensorFlow

# Optional: If using Python hash-based functions
os.environ["PYTHONHASHSEED"] = "42"

# Set seed for base Python random
random.seed(42)

# Set seed for NumPy
np.random.seed(42)

# Set seed for PyTorch (CPU and GPU)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)  # if using multi-GPU

# Force deterministic behavior in PyTorch
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


## General AMP

### Data

In [21]:
dbbasp = pd.read_csv("../models1/database_check/DBAASP_peptides.csv")
dbbasp = dbbasp[dbbasp["SEQUENCE"].str.len() >= 10]
dbbasp = dbbasp[~dbbasp["TARGET GROUP"].str.contains("Fungus", na=False)]
dbbasp = dbbasp[["ID", "SEQUENCE"]]
dbbasp.columns = ["Peptide ID", "Sequence"]
adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")
adam_df = pd.concat([adam_df, dbbasp], ignore_index=True)
# uniprot_df = pd.read_csv("../data/uniprotkb_length_10_TO_80_NOT_antimicro_2025_04_14.fasta.csv")
# uniprot_df1 = pd.read_csv("../data/uniprotkb_length_10_TO_80_NOT_antimicro_2025_04_14.fasta1.csv")
# uniprot_df = pd.concat([uniprot_df, uniprot_df1], ignore_index=True)

#### raw data gen

In [22]:
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load only positive (AMP) sequences
adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")

unique_letters = set(''.join(adam_df["Sequence"]))
print(unique_letters)
print(len(unique_letters))
print(f"Number of sequences after filtering: {len(adam_df)}")
adam_df = adam_df.drop_duplicates(subset='Sequence')
tb_df = pd.read_csv('../data/all_seq702.csv')
adam_df = adam_df[~adam_df['Sequence'].isin(tb_df['Sequences'])]
adam_df = adam_df[adam_df["Sequence"].str.len() >= 10]
generation_seqs = adam_df["Sequence"].reset_index(drop=True)

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

class GenerativeSequenceDataset(Dataset):
    def __init__(self, sequences, one_hot_dtype=torch.float32):
        self.sequences = sequences
        self.one_hot_dtype = one_hot_dtype

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences.iloc[idx]
        input_seq = seq[:-1]  # all residues except the last
        target_seq = seq[1:]  # all residues except the first
        length = len(input_seq.replace("X", ""))  # unpadded length
        input_one_hot = one_hot_torch(input_seq, dtype=self.one_hot_dtype)
        target_one_hot = one_hot_torch(target_seq, dtype=self.one_hot_dtype)
        # target_indices = torch.tensor(["ACDEFGHIKLMNPQRSTVWY".index(res) for res in target_seq], dtype=torch.long)
        return input_one_hot, target_one_hot, length

def generative_collate_and_pack(batch):
    sequences, targets, lengths = zip(*batch)

    lengths = torch.tensor(lengths)
    sorted_indices = torch.argsort(lengths, descending=True)
    sequences = [sequences[i] for i in sorted_indices]
    targets = [targets[i] for i in sorted_indices]
    lengths = lengths[sorted_indices]

    sequences = [seq.T for seq in sequences]  # transpose to [seq_len, features]
    targets = [tgt.T for tgt in targets]      # transpose targets as well

    padded_seqs = pad_sequence(sequences, batch_first=False)
    padded_targets = pad_sequence(targets, batch_first=False)

    packed_input = pack_padded_sequence(padded_seqs, lengths.cpu(), batch_first=False)
    packed_target = pack_padded_sequence(padded_targets, lengths.cpu(), batch_first=False)

    return packed_input, packed_target, lengths


# Train/val/test split
train_seqs, test_seqs = train_test_split(generation_seqs, test_size=0.3, random_state=42)
val_seqs, test_seqs = train_test_split(test_seqs, test_size=0.5, random_state=42)

train_dataset = GenerativeSequenceDataset(train_seqs)
val_dataset = GenerativeSequenceDataset(val_seqs)
test_dataset = GenerativeSequenceDataset(test_seqs)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=generative_collate_and_pack)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)

# Dataset sizes
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
print("Dataset sizes:", dataset_sizes)

for x, y, l in train_loader:
    print("Input shape:", x.data.shape)  # [L, B, 20]
    print("Target shape:", y.data.shape)  # [L, B, 20]
    print("Lengths:", y.batch_sizes)  # Lengths of sequences in the batch
    break

{'K', 'C', 'Q', 'G', 'F', 'A', 'D', 'Y', 'P', 'M', 'H', 'E', 'I', 'V', 'L', 'S', 'T', 'W', 'R', 'N'}
20
Number of sequences after filtering: 3306
Dataset sizes: {'Train': 2219, 'Validation': 475, 'Test': 476}
Input shape: torch.Size([1793, 20])
Target shape: torch.Size([1793, 20])
Lengths: tensor([64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 59, 57, 55, 53, 53, 50,
        46, 45, 40, 40, 38, 35, 32, 31, 31, 30, 29, 29, 28, 26, 20, 19, 16, 14,
        13, 10,  7,  5,  5,  4,  4,  4,  4,  3,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])


#### sliding window data gen

In [23]:
# # Re-import required libraries after environment reset
# import torch
# import pandas as pd
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
# from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split

# # Load only positive (AMP) sequences
# adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")

# # Clean non-standard amino acids
# unique_letters = set(''.join(adam_df["Sequence"]))
# amino_acids = "ACDEFGHIKLMNPQRSTVWY"
# # non_standard_amino_acids = set(unique_letters) - set(amino_acids)
# # adam_df = adam_df[~adam_df["Sequence"].str.contains('|'.join(non_standard_amino_acids))]

# # Apply sliding window to generate fragments
# def generate_fragments(sequences, window_size=15, stride=5):
#     fragments = []
#     for seq in sequences:
#         for start in range(0, len(seq) - window_size + 1, stride):
#             fragment = seq[start:start + window_size]
#             fragments.append(fragment)
#     return fragments

# generation_fragments = generate_fragments(adam_df["Sequence"].tolist())

# # Define one-hot encoding function
# def one_hot_torch(seq: str, dtype=torch.float32):
#     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
#     seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
#     aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
#     arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
#     for i, aa in enumerate(aa_bytes):
#         arr[i, seq_bytes == aa] = 1
#     return arr

# # Dataset using one-hot encoding for generative modeling
# class AMPGenerationOneHotDataset(Dataset):
#     def __init__(self, sequences):
#         self.sequences = sequences

#     def __len__(self):
#         return len(self.sequences)

#     def __getitem__(self, idx):
#         seq = self.sequences[idx]
#         input_seq = seq[:-1]  # all residues except the last
#         target_seq = seq[1:]  # all residues except the first
#         input_one_hot = one_hot_torch(input_seq)  # shape: [20, seq_len - 1]
#         target_one_hot = one_hot_torch(target_seq)  # shape: [20, seq_len - 1]
#         length = input_one_hot.shape[1]
#         return input_one_hot, target_one_hot, length

# # Collate function for packing one-hot sequences
# def collate_and_pack_for_generation(batch):
#     sequences, targets, lengths = zip(*batch)
#     lengths = torch.tensor(lengths)

#     # Sort by length (required for packing)
#     sorted_indices = torch.argsort(lengths, descending=True)
#     sequences = [sequences[i] for i in sorted_indices]
#     targets = [targets[i] for i in sorted_indices]
#     lengths = lengths[sorted_indices]

#     # Transpose each to [L, 20] and pad
#     sequences = [seq.T for seq in sequences]  # from [20, L] to [L, 20]
#     targets = [tgt.T for tgt in targets]      # from [20, L] to [L, 20]

#     padded_seqs = pad_sequence(sequences, batch_first=False)  # [L, B, 20]
#     padded_targets = pad_sequence(targets, batch_first=False)  # [L, B, 20]

#     packed_input = pack_padded_sequence(padded_seqs, lengths.cpu(), batch_first=False)
#     packed_target = pack_padded_sequence(padded_targets, lengths.cpu(), batch_first=False)

#     return packed_input, packed_target, lengths

# # Train/val/test split
# train_seqs, test_seqs = train_test_split(generation_fragments, test_size=0.3, random_state=42)
# val_seqs, test_seqs = train_test_split(test_seqs, test_size=0.5, random_state=42)

# train_dataset = AMPGenerationOneHotDataset(train_seqs)
# val_dataset = AMPGenerationOneHotDataset(val_seqs)
# test_dataset = AMPGenerationOneHotDataset(test_seqs)

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_and_pack_for_generation)
# val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_and_pack_for_generation)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_and_pack_for_generation)

# # Dataset sizes for verification
# dataset_sizes = {
#     "Train": len(train_dataset),
#     "Validation": len(val_dataset),
#     "Test": len(test_dataset)
# }
# print("Dataset sizes:", dataset_sizes)

# for x, y, l in train_loader:
#     print("Input shape:", x.data.shape)  # [L, B, 20]
#     print("Target shape:", y.data.shape)  # [L, B, 20]
#     print("Lengths:", y.batch_sizes)  # Lengths of sequences in the batch
#     break

### model

In [24]:
class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3, output_dim=20):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, packed_input):
        packed_output, _ = self.lstm(packed_input)
        dropped = self.dropout(packed_output.data)
        logits = self.fc(dropped)
        return logits  # shape: [total_timesteps, 20]

### test

In [None]:
model = GenerativeLSTM()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

# Train and eval functions
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for packed_input, packed_target, _ in dataloader:
        inputs = packed_input.to(device)
        targets = torch.argmax(packed_target.data, dim=1).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for packed_input, packed_target, _ in dataloader:
            inputs = packed_input.to(device)
            targets = torch.argmax(packed_target.data, dim=1).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

# Final test evaluation
test_loss = evaluate(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")


## Model

In [25]:
from torch.nn.utils.rnn import pad_packed_sequence


class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3):
        super(GenerativeLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Handle packed input
        if isinstance(x, torch.nn.utils.rnn.PackedSequence):
            packed_output, _ = self.lstm(x)
            unpacked_output, _ = pad_packed_sequence(packed_output, batch_first=True)
            return self.fc(unpacked_output)
        else:
            out, _ = self.lstm(x)
            return self.fc(out)



#### General AMP - train

In [9]:
# Re-import necessary packages after reset
import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import optuna
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
from torch.utils.tensorboard import SummaryWriter


# Criterion

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


def compute_last_token_loss(output, target_seq, criterion):
    """
    Computes cross-entropy loss on the last time step of each sequence.
    
    Args:
        output: Tensor of shape [B, L, vocab_size]
        target_seq: Tensor of shape [B, L] containing target class indices
    
    Returns:
        loss: Scalar loss computed only on the last token of each sequence
    """
    # Get last time step for each sequence
    last_token_logits = output[:, -1, :]        # [B, vocab_size]
    last_token_targets = target_seq[:, -1, :]      # [B]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)  #  now shape is [batch_size, seq_len]

    # print('last_token_logits',last_token_logits.shape)
    # print('last_token_targets',last_token_targets.shape)

    return criterion(last_token_logits, last_token_targets)

# Training function
def train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=1e-3, weight_decay=1e-4,
                           device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False, train=True):
    model.to(device)
    if not train:
        model.eval()

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_loss = float('inf')
    log_dir = f"runs-lstm-gen/AMP_LSTM_GEN_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    writer = SummaryWriter(log_dir=log_dir)

    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for input_seq, target_seq, _ in train_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])      # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            
            # print('target_shape before reshape',target_seq.shape)
            # target_seq = target_seq.reshape(-1)
            # print(f"Output shape: {output.shape}, Target shape: {target_seq.shape}")

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        val_loss, acc, auc = evaluate_model_generation(model, val_loader, criterion, device, verbose)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        if verbose:
            print(f"Epoch [{epoch}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {acc:.4f}, AUC: {auc}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            if train:
                torch.save(model.state_dict(), 'best_model_lstm_generator.pt')

    writer.close()
    return best_val_loss

# Evaluation function
def evaluate_model_generation(model, data_loader, criterion, device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False):
    model.eval()
    all_labels = []
    all_preds = []
    total_loss = 0.0
    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])  # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            # # target_seq = target_seq.view(-1)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # # target_seq = target_seq.reshape(-1)
            # # target_seq = target_seq.reshape(-1, target_seq.shape[-1])
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            # assert output.size(0) == target_seq.size(0), f"Mismatch: {output.size(0)} vs {target_seq.size(0)}"

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            # print('loss done')
            total_loss += loss.item()
            
            
            preds = output[:, -1, :]        # shape: [B, vocab_size]
            preds = torch.argmax(preds, dim=1)  # shape: [B]

            targets = target_seq[:, -1, :]      # shape: [B, vocab_size]
            targets = torch.argmax(targets, dim=-1)  # shape: [B]
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except:
    auc = "undefined"

    return avg_loss, acc, auc

# Objective for Optuna tuning
def objective_generation(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)

    model = GenerativeLSTM(hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout)
    val_loss = train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=lr, weight_decay=weight_decay, verbose=False)
    return val_loss

study = optuna.create_study(direction="minimize")
study.optimize(objective_generation, n_trials=20)

lstm_gen_best_params = study.best_trial.params
print(lstm_gen_best_params)

[I 2025-05-01 10:15:27,715] A new study created in memory with name: no-name-f1f9add1-e643-4a26-a9ec-34bc8c6709d0
[I 2025-05-01 10:16:06,930] Trial 0 finished with value: 0.9415398016571999 and parameters: {'hidden_dim': 188, 'num_layers': 3, 'dropout': 0.1784113931084026, 'lr': 0.003916594127886763, 'weight_decay': 0.0005862332247206099}. Best is trial 0 with value: 0.9415398016571999.
[I 2025-05-01 10:16:46,849] Trial 1 finished with value: 2.5822975635528564 and parameters: {'hidden_dim': 197, 'num_layers': 3, 'dropout': 0.2848177027449981, 'lr': 0.0006319414902667608, 'weight_decay': 0.0006788940039732188}. Best is trial 0 with value: 0.9415398016571999.
[I 2025-05-01 10:17:30,858] Trial 2 finished with value: 2.237904667854309 and parameters: {'hidden_dim': 233, 'num_layers': 3, 'dropout': 0.3197777425446538, 'lr': 0.001122873472308211, 'weight_decay': 0.0002095358074265867}. Best is trial 0 with value: 0.9415398016571999.
[I 2025-05-01 10:18:13,597] Trial 3 finished with value: 0

{'hidden_dim': 106, 'num_layers': 1, 'dropout': 0.14740305321048008, 'lr': 0.009969866325800866, 'weight_decay': 5.187258217307484e-06}


In [27]:
lstm_gen_best_params = {'hidden_dim': 106, 'num_layers': 1, 'dropout': 0.14740305321048008, 'lr': 0.009969866325800866, 'weight_decay': 5.187258217307484e-06}


In [None]:
!tensorboard --logdir runs-lstm-gen

#### test

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
import math

# --- Assumes you already have these from your previous steps ---
# lstm_gen_best_params
# train_loader, val_loader, test_loader
# GenerativeLSTM
# compute_last_token_loss

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_final_model(model, train_loader, val_loader, num_epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lstm_gen_best_params["lr"], weight_decay=lstm_gen_best_params["weight_decay"])
    writer = SummaryWriter(log_dir=f"runs-lstm-gen/AMPGen_LSTM_final")

    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for input_seq, target_seq, _ in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()

            output = model(input_seq)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, acc, auc, perp = evaluate_final_model(model, test_loader)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc} | Perplexity = {perp:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save(model.state_dict(), "best_model_lstm_generator.pt")

    writer.close()
    return model

def evaluate_final_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]  # [B, vocab]
            preds = torch.argmax(preds, dim=1)
            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except Exception:
    auc = "undefined"

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    return avg_loss, acc, auc, perplexity

# --- Build and train final model using best parameters ---
# lstm_gen_best_params = {'hidden_dim': 133, 'num_layers': 2, 'dropout': 0.10063270147175422, 'lr': 0.003237280156212186, 'weight_decay': 2.2594437829479466e-05}

final_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params["dropout"]
)

trained_model = train_final_model(final_model, train_loader, val_loader, num_epochs=20)

# --- Evaluate on test set ---
test_loss, test_acc, test_auc, perp = evaluate_final_model(trained_model, test_loader)
print(f"\n✅ Final Test Metrics:\nLoss = {test_loss:.4f}, Accuracy = {test_acc:.4f}, AUC = {test_auc}, Perplexity = {perp:.4f}")
torch.save(trained_model.state_dict(), 'best_model_lstm_generator_final.pt')


Epoch 1: Train Loss = 2.7806 | Val Loss = 2.4566 | Acc = 0.9832 | AUC = undefined | Perplexity = 11.6652
Epoch 2: Train Loss = 2.1617 | Val Loss = 1.8715 | Acc = 0.9832 | AUC = undefined | Perplexity = 6.4982
Epoch 3: Train Loss = 1.6136 | Val Loss = 1.3766 | Acc = 0.9832 | AUC = undefined | Perplexity = 3.9615
Epoch 4: Train Loss = 1.1763 | Val Loss = 0.9982 | Acc = 0.9832 | AUC = undefined | Perplexity = 2.7134
Epoch 5: Train Loss = 0.8453 | Val Loss = 0.7492 | Acc = 0.9832 | AUC = undefined | Perplexity = 2.1154
Epoch 6: Train Loss = 0.6172 | Val Loss = 0.5569 | Acc = 0.9832 | AUC = undefined | Perplexity = 1.7452
Epoch 7: Train Loss = 0.4617 | Val Loss = 0.4207 | Acc = 0.9853 | AUC = undefined | Perplexity = 1.5230
Epoch 8: Train Loss = 0.3594 | Val Loss = 0.3449 | Acc = 0.9853 | AUC = undefined | Perplexity = 1.4118
Epoch 9: Train Loss = 0.2935 | Val Loss = 0.2825 | Acc = 0.9853 | AUC = undefined | Perplexity = 1.3265
Epoch 10: Train Loss = 0.2424 | Val Loss = 0.2422 | Acc = 0.983

## tb amp

In [4]:
df = pd.read_csv('../data/all_seq702.csv')
df = df.drop_duplicates(subset='Sequences')

max_length = df['Sequences'].str.len().max()
print(max_length)
# df['Sequences'] = df['Sequences'].apply(lambda x: x.ljust(max_length, 'X'))

unique_letters = set(''.join(df["Sequences"]))
print(unique_letters)
print(len(unique_letters))
amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
non_standard_amino_acids = unique_letters - amino_acids
print(non_standard_amino_acids)
b_count = df["Sequences"].str.count('B').sum()
print(f"Number of 'B' values: {b_count}")
# manually replaced one of the B with D and the other with N

X = df["Sequences"]
y = df["AMP"]

# df_filtered = df[
#     (df['Sequences'].str.len() >= 10) &
#     (df['Sequences'].apply(lambda x: len(set(x)) > 1)) &
#     (~df['Sequences'].str.contains('X'))
# ]
df_filtered = df

def split_sequence(seq, chunk_size=20):
    return [seq[i:i+chunk_size] for i in range(0, len(seq), chunk_size)]

new_rows = []
for _, row in df_filtered.iterrows():
    seq = row['Sequences']
    amp_label = row['AMP']
    if len(seq) > 40:
        for chunk in split_sequence(seq, 20):
            new_rows.append({'Sequences': chunk, 'AMP': amp_label})
    else:
        new_rows.append({'Sequences': seq, 'AMP': amp_label})

df_filtered = pd.DataFrame(new_rows)


df_filtered = df_filtered[
    (df_filtered['Sequences'].str.len() >= 10) &
    (df_filtered['Sequences'].apply(lambda x: len(set(x)) > 1)) &
    (~df_filtered['Sequences'].str.contains('X'))
]
df_filtered = df_filtered[df_filtered['AMP']==1]
df_filtered = df_filtered.drop_duplicates(subset='Sequences')


128
{'K', 'C', 'Q', 'G', 'X', 'F', 'A', 'D', 'Y', 'P', 'M', 'H', 'E', 'I', 'V', 'L', 'S', 'T', 'W', 'R', 'N'}
21
{'X'}
Number of 'B' values: 0


In [5]:
print(min([len(x) for x in df_filtered['Sequences']]))
print(max([len(x) for x in df_filtered['Sequences']]))

10
40


In [6]:
# Re-import libraries after environment reset
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split

# Load data
df = df_filtered

# Clean and inspect
amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
unique_letters = set(''.join(df["Sequences"]))
# non_standard_amino_acids = unique_letters - amino_acids
# df = df[~df["Sequences"].str.contains('|'.join(non_standard_amino_acids))]

# Extract sequences
sequences = df["Sequences"].reset_index(drop=True)

# Define one-hot function
def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

# Define dataset class
class GenerativeSequenceDataset(Dataset):
    def __init__(self, sequences, one_hot_dtype=torch.float32):
        self.sequences = sequences
        self.one_hot_dtype = one_hot_dtype

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences.iloc[idx]
        input_seq = seq[:-1]
        target_seq = seq[1:]
        length = len(input_seq.replace("X", ""))
        input_one_hot = one_hot_torch(input_seq, dtype=self.one_hot_dtype)
        target_one_hot = one_hot_torch(target_seq, dtype=self.one_hot_dtype)
        return input_one_hot, target_one_hot, length

# Define collate function
def generative_collate_and_pack(batch):
    sequences, targets, lengths = zip(*batch)
    lengths = torch.tensor(lengths)
    sorted_indices = torch.argsort(lengths, descending=True)
    sequences = [sequences[i] for i in sorted_indices]
    targets = [targets[i] for i in sorted_indices]
    lengths = lengths[sorted_indices]
    sequences = [seq.T for seq in sequences]
    targets = [tgt.T for tgt in targets]
    padded_seqs = pad_sequence(sequences, batch_first=False)
    padded_targets = pad_sequence(targets, batch_first=False)
    packed_input = pack_padded_sequence(padded_seqs, lengths.cpu(), batch_first=False)
    packed_target = pack_padded_sequence(padded_targets, lengths.cpu(), batch_first=False)
    return packed_input, packed_target, lengths

# Split and load data
train_seqs, test_seqs = train_test_split(sequences, test_size=0.3, random_state=42)
val_seqs, test_seqs = train_test_split(test_seqs, test_size=0.5, random_state=42)
train_dataset = GenerativeSequenceDataset(train_seqs)
val_dataset = GenerativeSequenceDataset(val_seqs)
test_dataset = GenerativeSequenceDataset(test_seqs)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=generative_collate_and_pack)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)

# Preview batch
batch_sample = next(iter(train_loader))
batch_sample_shapes = {
    "Input shape": batch_sample[0].data.shape,
    "Target shape": batch_sample[1].data.shape,
    "Lengths": batch_sample[0].batch_sizes
}
batch_sample_shapes


{'Input shape': torch.Size([1203, 20]),
 'Target shape': torch.Size([1203, 20]),
 'Lengths': tensor([64, 64, 64, 64, 64, 64, 64, 64, 64, 55, 53, 46, 42, 40, 39, 39, 34, 34,
         34, 25, 25, 23, 22, 21, 15, 11, 10, 10,  9,  7,  6,  6,  6,  6,  3,  3,
          2,  1])}

In [7]:
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
print("Dataset sizes:")
for name, size in dataset_sizes.items():
    print(f"{name}: {size}")

Dataset sizes:
Train: 171
Validation: 37
Test: 37


### train for full backprop

In [28]:

# Criterion
PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Transfer Learning Loader
def load_pretrained_weights(model, checkpoint_path):
    pretrained_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(pretrained_dict, strict=False)
    return model

def compute_last_token_loss(output, target_seq, criterion):
    last_token_logits = output[:, -1, :]
    last_token_targets = target_seq[:, -1, :]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)
    return criterion(last_token_logits, last_token_targets)

# Training function
def train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=1e-3, weight_decay=1e-4,
                           device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False, train=True):
    model.to(device)
    if not train:
        model.eval()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_loss = float('inf')
    log_dir = f"runs-lstm-gen-tb/AMP_LSTM_GEN_TRANSFER_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    writer = SummaryWriter(log_dir=log_dir)

    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for input_seq, target_seq, _ in train_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        val_loss, acc, auc = evaluate_model_generation(model, val_loader, criterion, device, verbose)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        if verbose:
            print(f"Epoch [{epoch}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {acc:.4f}, AUC: {auc}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # if train:
            #     torch.save(model.state_dict(), 'best_model_lstm_transfer.pt')

    writer.close()
    return best_val_loss

# Evaluation function
def evaluate_model_generation(model, data_loader, criterion, device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False):
    model.eval()
    all_labels = []
    all_preds = []
    total_loss = 0.0
    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]
            preds = torch.argmax(preds, dim=1)

            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except:
    auc = "undefined"
    
    return avg_loss, acc, auc



# Optuna objective for BiLSTM transfer
def load_partial_weights(model, checkpoint_path, max_layers=None):
    """
    Load up to `max_layers` compatible layers from a checkpoint into the model.
    If max_layers is None, load all compatible layers.
    """
    pretrained_dict = torch.load(checkpoint_path, map_location='cpu')
    model_dict = model.state_dict()

    # Filter compatible layers
    compatible_items = [
        (k, v) for k, v in pretrained_dict.items()
        if k in model_dict and model_dict[k].shape == v.shape
    ]

    # Limit number of layers to load
    if max_layers is not None:
        compatible_items = compatible_items[:max_layers]

    # Convert list of tuples back to dict
    compatible_dict = dict(compatible_items)

    # Update model state dict
    model_dict.update(compatible_dict)
    model.load_state_dict(model_dict)
    print(f"✅ Loaded {len(compatible_dict)} matching layers from checkpoint.")
    return model


def freeze_encoder(model, num_layers_to_freeze):
    """
    Freezes the first `num_layers_to_freeze` LSTM layers of the model.
    Assumes parameter names follow standard PyTorch LSTM naming.
    """
    if num_layers_to_freeze <= 0:
        print("⚠️ No LSTM layers frozen.")
        return

    layer_prefixes = [f'lstm.weight_ih_l{i}' for i in range(num_layers_to_freeze)]
    for name, param in model.named_parameters():
        if any(prefix in name for prefix in layer_prefixes):
            param.requires_grad = False

    print(f"✅ Frozen first {num_layers_to_freeze} LSTM layers.")

# Optuna objective for fine-tuning
def objective_generation(trial):
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)

    model = GenerativeLSTM(    
                hidden_dim=lstm_gen_best_params["hidden_dim"],
                num_layers=lstm_gen_best_params["num_layers"],
                dropout=dropout
                # dropout=lstm_gen_best_params["dropout"]
                )
    model = load_partial_weights(model, 'best_model_lstm_generator_final.pt', 6)  # path to the general AMP model
    val_loss = train_model_generation(model, train_loader, val_loader, num_epochs=20, lr=lr, weight_decay=weight_decay, verbose=False)
    return val_loss

# Run Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective_generation, n_trials=20)
lstm_gen_best_params_tb = study.best_trial.params
print("Best transfer learning hyperparameters:", lstm_gen_best_params_tb)

[I 2025-05-02 11:19:55,547] A new study created in memory with name: no-name-9b82d52b-315f-49f9-a9a9-5218d9c4a7ad


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:20:47,954] Trial 0 finished with value: 0.05580922123044729 and parameters: {'dropout': 0.16137250444161905, 'lr': 0.006821184839221076, 'weight_decay': 0.00017561080126782017}. Best is trial 0 with value: 0.05580922123044729.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:21:38,999] Trial 1 finished with value: 0.07151259062811732 and parameters: {'dropout': 0.2281357258980474, 'lr': 0.003554233540590442, 'weight_decay': 0.0004775635364938583}. Best is trial 0 with value: 0.05580922123044729.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:22:29,497] Trial 2 finished with value: 0.06283493572846055 and parameters: {'dropout': 0.3382564231642894, 'lr': 0.006296652275817763, 'weight_decay': 0.0009204393829331862}. Best is trial 0 with value: 0.05580922123044729.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:23:21,786] Trial 3 finished with value: 0.06758854305371642 and parameters: {'dropout': 0.4943587720542153, 'lr': 0.001464581255892956, 'weight_decay': 0.00016449516026215914}. Best is trial 0 with value: 0.05580922123044729.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:24:13,156] Trial 4 finished with value: 0.08432310912758112 and parameters: {'dropout': 0.15163581451791633, 'lr': 0.0005873878373832947, 'weight_decay': 0.0004997069899391908}. Best is trial 0 with value: 0.05580922123044729.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:25:05,761] Trial 5 finished with value: 0.053100173361599445 and parameters: {'dropout': 0.2576315958956391, 'lr': 0.009252915314094317, 'weight_decay': 0.00031734323068074543}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:25:59,607] Trial 6 finished with value: 0.06669122399762273 and parameters: {'dropout': 0.10414419140251759, 'lr': 0.0018215218966184843, 'weight_decay': 0.00021165010912347886}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:26:53,112] Trial 7 finished with value: 0.06706126127392054 and parameters: {'dropout': 0.3693911979617652, 'lr': 0.005460634568352292, 'weight_decay': 0.0005155368426372023}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:27:45,595] Trial 8 finished with value: 0.060398975387215614 and parameters: {'dropout': 0.3803639375351807, 'lr': 0.004894591853007688, 'weight_decay': 0.0006872145869630581}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:28:38,054] Trial 9 finished with value: 0.0717171560972929 and parameters: {'dropout': 0.19387977778283602, 'lr': 0.0030558343383497765, 'weight_decay': 0.00023680260504264667}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:29:33,841] Trial 10 finished with value: 0.05382060678675771 and parameters: {'dropout': 0.25233062823986974, 'lr': 0.009942797154413093, 'weight_decay': 1.053008942079089e-05}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:30:25,718] Trial 11 finished with value: 0.059463777113705873 and parameters: {'dropout': 0.2664666922601705, 'lr': 0.009758181158408883, 'weight_decay': 1.579044533030503e-05}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:31:17,483] Trial 12 finished with value: 0.05704248289112002 and parameters: {'dropout': 0.2841165592620222, 'lr': 0.009909311181013677, 'weight_decay': 4.4136037235088506e-05}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:32:11,270] Trial 13 finished with value: 0.07085083518177271 and parameters: {'dropout': 0.24052347961213183, 'lr': 0.008526777573588228, 'weight_decay': 0.00035267988749535135}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:33:05,159] Trial 14 finished with value: 0.05653113918378949 and parameters: {'dropout': 0.32814414741748715, 'lr': 0.008270792278931982, 'weight_decay': 0.0003481660971995551}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:34:00,289] Trial 15 finished with value: 0.06229461310431361 and parameters: {'dropout': 0.43750116969994557, 'lr': 0.008014231522366848, 'weight_decay': 0.0006949059443235783}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:34:53,572] Trial 16 finished with value: 0.06175466626882553 and parameters: {'dropout': 0.201131982976489, 'lr': 0.008881799414471652, 'weight_decay': 0.0003320709981493193}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:35:47,050] Trial 17 finished with value: 0.060917569790035486 and parameters: {'dropout': 0.2992276981158616, 'lr': 0.007088610964283416, 'weight_decay': 9.190345278821998e-05}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:36:41,585] Trial 18 finished with value: 0.07095807697623968 and parameters: {'dropout': 0.25449697798070703, 'lr': 0.009311996216002795, 'weight_decay': 0.0009831429548891662}. Best is trial 5 with value: 0.053100173361599445.


✅ Loaded 6 matching layers from checkpoint.


[I 2025-05-02 11:37:33,788] Trial 19 finished with value: 0.07094359816983342 and parameters: {'dropout': 0.394241434335746, 'lr': 0.007543332937379096, 'weight_decay': 0.0006249489292065357}. Best is trial 5 with value: 0.053100173361599445.


Best transfer learning hyperparameters: {'dropout': 0.2576315958956391, 'lr': 0.009252915314094317, 'weight_decay': 0.00031734323068074543}


#### test

In [53]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
import math

# --- Assumes you already have these from your previous steps ---
# lstm_gen_best_params
# train_loader, val_loader, test_loader
# GenerativeLSTM
# compute_last_token_loss

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_final_model(model, train_loader, val_loader, num_epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lstm_gen_best_params_tb["lr"], weight_decay=lstm_gen_best_params_tb["weight_decay"])
    writer = SummaryWriter(log_dir=f"runs-lstm-gen-tb/AMPGen_LSTM_final")

    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for input_seq, target_seq, _ in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()

            output = model(input_seq)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, acc, auc, perp = evaluate_final_model(model, test_loader)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc} | Perplexity = {perp:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save(model.state_dict(), "best_model_lstm_generator.pt")

    writer.close()
    return model

def evaluate_final_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]  # [B, vocab]
            preds = torch.argmax(preds, dim=1)
            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except Exception:
    auc = "undefined"

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    return avg_loss, acc, auc, perplexity

# --- Build and train final model using best parameters ---

# lstm_gen_best_params = {'hidden_dim': 133, 'num_layers': 2}

final_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params_tb["dropout"],
    # weights_decay=lstm_gen_best_params_tb["weight_decay"],
    # lr=lstm_gen_best_params_tb["lr"]
)
final_model = load_partial_weights(final_model, 'best_model_lstm_generator_final.pt', 4)  # path to the general AMP model

trained_model = train_final_model(final_model, train_loader, val_loader, num_epochs=200)

# --- Evaluate on test set ---
test_loss, test_acc, test_auc, perp = evaluate_final_model(trained_model, test_loader)
print(f" Final Test Metrics:\nLoss = {test_loss:.4f}, Accuracy = {test_acc:.4f}, AUC = {test_auc}, Perplexity = {perp:.4f}")
# Save model weights
torch.save(trained_model.state_dict(), "final_amp_generator_lstm.pt")
print("Model saved to final_amp_generator_lstm.pt")


✅ Loaded 4 matching layers from checkpoint.
Epoch 1: Train Loss = 2.9824 | Val Loss = 2.9351 | Acc = 0.0270 | AUC = undefined | Perplexity = 18.8228
Epoch 2: Train Loss = 2.9245 | Val Loss = 2.8535 | Acc = 1.0000 | AUC = undefined | Perplexity = 17.3477
Epoch 3: Train Loss = 2.8669 | Val Loss = 2.8353 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.0356
Epoch 4: Train Loss = 2.8081 | Val Loss = 2.7463 | Acc = 1.0000 | AUC = undefined | Perplexity = 15.5846
Epoch 5: Train Loss = 2.7636 | Val Loss = 2.6848 | Acc = 1.0000 | AUC = undefined | Perplexity = 14.6560
Epoch 6: Train Loss = 2.6948 | Val Loss = 2.6642 | Acc = 0.9730 | AUC = undefined | Perplexity = 14.3566
Epoch 7: Train Loss = 2.6443 | Val Loss = 2.6036 | Acc = 0.9730 | AUC = undefined | Perplexity = 13.5118
Epoch 8: Train Loss = 2.5905 | Val Loss = 2.5321 | Acc = 1.0000 | AUC = undefined | Perplexity = 12.5796
Epoch 9: Train Loss = 2.5280 | Val Loss = 2.4807 | Acc = 1.0000 | AUC = undefined | Perplexity = 11.9492
Epoch 10: T

#### Generation

In [54]:
import torch
import torch.nn.functional as F
import numpy as np

# Recreate your amino acid vocab
aa_vocab = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_idx = {aa: i for i, aa in enumerate(aa_vocab)}
idx_to_aa = {i: aa for aa, i in aa_to_idx.items()}

def one_hot_encode_amino_acid(aa, vocab=aa_vocab):
    vec = torch.zeros(len(vocab))
    vec[aa_to_idx[aa]] = 1.0
    return vec

def generate_sequence_from_seed(model, seed, max_length=30, temperature=1.0, device='cpu'):
    model.eval()
    input_seq = [one_hot_encode_amino_acid(aa).to(device) for aa in seed]
    input_tensor = torch.stack(input_seq).unsqueeze(0)  # [1, L, 20]

    generated = seed.copy()

    with torch.no_grad():
        for _ in range(max_length - len(seed)):
            output = model(input_tensor)  # [1, L, vocab]
            logits = output[0, -1, :]  # Last time step → [vocab]

            # Apply temperature and sample
            probs = F.softmax(logits / temperature, dim=-1).cpu().numpy()
            next_idx = np.random.choice(len(aa_vocab), p=probs)
            next_aa = idx_to_aa[next_idx]

            # Update sequence
            next_aa_vec = one_hot_encode_amino_acid(next_aa).to(device).unsqueeze(0).unsqueeze(0)  # [1, 1, 20]
            input_tensor = torch.cat([input_tensor, next_aa_vec], dim=1)
            generated.append(next_aa)

    return ''.join(generated)

import numpy as np
from collections import Counter

class LengthSampler:
    def __init__(self, sequence_lengths):
        """
        Initialize sampler from observed sequence lengths.
        
        Args:
            sequence_lengths (list[int]): List of sequence lengths (e.g., [20, 21, 20, 23, ...])
        """
        self.length_counts = Counter(sequence_lengths)
        self.lengths = np.array(sorted(self.length_counts.keys()))
        counts = np.array([self.length_counts[l] for l in self.lengths])
        self.probs = counts / counts.sum()  # Empirical probabilities

    def sample(self, n=1):
        """
        Sample one or more lengths based on the learned distribution.

        Returns:
            np.ndarray of sampled lengths
        """
        return np.random.choice(self.lengths, size=n, p=self.probs)
length_sampler = LengthSampler([len(seq) for seq in df.loc[df['AMP'] == 1, :]['Sequences']])

In [55]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    # dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_generator_lstm.pt"))
gen_model.to(device)
gen_model.eval()
# Define a seed and generate a sequence
sampled_length = length_sampler.sample()[0]
start_aa = sample_start_amino_acid()
seed_sequence = list(start_aa)  # start with valine
generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1.0, device=device)

print("Generated AMP sequence:", generated_peptide)

Generated AMP sequence: ESKLDGDYRYDYYKHH


#### batch gen

In [56]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    # dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_generator_lstm.pt"))
gen_model.to(device)
gen_model.eval()
generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptidesFullback.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: ESKLDGDYRYDYYKHH
Generated AMP sequence: AAAAGAGGGLGPPD
Generated AMP sequence: KAAVYPGDPPGPDYLWPPPK
Generated AMP sequence: IATSKYDHDNPLYPLPHYDYYKAYYYYDPHYSPDPPYSY
Generated AMP sequence: IAHNLSPPPDGDSPPYKPY
Generated AMP sequence: FAAATLKRPHYP
Generated AMP sequence: EMAADKSYAPLKHPYPPYPYYLPPND
Generated AMP sequence: VAAAQCGGYGPSKYPSSPEY
Generated AMP sequence: DAAGKAKHPHSPYH
Generated AMP sequence: WASSENVPPKEYYS
Generated AMP sequence: QAGSTQPDHYRCHT
Generated AMP sequence: CAAKKGPIKY
Generated AMP sequence: ADAGAGGHYNYSYPPPKYMD
Generated AMP sequence: DATWVKAYLYYYLPYPHPYY
Generated AMP sequence: HADYAKYSSSPLYYYYPPYS
Generated AMP sequence: IDSGKDPDPPLQDDYPHPYK
Generated AMP sequence: TAAGKKPYPLYLPDDYYYPH
Generated AMP sequence: YAAKKGYRPPPK
Generated AMP sequence: AAAAAAKGGGKKGPP
Generated AMP sequence: VAAAGGLKGDPDPYPPSPPY
Generated AMP sequence: HTSAACADPDKYDYLH
Generated AMP sequence: VAPKPGGTYYPPYPYYPYYH
Generated AMP sequence: QALASHAVDKYYPRPNPWYYYFPDP

### frozen

In [45]:

# Criterion
PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Transfer Learning Loader
def load_pretrained_weights(model, checkpoint_path):
    pretrained_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(pretrained_dict, strict=False)
    return model

def freeze_encoder(model):
    for name, param in model.named_parameters():
        if 'lstm' in name:
            param.requires_grad = False

    def freeze_encoder(model, num_layers_to_freeze):
        """
        Freezes the first `num_layers_to_freeze` LSTM layers of the model.
        Assumes parameter names follow standard PyTorch LSTM naming.
        """
        if num_layers_to_freeze <= 0:
            print("⚠️ No LSTM layers frozen.")
            return

        layer_prefixes = [f'lstm.weight_ih_l{i}' for i in range(num_layers_to_freeze)]
        for name, param in model.named_parameters():
            if any(prefix in name for prefix in layer_prefixes):
                param.requires_grad = False

        print(f"✅ Frozen first {num_layers_to_freeze} LSTM layers.")



def freeze_encoder(model, num_layers_to_freeze):
    """
    Freezes the first `num_layers_to_freeze` LSTM layers of the model.
    Assumes parameter names follow standard PyTorch LSTM naming.
    """
    if num_layers_to_freeze <= 0:
        print("⚠️ No LSTM layers frozen.")
        return

    layer_prefixes = [f'lstm.weight_ih_l{i}' for i in range(num_layers_to_freeze)]
    for name, param in model.named_parameters():
        if any(prefix in name for prefix in layer_prefixes):
            param.requires_grad = False

    print(f"✅ Frozen first {num_layers_to_freeze} LSTM layers.")
    
def compute_last_token_loss(output, target_seq, criterion):
    last_token_logits = output[:, -1, :]
    last_token_targets = target_seq[:, -1, :]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)
    return criterion(last_token_logits, last_token_targets)

# Training function
def train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=1e-3, weight_decay=1e-4,
                           device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False, train=True):
    model.to(device)
    if not train:
        model.eval()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_loss = float('inf')
    log_dir = f"runs-lstm-gen-tb/AMP_LSTM_GEN_TRANSFER_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    writer = SummaryWriter(log_dir=log_dir)

    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for input_seq, target_seq, _ in train_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        val_loss, acc, auc = evaluate_model_generation(model, val_loader, criterion, device, verbose)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        if verbose:
            print(f"Epoch [{epoch}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {acc:.4f}, AUC: {auc}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # if train:
            #     torch.save(model.state_dict(), 'best_model_lstm_transfer.pt')

    writer.close()
    return best_val_loss

# Evaluation function
def evaluate_model_generation(model, data_loader, criterion, device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False):
    model.eval()
    all_labels = []
    all_preds = []
    total_loss = 0.0
    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]
            preds = torch.argmax(preds, dim=1)

            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except:
    auc = "undefined"
    
    return avg_loss, acc, auc

# Optuna objective for fine-tuning
def objective_generation(trial):
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)

    model = GenerativeLSTM(    
                hidden_dim=lstm_gen_best_params["hidden_dim"],
                num_layers=lstm_gen_best_params["num_layers"],
                dropout=dropout
                # dropout=lstm_gen_best_params["dropout"]
                )
    model = load_pretrained_weights(model, 'best_model_lstm_generator_final.pt')  # path to the general AMP model

    freeze_encoder(model, 4)

    val_loss = train_model_generation(model, train_loader, val_loader, num_epochs=20, lr=lr, weight_decay=weight_decay, verbose=False)
    return val_loss

# Run Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective_generation, n_trials=20)
lstm_gen_frozen_best_params_tb = study.best_trial.params
print("Best transfer learning hyperparameters:", lstm_gen_frozen_best_params_tb)

[I 2025-05-01 11:35:58,944] A new study created in memory with name: no-name-11effea8-22fc-45fe-8f19-2c3770a74e45


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:03,260] Trial 0 finished with value: 0.1755082756280899 and parameters: {'dropout': 0.48226628572434316, 'lr': 0.0006821168350646403, 'weight_decay': 0.00023604294180079854}. Best is trial 0 with value: 0.1755082756280899.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:07,509] Trial 1 finished with value: 0.17400945723056793 and parameters: {'dropout': 0.38551514827720557, 'lr': 0.0006341134747285574, 'weight_decay': 0.0007398026271855571}. Best is trial 1 with value: 0.17400945723056793.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:12,277] Trial 2 finished with value: 0.18788397312164307 and parameters: {'dropout': 0.3817009386510213, 'lr': 0.006140314607255589, 'weight_decay': 2.6150613802277304e-05}. Best is trial 1 with value: 0.17400945723056793.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:16,381] Trial 3 finished with value: 0.1828785389661789 and parameters: {'dropout': 0.25018051076906866, 'lr': 0.006911200001676759, 'weight_decay': 0.0008366391036811805}. Best is trial 1 with value: 0.17400945723056793.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:20,214] Trial 4 finished with value: 0.20788905024528503 and parameters: {'dropout': 0.35157497265151116, 'lr': 0.009712639719830562, 'weight_decay': 0.00022174905614099342}. Best is trial 1 with value: 0.17400945723056793.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:23,865] Trial 5 finished with value: 0.1966792792081833 and parameters: {'dropout': 0.4857775314557615, 'lr': 0.009360694886012572, 'weight_decay': 1.6502116263242312e-05}. Best is trial 1 with value: 0.17400945723056793.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:27,672] Trial 6 finished with value: 0.16907334327697754 and parameters: {'dropout': 0.3064821494977372, 'lr': 0.008385604009341855, 'weight_decay': 0.0008831466393808281}. Best is trial 6 with value: 0.16907334327697754.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:31,609] Trial 7 finished with value: 0.19636847078800201 and parameters: {'dropout': 0.3712213862120247, 'lr': 0.0027867958751891485, 'weight_decay': 0.0004589811274669594}. Best is trial 6 with value: 0.16907334327697754.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:35,556] Trial 8 finished with value: 0.1686840057373047 and parameters: {'dropout': 0.43762387817195103, 'lr': 0.00818905875426827, 'weight_decay': 0.00017032490159272614}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:39,573] Trial 9 finished with value: 0.17050457000732422 and parameters: {'dropout': 0.47486530446396935, 'lr': 0.0043724729847743, 'weight_decay': 0.0002157185826018631}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:44,502] Trial 10 finished with value: 0.270433634519577 and parameters: {'dropout': 0.1747956881853147, 'lr': 0.007573748516733935, 'weight_decay': 0.0005611213262369222}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:48,534] Trial 11 finished with value: 0.18712729215621948 and parameters: {'dropout': 0.2596102894620508, 'lr': 0.007741494150564815, 'weight_decay': 0.0009845084023799241}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:52,106] Trial 12 finished with value: 0.21051616966724396 and parameters: {'dropout': 0.12158797919965172, 'lr': 0.008436311628558244, 'weight_decay': 0.00048653773834579547}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:55,941] Trial 13 finished with value: 0.18539530038833618 and parameters: {'dropout': 0.30449714535246053, 'lr': 0.005407928391727309, 'weight_decay': 0.0007396917787039752}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:36:59,932] Trial 14 finished with value: 0.17021183669567108 and parameters: {'dropout': 0.41661675057378666, 'lr': 0.0036785520266853903, 'weight_decay': 0.00034699197108381834}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:37:03,919] Trial 15 finished with value: 0.20867696404457092 and parameters: {'dropout': 0.29710624107867256, 'lr': 0.008601039075634543, 'weight_decay': 0.0006145654782784139}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:37:07,848] Trial 16 finished with value: 0.26581302285194397 and parameters: {'dropout': 0.17657640214145703, 'lr': 0.006244980301077365, 'weight_decay': 0.0009808126734961603}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:37:12,420] Trial 17 finished with value: 0.25367027521133423 and parameters: {'dropout': 0.42457773857777714, 'lr': 0.009851501412763404, 'weight_decay': 0.0003505495287414805}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:37:16,483] Trial 18 finished with value: 0.2031043916940689 and parameters: {'dropout': 0.32707182307592225, 'lr': 0.008336894500320755, 'weight_decay': 0.0008308701021206918}. Best is trial 8 with value: 0.1686840057373047.


✅ Frozen first 4 LSTM layers.


[I 2025-05-01 11:37:20,526] Trial 19 finished with value: 0.20394910871982574 and parameters: {'dropout': 0.43345592233747454, 'lr': 0.006926688238652978, 'weight_decay': 0.00012292462298675778}. Best is trial 8 with value: 0.1686840057373047.


Best transfer learning hyperparameters: {'dropout': 0.43762387817195103, 'lr': 0.00818905875426827, 'weight_decay': 0.00017032490159272614}


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
import math

# --- Assumes you already have these from your previous steps ---
# lstm_gen_best_params
# train_loader, val_loader, test_loader
# GenerativeLSTM
# compute_last_token_loss

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_final_model(model, train_loader, val_loader, num_epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lstm_gen_frozen_best_params_tb["lr"], weight_decay=lstm_gen_frozen_best_params_tb["weight_decay"])
    writer = SummaryWriter(log_dir=f"runs-lstm-frozen-gen-tb/AMPGen_LSTM_final")

    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for input_seq, target_seq, _ in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()

            output = model(input_seq)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, acc, auc, perp = evaluate_final_model(model, test_loader)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc} | Perplexity = {perp:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save(model.state_dict(), "best_model_lstm_generator.pt")

    writer.close()
    return model

def evaluate_final_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]  # [B, vocab]
            preds = torch.argmax(preds, dim=1)
            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except Exception:
    auc = "undefined"

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    return avg_loss, acc, auc, perplexity

# --- Build and train final model using best parameters ---

# lstm_gen_best_params = {'hidden_dim': 133, 'num_layers': 2, 'dropout': 0.10063270147175422, 'lr': 0.003237280156212186, 'weight_decay': 2.2594437829479466e-05}

final_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_frozen_best_params_tb["dropout"],
    # weights_decay=lstm_gen_frozen_best_params_tb["weight_decay"],
    # lr=lstm_gen_frozen_best_params_tb["lr"]
)
freeze_encoder(final_model, 4)

trained_model = train_final_model(final_model, train_loader, val_loader, num_epochs=60)

# --- Evaluate on test set ---
test_loss, test_acc, test_auc, perp = evaluate_final_model(trained_model, test_loader)
print(f" Final Test Metrics:\nLoss = {test_loss:.4f}, Accuracy = {test_acc:.4f}, AUC = {test_auc}, Perplexity = {perp:.4f}")
# Save model weights
torch.save(trained_model.state_dict(), "final_amp_frozen_generator_lstm.pt")
print("Model saved to final_amp_frozen_generator_lstm.pt")


✅ Frozen first 2 LSTM layers.
Epoch 1: Train Loss = 2.9264 | Val Loss = 2.9156 | Acc = 0.9730 | AUC = undefined | Perplexity = 18.4606
Epoch 2: Train Loss = 2.9097 | Val Loss = 2.9004 | Acc = 0.9730 | AUC = undefined | Perplexity = 18.1817
Epoch 3: Train Loss = 2.8965 | Val Loss = 2.8853 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.9091
Epoch 4: Train Loss = 2.8804 | Val Loss = 2.8699 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.6361
Epoch 5: Train Loss = 2.8649 | Val Loss = 2.8537 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.3521
Epoch 6: Train Loss = 2.8514 | Val Loss = 2.8351 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.0327
Epoch 7: Train Loss = 2.8321 | Val Loss = 2.8113 | Acc = 0.9730 | AUC = undefined | Perplexity = 16.6314
Epoch 8: Train Loss = 2.8114 | Val Loss = 2.7805 | Acc = 1.0000 | AUC = undefined | Perplexity = 16.1273
Epoch 9: Train Loss = 2.7812 | Val Loss = 2.7606 | Acc = 1.0000 | AUC = undefined | Perplexity = 15.8086
Epoch 10: Train Loss = 2.

In [46]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_generator_lstm.pt"))
gen_model.to(device)

generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptidesFrozen.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: EKAHKHKHKHKSKHKH
Generated AMP sequence: AAAHKHKHKHKHKH
Generated AMP sequence: KAASYKHKHKHKHKHKHKHK
Generated AMP sequence: IARSKSDKHKHKHKHKHKHSHKDKHKHKHKHKHKHKHKH
Generated AMP sequence: IAANKRHKHKHKHKHKHKH
Generated AMP sequence: FAAAVRHKHKHK
Generated AMP sequence: EAAAGHKHCHKHKHKHKHKSKHKHKH
Generated AMP sequence: VAAASKHKYKHKHKHKHKHK
Generated AMP sequence: DAAHLAHKHKHKLK
Generated AMP sequence: WAQRKHKHKHKHKH
Generated AMP sequence: QAASRHKHKHKDKH
Generated AMP sequence: CAALMIHKHK
Generated AMP sequence: AAAHKHKHSHKHKHKHKHKH
Generated AMP sequence: DAQVTKAKHRRKHKHKHKHK
Generated AMP sequence: HAAYAHSHKHKHKHKHKHKH
Generated AMP sequence: IAKAHKHKHKHKHKHKHKHK
Generated AMP sequence: TAAKHKHSHKHKHKHKHKHK
Generated AMP sequence: YAAMKHRHKHKH
Generated AMP sequence: AAAAHHKHKHKHKHK
Generated AMP sequence: VAAAHKHKHKHKHKHKHKHK
Generated AMP sequence: HNHAADADKHKHKHKH
Generated AMP sequence: VAHLRKHKHSHKHKHKHKHK
Generated AMP sequence: QAGASIAPKHKPKHKHKHKHKHKHK

In [242]:
torch.cuda.is_available() 

False

# TB no transfer

In [10]:
# Re-import necessary packages after reset
import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import optuna
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
from torch.utils.tensorboard import SummaryWriter

from torch.nn.utils.rnn import pad_packed_sequence


class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3):
        super(GenerativeLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Handle packed input
        if isinstance(x, torch.nn.utils.rnn.PackedSequence):
            packed_output, _ = self.lstm(x)
            unpacked_output, _ = pad_packed_sequence(packed_output, batch_first=True)
            return self.fc(unpacked_output)
        else:
            out, _ = self.lstm(x)
            return self.fc(out)

# Criterion

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


def compute_last_token_loss(output, target_seq, criterion):
    """
    Computes cross-entropy loss on the last time step of each sequence.
    
    Args:
        output: Tensor of shape [B, L, vocab_size]
        target_seq: Tensor of shape [B, L] containing target class indices
    
    Returns:
        loss: Scalar loss computed only on the last token of each sequence
    """
    # Get last time step for each sequence
    last_token_logits = output[:, -1, :]        # [B, vocab_size]
    last_token_targets = target_seq[:, -1, :]      # [B]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)  #  now shape is [batch_size, seq_len]

    # print('last_token_logits',last_token_logits.shape)
    # print('last_token_targets',last_token_targets.shape)

    return criterion(last_token_logits, last_token_targets)

# Training function
def train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=1e-3, weight_decay=1e-4,
                           device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False, train=True):
    model.to(device)
    if not train:
        model.eval()

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_loss = float('inf')
    log_dir = f"runs-lstm-gen-notrans-tb/AMP_LSTM_GEN_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    writer = SummaryWriter(log_dir=log_dir)

    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for input_seq, target_seq, _ in train_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])      # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            
            # print('target_shape before reshape',target_seq.shape)
            # target_seq = target_seq.reshape(-1)
            # print(f"Output shape: {output.shape}, Target shape: {target_seq.shape}")

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        val_loss, acc, auc = evaluate_model_generation(model, val_loader, criterion, device, verbose)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        if verbose:
            print(f"Epoch [{epoch}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {acc:.4f}, AUC: {auc}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            if train:
                torch.save(model.state_dict(), 'best_model_lstm_generator-notrans-tb.pt')

    writer.close()
    return best_val_loss

# Evaluation function
def evaluate_model_generation(model, data_loader, criterion, device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False):
    model.eval()
    all_labels = []
    all_preds = []
    total_loss = 0.0
    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])  # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            # # target_seq = target_seq.view(-1)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # # target_seq = target_seq.reshape(-1)
            # # target_seq = target_seq.reshape(-1, target_seq.shape[-1])
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            # assert output.size(0) == target_seq.size(0), f"Mismatch: {output.size(0)} vs {target_seq.size(0)}"

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            # print('loss done')
            total_loss += loss.item()
            
            
            preds = output[:, -1, :]        # shape: [B, vocab_size]
            preds = torch.argmax(preds, dim=1)  # shape: [B]

            targets = target_seq[:, -1, :]      # shape: [B, vocab_size]
            targets = torch.argmax(targets, dim=-1)  # shape: [B]
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except:
    auc = "undefined"

    return avg_loss, acc, auc

# Objective for Optuna tuning
def objective_generation(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)

    model = GenerativeLSTM(hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout)
    val_loss = train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=lr, weight_decay=weight_decay, verbose=False)
    return val_loss

study = optuna.create_study(direction="minimize")
study.optimize(objective_generation, n_trials=20)

lstm_gen_notrans_tb_best_params = study.best_trial.params
print(lstm_gen_notrans_tb_best_params)

[I 2025-05-02 10:55:14,561] A new study created in memory with name: no-name-f7a5cdc2-a106-4dcf-a41d-0c75ebdcb8f2
[I 2025-05-02 10:55:22,868] Trial 0 finished with value: 2.7173891067504883 and parameters: {'hidden_dim': 144, 'num_layers': 3, 'dropout': 0.15437274023341982, 'lr': 0.005771881998865653, 'weight_decay': 0.0002682713506887346}. Best is trial 0 with value: 2.7173891067504883.
[I 2025-05-02 10:55:25,214] Trial 1 finished with value: 2.7665016651153564 and parameters: {'hidden_dim': 227, 'num_layers': 2, 'dropout': 0.4424763441823395, 'lr': 0.005704011585947213, 'weight_decay': 0.00041701060111245934}. Best is trial 0 with value: 2.7173891067504883.
[I 2025-05-02 10:55:27,423] Trial 2 finished with value: 2.9343535900115967 and parameters: {'hidden_dim': 250, 'num_layers': 1, 'dropout': 0.32673827998102084, 'lr': 0.00154077252086226, 'weight_decay': 0.0008423535539033275}. Best is trial 0 with value: 2.7173891067504883.
[I 2025-05-02 10:55:29,530] Trial 3 finished with value:

{'hidden_dim': 149, 'num_layers': 2, 'dropout': 0.4350981856205326, 'lr': 0.009996913527093828, 'weight_decay': 0.00012991309919791892}


### test

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
import math

# --- Assumes you already have these from your previous steps ---
# lstm_gen_best_params
# train_loader, val_loader, test_loader
# GenerativeLSTM
# compute_last_token_loss

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_final_model(model, train_loader, val_loader, num_epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lstm_gen_notrans_tb_best_params["lr"], weight_decay=lstm_gen_notrans_tb_best_params["weight_decay"])
    writer = SummaryWriter(log_dir=f"runs-lstm-gen-notrans-tb/AMPGen_LSTM_final")

    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for input_seq, target_seq, _ in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()

            output = model(input_seq)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, acc, auc, perp = evaluate_final_model(model, test_loader)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc} | Perplexity = {perp:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save(model.state_dict(), "best_model_lstm_generator.pt")

    writer.close()
    return model

def evaluate_final_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]  # [B, vocab]
            preds = torch.argmax(preds, dim=1)
            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except Exception:
    auc = "undefined"

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    return avg_loss, acc, auc, perplexity

# --- Build and train final model using best parameters ---

final_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_notrans_tb_best_params["hidden_dim"],
    num_layers=lstm_gen_notrans_tb_best_params["num_layers"],
    dropout=lstm_gen_notrans_tb_best_params["dropout"]
)

trained_model = train_final_model(final_model, train_loader, val_loader, num_epochs=60)

# --- Evaluate on test set ---
test_loss, test_acc, test_auc, perp = evaluate_final_model(trained_model, test_loader)
print(f"\n✅ Final Test Metrics:\nLoss = {test_loss:.4f}, Accuracy = {test_acc:.4f}, AUC = {test_auc}, Perplexity = {perp:.4f}")


Epoch 1: Train Loss = 3.0548 | Val Loss = 3.0090 | Acc = 0.0270 | AUC = undefined | Perplexity = 20.2662
Epoch 2: Train Loss = 2.9965 | Val Loss = 2.9435 | Acc = 0.0270 | AUC = undefined | Perplexity = 18.9822
Epoch 3: Train Loss = 2.9277 | Val Loss = 2.8642 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.5345
Epoch 4: Train Loss = 2.8628 | Val Loss = 2.7988 | Acc = 1.0000 | AUC = undefined | Perplexity = 16.4257
Epoch 5: Train Loss = 2.8049 | Val Loss = 2.7393 | Acc = 1.0000 | AUC = undefined | Perplexity = 15.4760
Epoch 6: Train Loss = 2.7479 | Val Loss = 2.6935 | Acc = 0.9730 | AUC = undefined | Perplexity = 14.7829
Epoch 7: Train Loss = 2.6925 | Val Loss = 2.6404 | Acc = 0.9730 | AUC = undefined | Perplexity = 14.0191
Epoch 8: Train Loss = 2.6369 | Val Loss = 2.5883 | Acc = 0.9730 | AUC = undefined | Perplexity = 13.3075
Epoch 9: Train Loss = 2.5930 | Val Loss = 2.5379 | Acc = 0.9730 | AUC = undefined | Perplexity = 12.6535
Epoch 10: Train Loss = 2.5679 | Val Loss = 2.4886 | Acc

### batch gen

In [18]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

aa_vocab = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_idx = {aa: i for i, aa in enumerate(aa_vocab)}
idx_to_aa = {i: aa for aa, i in aa_to_idx.items()}

def sample_start_amino_acid():
    return random.choice(amino_acids)

def one_hot_encode_amino_acid(aa, vocab=aa_vocab):
    vec = torch.zeros(len(vocab))
    vec[aa_to_idx[aa]] = 1.0
    return vec


def generate_sequence_from_seed(model, seed, max_length=30, temperature=1.0, device='cpu'):
    model.eval()
    input_seq = [one_hot_encode_amino_acid(aa).to(device) for aa in seed]
    input_tensor = torch.stack(input_seq).unsqueeze(0)  # [1, L, 20]

    generated = seed.copy()

    with torch.no_grad():
        for _ in range(max_length - len(seed)):
            output = model(input_tensor)  # [1, L, vocab]
            logits = output[0, -1, :]  # Last time step → [vocab]

            # Apply temperature and sample
            probs = F.softmax(logits / temperature, dim=-1).cpu().numpy()
            next_idx = np.random.choice(len(aa_vocab), p=probs)
            next_aa = idx_to_aa[next_idx]

            # Update sequence
            next_aa_vec = one_hot_encode_amino_acid(next_aa).to(device).unsqueeze(0).unsqueeze(0)  # [1, 1, 20]
            input_tensor = torch.cat([input_tensor, next_aa_vec], dim=1)
            generated.append(next_aa)

    return ''.join(generated)


class LengthSampler:
    def __init__(self, sequence_lengths):
        """
        Initialize sampler from observed sequence lengths.
        
        Args:
            sequence_lengths (list[int]): List of sequence lengths (e.g., [20, 21, 20, 23, ...])
        """
        self.length_counts = Counter(sequence_lengths)
        self.lengths = np.array(sorted(self.length_counts.keys()))
        counts = np.array([self.length_counts[l] for l in self.lengths])
        self.probs = counts / counts.sum()  # Empirical probabilities

    def sample(self, n=1):
        """
        Sample one or more lengths based on the learned distribution.

        Returns:
            np.ndarray of sampled lengths
        """
        return np.random.choice(self.lengths, size=n, p=self.probs)
length_sampler = LengthSampler([len(seq) for seq in df.loc[df['AMP'] == 1, :]['Sequences']])

# Reload the trained model
gen_model = trained_model
gen_model.to(device)

generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptides-notrans.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: EYPPGGGYPPGYYGGG
Generated AMP sequence: AHIHPGHKKSGLPG
Generated AMP sequence: KAAYYYHGPKGKGYHPHPPG
Generated AMP sequence: IQYYPYGGGHKHYIHPGYGYRGDYPPRGKGYPHGHHPPY
Generated AMP sequence: IAPRPRKLKGGGPHLYHKR
Generated AMP sequence: FAGGYYPYYGYP
Generated AMP sequence: EVHGGKYYDLKGGHYHPPKYYHKHHG
Generated AMP sequence: VHAHYHGKYHPRHPKPPPGY
Generated AMP sequence: DAAPPGKGPGPKYG
Generated AMP sequence: WAYYHPYPNHGYYP
Generated AMP sequence: QEPYYRPGGYPFGP
Generated AMP sequence: CAKPPGPHHR
Generated AMP sequence: ASPLGIHHYKYPYKPKGPHG
Generated AMP sequence: DAYYYKGYKYYYHKYHGPYP
Generated AMP sequence: HANYGLYQPPHHYYYYLKYP
Generated AMP sequence: IRYHKGPGKPHPGGYKGPRG
Generated AMP sequence: TAAKLPPYLHYHKGGYYPKG
Generated AMP sequence: YAKPPHYPPPKH
Generated AMP sequence: APAGGGYPKGKKGKK
Generated AMP sequence: VLAHPKYPGGPGPYPKPKPY
Generated AMP sequence: HYYGGGGGPGHYGYHG
Generated AMP sequence: VLVPSHGRYYKKRHYYKRRG
Generated AMP sequence: QHSHYKFYGHYYPPKHHPRYWGKGP

# production

In [41]:
from torch.nn.utils.rnn import pad_packed_sequence


class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3):
        super(GenerativeLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Handle packed input
        if isinstance(x, torch.nn.utils.rnn.PackedSequence):
            packed_output, _ = self.lstm(x)
            unpacked_output, _ = pad_packed_sequence(packed_output, batch_first=True)
            return self.fc(unpacked_output)
        else:
            out, _ = self.lstm(x)
            return self.fc(out)


### no trans - test

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
import math

# --- Assumes you already have these from your previous steps ---
# lstm_gen_best_params
# train_loader, val_loader, test_loader
# GenerativeLSTM
# compute_last_token_loss

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def compute_last_token_loss(output, target_seq, criterion):
    """
    Computes cross-entropy loss on the last time step of each sequence.
    
    Args:
        output: Tensor of shape [B, L, vocab_size]
        target_seq: Tensor of shape [B, L] containing target class indices
    
    Returns:
        loss: Scalar loss computed only on the last token of each sequence
    """
    # Get last time step for each sequence
    last_token_logits = output[:, -1, :]        # [B, vocab_size]
    last_token_targets = target_seq[:, -1, :]      # [B]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)  #  now shape is [batch_size, seq_len]

    # print('last_token_logits',last_token_logits.shape)
    # print('last_token_targets',last_token_targets.shape)

    return criterion(last_token_logits, last_token_targets)


def train_final_model(model, train_loader, val_loader, num_epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lstm_gen_notrans_tb_best_params["lr"], weight_decay=lstm_gen_notrans_tb_best_params["weight_decay"])
    writer = SummaryWriter(log_dir=f"runs-lstm-gen-notrans-tb/AMPGen_LSTM_final")

    best_val_loss = float('inf')

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0.0

        for input_seq, target_seq, _ in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()

            output = model(input_seq)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, acc, auc, perp = evaluate_final_model(model, test_loader)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc} | Perplexity = {perp:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save(model.state_dict(), "best_model_lstm_generator.pt")

    writer.close()
    return model

def evaluate_final_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq)

            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)

            loss = compute_last_token_loss(output, target_seq, criterion)
            total_loss += loss.item()

            preds = output[:, -1, :]  # [B, vocab]
            preds = torch.argmax(preds, dim=1)
            targets = target_seq[:, -1, :]
            targets = torch.argmax(targets, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # try:
    #     auc = roc_auc_score(
    #         torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
    #         torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
    #         multi_class='ovr', average='macro'
    #     )
    # except Exception:
    auc = "undefined"

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    return avg_loss, acc, auc, perplexity

# --- Build and train final model using best parameters ---
lstm_gen_notrans_tb_best_params = {'hidden_dim': 125, 'num_layers': 3, 'dropout': 0.1809741409069741, 'lr': 0.009903431049726066, 'weight_decay': 0.00095634187480499}

final_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_notrans_tb_best_params["hidden_dim"],
    num_layers=lstm_gen_notrans_tb_best_params["num_layers"],
    dropout=lstm_gen_notrans_tb_best_params["dropout"]
)

trained_model = train_final_model(final_model, train_loader, val_loader, num_epochs=60)

# --- Evaluate on test set ---
test_loss, test_acc, test_auc, perp = evaluate_final_model(trained_model, test_loader)
print(f"\n✅ Final Test Metrics:\nLoss = {test_loss:.4f}, Accuracy = {test_acc:.4f}, AUC = {test_auc}, Perplexity = {perp:.4f}")
# torch.save(trained_model.state_dict(), "final_amp_notrans_generator_lstm.pt")


Epoch 1: Train Loss = 2.9193 | Val Loss = 2.8783 | Acc = 0.9730 | AUC = undefined | Perplexity = 17.7838
Epoch 2: Train Loss = 2.8637 | Val Loss = 2.8221 | Acc = 0.9730 | AUC = undefined | Perplexity = 16.8114
Epoch 3: Train Loss = 2.8002 | Val Loss = 2.7504 | Acc = 0.9730 | AUC = undefined | Perplexity = 15.6490
Epoch 4: Train Loss = 2.7289 | Val Loss = 2.6782 | Acc = 0.9730 | AUC = undefined | Perplexity = 14.5591
Epoch 5: Train Loss = 2.6731 | Val Loss = 2.6309 | Acc = 0.9730 | AUC = undefined | Perplexity = 13.8861
Epoch 6: Train Loss = 2.6404 | Val Loss = 2.5762 | Acc = 0.9730 | AUC = undefined | Perplexity = 13.1466
Epoch 7: Train Loss = 2.5704 | Val Loss = 2.5244 | Acc = 0.9730 | AUC = undefined | Perplexity = 12.4831
Epoch 8: Train Loss = 2.5093 | Val Loss = 2.4685 | Acc = 0.9730 | AUC = undefined | Perplexity = 11.8052
Epoch 9: Train Loss = 2.5077 | Val Loss = 2.4155 | Acc = 0.9730 | AUC = undefined | Perplexity = 11.1951
Epoch 10: Train Loss = 2.4048 | Val Loss = 2.3679 | Acc

#### batch gen

In [None]:
import random
import numpy as np
from collections import Counter
import torch
import torch.nn.functional as F
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# final_model = GenerativeLSTM(
#     input_dim=20,
#     hidden_dim=lstm_gen_notrans_tb_best_params["hidden_dim"],
#     num_layers=lstm_gen_notrans_tb_best_params["num_layers"],
#     dropout=lstm_gen_notrans_tb_best_params["dropout"]
# )
# trained_model = GenerativeLSTM(
#     input_dim=20,
#     hidden_dim=176,
#     num_layers=lstm_gen_notrans_tb_best_params["num_layers"],
#     dropout=lstm_gen_notrans_tb_best_params["dropout"]
# )


# final_model.load_state_dict(torch.load("best_model_lstm_generator-notrans-tb.pt"))

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

class LengthSampler:
    def __init__(self, sequence_lengths):
        """
        Initialize sampler from observed sequence lengths.
        
        Args:
            sequence_lengths (list[int]): List of sequence lengths (e.g., [20, 21, 20, 23, ...])
        """
        self.length_counts = Counter(sequence_lengths)
        self.lengths = np.array(sorted(self.length_counts.keys()))
        counts = np.array([self.length_counts[l] for l in self.lengths])
        self.probs = counts / counts.sum()  # Empirical probabilities

    def sample(self, n=1):
        """
        Sample one or more lengths based on the learned distribution.

        Returns:
            np.ndarray of sampled lengths
        """
        return np.random.choice(self.lengths, size=n, p=self.probs)
length_sampler = LengthSampler([len(seq) for seq in df.loc[df['AMP'] == 1, :]['Sequences']])


# Recreate your amino acid vocab
aa_vocab = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_idx = {aa: i for i, aa in enumerate(aa_vocab)}
idx_to_aa = {i: aa for aa, i in aa_to_idx.items()}

def one_hot_encode_amino_acid(aa, vocab=aa_vocab):
    vec = torch.zeros(len(vocab))
    vec[aa_to_idx[aa]] = 1.0
    return vec

def generate_sequence_from_seed(model, seed, max_length=30, temperature=1.0, device='cpu'):
    model.eval()
    input_seq = [one_hot_encode_amino_acid(aa).to(device) for aa in seed]
    input_tensor = torch.stack(input_seq).unsqueeze(0)  # [1, L, 20]

    generated = seed.copy()

    with torch.no_grad():
        for _ in range(max_length - len(seed)):
            output = model(input_tensor)  # [1, L, vocab]
            logits = output[0, -1, :]  # Last time step → [vocab]

            # Apply temperature and sample
            probs = F.softmax(logits / temperature, dim=-1).cpu().numpy()
            next_idx = np.random.choice(len(aa_vocab), p=probs)
            next_aa = idx_to_aa[next_idx]

            # Update sequence
            next_aa_vec = one_hot_encode_amino_acid(next_aa).to(device).unsqueeze(0).unsqueeze(0)  # [1, 1, 20]
            input_tensor = torch.cat([input_tensor, next_aa_vec], dim=1)
            generated.append(next_aa)

    return ''.join(generated)

# Reload the trained model
gen_model = trained_model
gen_model.to(device)


generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)
model
# Save all generated sequences into a text file
with open("generated_peptides-notrans.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

## No trans

In [42]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

lstm_gen_best_params = {'hidden_dim': 125, 'num_layers': 3, 'dropout': 0.1809741409069741, 'lr': 0.009903431049726066, 'weight_decay': 0.00095634187480499}

# lstm_gen_best_params = {'dropout': 0.13882938162931305, 'lr': 0.001464264209101335, 'weight_decay': 0.0002912839503135125}

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_notrans_generator_lstm.pt"))
gen_model.to(device)

generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(500):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptides-notrans1-500.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: EWPKCECWKPCYSGGG
Generated AMP sequence: AEFDLDGHKQGKKD
Generated AMP sequence: KAAYYSHENKGKDYGKHKKG
Generated AMP sequence: IPWVKYDGDHHHSHHKGRGYPGASNPPGHGYKHEHHPKY
Generated AMP sequence: IANQKQKKKCGDKHKYGHP
Generated AMP sequence: FAAAYSLWRGYK
Generated AMP sequence: ETAAEHSVAKIGGHYHKMHYYGKHHD
Generated AMP sequence: VDADWGGKYGKPGPHKKKGS
Generated AMP sequence: DAAKPAKGKGMHYG
Generated AMP sequence: WAWTGNSKKGGYYK
Generated AMP sequence: QANVVRLDGYKAGK
Generated AMP sequence: CAHNMGPGHP
Generated AMP sequence: ARLKAHGGYHYKQKKKGPHD
Generated AMP sequence: DAWYYHAYKYYVHHTHGKYL
Generated AMP sequence: HAKYAKWPPPHHRRYYKKQK
Generated AMP sequence: IPTFHCKAKKHKDDSHGKPG
Generated AMP sequence: TAAKKLPYKHQHKGDYSLHG
Generated AMP sequence: YAHPNGYPKKIG
Generated AMP sequence: ANAAAATPKGKKGKH
Generated AMP sequence: VIAGLKTLGDKDKYKHKKKY
Generated AMP sequence: HWVAACADNDHSDSHG
Generated AMP sequence: VHTPRGGPSYHHPHYWKPPG
Generated AMP sequence: QDRFWHAYFHYYKKKHHKPPPGKDK

## Frozen

In [15]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

lstm_gen_best_params = {'hidden_dim': 247, 'num_layers': 3, 'dropout': 0.12915638774083413, 'lr': 0.009983517577195865, 'weight_decay': 0.0007901836295730717}

# lstm_gen_best_params = {'dropout': 0.13882938162931305, 'lr': 0.001464264209101335, 'weight_decay': 0.0002912839503135125}

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_frozen_generator_lstm.pt"))
gen_model.to(device)

generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptides-.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: EWPLAAATLPAYSDDD
Generated AMP sequence: AIHFMAGGHRDKLA
Generated AMP sequence: KAAYYSGAPHAIAWFNGKKD
Generated AMP sequence: IQWVLWADAGHFSGFKASAYRDASPPRAGATMGAGGPNV
Generated AMP sequence: IAPQKRIKHAAANGKWEHQ
Generated AMP sequence: FAEAWSNTSDVK
Generated AMP sequence: EVEADHSTAKHDAGYGKPGYYEIGFA
Generated AMP sequence: VHAEWEAIYEPQEPGMNKAS
Generated AMP sequence: DAALPAKDNCPHYA
Generated AMP sequence: WAWVENSKKEAVVN
Generated AMP sequence: QEPVVRNACVMAAP
Generated AMP sequence: CAKPNDPEGQ
Generated AMP sequence: ASNKAGFEYHVMRKLIDPFA
Generated AMP sequence: DAWYWGAWHYYTGGTGCKYP
Generated AMP sequence: HAMYAKVQPPGGSSTWKKRN
Generated AMP sequence: IRVGGALAHKGLAASGAKRD
Generated AMP sequence: TAAKKNPYKGRFHAAYSPHC
Generated AMP sequence: YAKPNFYQKMHE
Generated AMP sequence: AQAAAATPIAIICHH
Generated AMP sequence: VMAGMKTNCANALYLHNHKY
Generated AMP sequence: HYVCAAAAPAGTASFA
Generated AMP sequence: VMTQSFCQSYHGRGWTHQQA
Generated AMP sequence: QHSFVGAWAGYYLMHGGPQRRAIAK

## Full backprop

In [16]:
import random
import numpy as np

# Set global seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Define the amino acid vocabulary
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

def sample_start_amino_acid():
    return random.choice(amino_acids)

lstm_gen_best_params = {'hidden_dim': 247, 'num_layers': 3, 'dropout': 0.12915638774083413, 'lr': 0.009983517577195865, 'weight_decay': 0.0007901836295730717}

# lstm_gen_best_params = {'dropout': 0.13882938162931305, 'lr': 0.001464264209101335, 'weight_decay': 0.0002912839503135125}

# Reload the trained model
gen_model = GenerativeLSTM(
    input_dim=20,
    hidden_dim=lstm_gen_best_params["hidden_dim"],
    num_layers=lstm_gen_best_params["num_layers"],
    dropout=lstm_gen_best_params["dropout"]
)
gen_model.load_state_dict(torch.load("final_amp_generator_lstm.pt"))
gen_model.to(device)

generated_peptides = []
# (Re-run the generation loop to collect sequences)
for x in range(100):
    sampled_length = length_sampler.sample()[0]
    # sampled_length = 20
    start_aa = sample_start_amino_acid()
    seed_sequence = list(start_aa)
    generated_peptide = generate_sequence_from_seed(gen_model, seed_sequence, max_length=sampled_length, temperature=1, device=device)
    generated_peptides.append(generated_peptide)
    print("Generated AMP sequence:", generated_peptide)

# Save all generated sequences into a text file
with open("generated_peptides-fullback1.fasta", "w") as f:
    for i, peptide in enumerate(generated_peptides):
        f.write(f">peptide{i}\n")
        f.write(peptide + "\n")

Generated AMP sequence: EWQMDDAVNPAYRGGG
Generated AMP sequence: AKHFNDGHKRGKKD
Generated AMP sequence: KAAYYSGDPIGKDYHPHKKG
Generated AMP sequence: IRWVMYCGAHHHRHHKGPGYPGDPPPPGHGRKHGHHPLY
Generated AMP sequence: IAQRLRKKIAGDPHKYGHP
Generated AMP sequence: FAFCYSPVRGYK
Generated AMP sequence: EVFAEISTAKHGGHYHKPHYYGKHHG
Generated AMP sequence: VKAFWGDKYGPPGPHPPKGP
Generated AMP sequence: DAAMPAKGPGPHYG
Generated AMP sequence: WAWVFPSLKGFYYP
Generated AMP sequence: QFQVVRPDGYNAGP
Generated AMP sequence: CALPPFPGHP
Generated AMP sequence: ATPLAHGGYHYPPKKKGPHD
Generated AMP sequence: DAWYWHAYIYYSHHSHGKYP
Generated AMP sequence: HANYCKVQPPHHQQWYKKPP
Generated AMP sequence: ISVGHAMAKKHLDDQHGKPG
Generated AMP sequence: TAAKLPQYKHRHHGDYQPHG
Generated AMP sequence: YDLQPGYQKPHG
Generated AMP sequence: ARAAAATPKEKKGHH
Generated AMP sequence: VNAHNKTPGDPDKYKHPHKY
Generated AMP sequence: HYVDAAADPDHSDQHG
Generated AMP sequence: VNVQSGEQRYHHPHYSHPPG
Generated AMP sequence: QKSGVHAYDHYYKPHHHPPPPGKGK