In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier

import random
import numpy as np
import torch

from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np
import time

from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import optuna

## General AMP

### Data

In [10]:
adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")
# uniprot_df = pd.read_csv("../data/uniprotkb_length_10_TO_80_NOT_antimicro_2025_04_14.fasta.csv")
# uniprot_df1 = pd.read_csv("../data/uniprotkb_length_10_TO_80_NOT_antimicro_2025_04_14.fasta1.csv")
# uniprot_df = pd.concat([uniprot_df, uniprot_df1], ignore_index=True)

#### raw data gen

In [None]:
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load only positive (AMP) sequences
adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")
generation_seqs = adam_df["Sequence"].reset_index(drop=True)
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

class GenerativeSequenceDataset(Dataset):
    def __init__(self, sequences, one_hot_dtype=torch.float32):
        self.sequences = sequences
        self.one_hot_dtype = one_hot_dtype

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences.iloc[idx]
        input_seq = seq[:-1]  # all residues except the last
        target_seq = seq[1:]  # all residues except the first
        length = len(input_seq.replace("X", ""))  # unpadded length
        input_one_hot = one_hot_torch(input_seq, dtype=self.one_hot_dtype)
        target_one_hot = one_hot_torch(target_seq, dtype=self.one_hot_dtype)
        # target_indices = torch.tensor(["ACDEFGHIKLMNPQRSTVWY".index(res) for res in target_seq], dtype=torch.long)
        return input_one_hot, target_one_hot, length

def generative_collate_and_pack(batch):
    sequences, targets, lengths = zip(*batch)

    lengths = torch.tensor(lengths)
    sorted_indices = torch.argsort(lengths, descending=True)
    sequences = [sequences[i] for i in sorted_indices]
    targets = [targets[i] for i in sorted_indices]
    lengths = lengths[sorted_indices]

    sequences = [seq.T for seq in sequences]  # transpose to [seq_len, features]
    targets = [tgt.T for tgt in targets]      # transpose targets as well

    padded_seqs = pad_sequence(sequences, batch_first=False)
    padded_targets = pad_sequence(targets, batch_first=False)

    packed_input = pack_padded_sequence(padded_seqs, lengths.cpu(), batch_first=False)
    packed_target = pack_padded_sequence(padded_targets, lengths.cpu(), batch_first=False)

    return packed_input, packed_target, lengths


# Train/val/test split
train_seqs, test_seqs = train_test_split(generation_seqs, test_size=0.3, random_state=42)
val_seqs, test_seqs = train_test_split(test_seqs, test_size=0.5, random_state=42)

train_dataset = GenerativeSequenceDataset(train_seqs)
val_dataset = GenerativeSequenceDataset(val_seqs)
test_dataset = GenerativeSequenceDataset(test_seqs)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=generative_collate_and_pack)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=generative_collate_and_pack)

# Dataset sizes
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
print("Dataset sizes:", dataset_sizes)

for x, y, l in train_loader:
    print("Input shape:", x.data.shape)  # [L, B, 20]
    print("Target shape:", y.data.shape)  # [L, B, 20]
    print("Lengths:", y.batch_sizes)  # Lengths of sequences in the batch
    break

Dataset sizes: {'Train': 2314, 'Validation': 496, 'Test': 496}


#### sliding window data gen

In [None]:
# Re-import required libraries after environment reset
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load only positive (AMP) sequences
adam_df = pd.read_csv("../data/naturalAMPs_APD2024a-ADAM.csv")

# Clean non-standard amino acids
unique_letters = set(''.join(adam_df["Sequence"]))
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
# non_standard_amino_acids = set(unique_letters) - set(amino_acids)
# adam_df = adam_df[~adam_df["Sequence"].str.contains('|'.join(non_standard_amino_acids))]

# Apply sliding window to generate fragments
def generate_fragments(sequences, window_size=15, stride=5):
    fragments = []
    for seq in sequences:
        for start in range(0, len(seq) - window_size + 1, stride):
            fragment = seq[start:start + window_size]
            fragments.append(fragment)
    return fragments

generation_fragments = generate_fragments(adam_df["Sequence"].tolist())

# Define one-hot encoding function
def one_hot_torch(seq: str, dtype=torch.float32):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    aa_bytes = torch.ByteTensor(list(bytes(amino_acids, "utf-8")))
    arr = torch.zeros(len(amino_acids), len(seq_bytes), dtype=dtype)
    for i, aa in enumerate(aa_bytes):
        arr[i, seq_bytes == aa] = 1
    return arr

# Dataset using one-hot encoding for generative modeling
class AMPGenerationOneHotDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        input_seq = seq[:-1]  # all residues except the last
        target_seq = seq[1:]  # all residues except the first
        input_one_hot = one_hot_torch(input_seq)  # shape: [20, seq_len - 1]
        target_one_hot = one_hot_torch(target_seq)  # shape: [20, seq_len - 1]
        length = input_one_hot.shape[1]
        return input_one_hot, target_one_hot, length

# Collate function for packing one-hot sequences
def collate_and_pack_for_generation(batch):
    sequences, targets, lengths = zip(*batch)
    lengths = torch.tensor(lengths)

    # Sort by length (required for packing)
    sorted_indices = torch.argsort(lengths, descending=True)
    sequences = [sequences[i] for i in sorted_indices]
    targets = [targets[i] for i in sorted_indices]
    lengths = lengths[sorted_indices]

    # Transpose each to [L, 20] and pad
    sequences = [seq.T for seq in sequences]  # from [20, L] to [L, 20]
    targets = [tgt.T for tgt in targets]      # from [20, L] to [L, 20]

    padded_seqs = pad_sequence(sequences, batch_first=False)  # [L, B, 20]
    padded_targets = pad_sequence(targets, batch_first=False)  # [L, B, 20]

    packed_input = pack_padded_sequence(padded_seqs, lengths.cpu(), batch_first=False)
    packed_target = pack_padded_sequence(padded_targets, lengths.cpu(), batch_first=False)

    return packed_input, packed_target, lengths

# Train/val/test split
train_seqs, test_seqs = train_test_split(generation_fragments, test_size=0.3, random_state=42)
val_seqs, test_seqs = train_test_split(test_seqs, test_size=0.5, random_state=42)

train_dataset = AMPGenerationOneHotDataset(train_seqs)
val_dataset = AMPGenerationOneHotDataset(val_seqs)
test_dataset = AMPGenerationOneHotDataset(test_seqs)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_and_pack_for_generation)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_and_pack_for_generation)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_and_pack_for_generation)

# Dataset sizes for verification
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}
print("Dataset sizes:", dataset_sizes)

for x, y, l in train_loader:
    print("Input shape:", x.data.shape)  # [L, B, 20]
    print("Target shape:", y.data.shape)  # [L, B, 20]
    print("Lengths:", y.batch_sizes)  # Lengths of sequences in the batch
    break

Dataset sizes: {'Train': 10392, 'Validation': 2227, 'Test': 2227}


### model

In [79]:
class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3, output_dim=20):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, packed_input):
        packed_output, _ = self.lstm(packed_input)
        dropped = self.dropout(packed_output.data)
        logits = self.fc(dropped)
        return logits  # shape: [total_timesteps, 20]



### test

In [80]:
model = GenerativeLSTM()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

# Train and eval functions
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for packed_input, packed_target, _ in dataloader:
        inputs = packed_input.to(device)
        targets = torch.argmax(packed_target.data, dim=1).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for packed_input, packed_target, _ in dataloader:
            inputs = packed_input.to(device)
            targets = torch.argmax(packed_target.data, dim=1).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

# Final test evaluation
test_loss = evaluate(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")


Epoch 1/10 - Train Loss: 2.8637 - Val Loss: 2.8260
Epoch 2/10 - Train Loss: 2.8100 - Val Loss: 2.8008
Epoch 3/10 - Train Loss: 2.7908 - Val Loss: 2.7863
Epoch 4/10 - Train Loss: 2.7726 - Val Loss: 2.7596
Epoch 5/10 - Train Loss: 2.7420 - Val Loss: 2.7366
Epoch 6/10 - Train Loss: 2.7251 - Val Loss: 2.7261
Epoch 7/10 - Train Loss: 2.7148 - Val Loss: 2.7164
Epoch 8/10 - Train Loss: 2.7054 - Val Loss: 2.7068
Epoch 9/10 - Train Loss: 2.6965 - Val Loss: 2.6990
Epoch 10/10 - Train Loss: 2.6893 - Val Loss: 2.6951
Test Loss: 2.6848


## Model

In [100]:
from torch.nn.utils.rnn import pad_packed_sequence


# class GenerativeLSTM(nn.Module):
#     def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3, output_dim=20):
#         super().__init__()
#         self.lstm = nn.LSTM(
#             input_size=input_dim,
#             hidden_size=hidden_dim,
#             num_layers=num_layers,
#             batch_first=False,
#             dropout=dropout if num_layers > 1 else 0
#         )
#         self.dropout = nn.Dropout(dropout)
#         self.fc = nn.Linear(hidden_dim, output_dim)

#     def forward(self, packed_input):
#         packed_output, _ = self.lstm(packed_input)
#         dropped = self.dropout(packed_output.data)
#         logits = self.fc(dropped)
#         return logits
    
#     from torch.nn.utils.rnn import pad_packed_sequence

class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3):
        super(GenerativeLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        if isinstance(x, torch.nn.utils.rnn.PackedSequence):
            packed_output, _ = self.lstm(x)
            unpacked_output, _ = pad_packed_sequence(packed_output, batch_first=True)
            return self.fc(unpacked_output)
        else:
            out, _ = self.lstm(x)
            return self.fc(out)
        
from torch.nn.utils.rnn import pad_packed_sequence

class GenerativeLSTM(nn.Module):
    def __init__(self, input_dim=20, hidden_dim=128, num_layers=1, dropout=0.3):
        super(GenerativeLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Handle packed input
        if isinstance(x, torch.nn.utils.rnn.PackedSequence):
            packed_output, _ = self.lstm(x)
            unpacked_output, _ = pad_packed_sequence(packed_output, batch_first=True)
            return self.fc(unpacked_output)
        else:
            out, _ = self.lstm(x)
            return self.fc(out)



### General AMP - train

In [None]:
# Re-import necessary packages after reset
import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import optuna
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, roc_auc_score
from torch.utils.tensorboard import SummaryWriter


# Criterion

PAD_IDX = -100
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


def compute_last_token_loss(output, target_seq, criterion):
    """
    Computes cross-entropy loss on the last time step of each sequence.
    
    Args:
        output: Tensor of shape [B, L, vocab_size]
        target_seq: Tensor of shape [B, L] containing target class indices
    
    Returns:
        loss: Scalar loss computed only on the last token of each sequence
    """
    # Get last time step for each sequence
    last_token_logits = output[:, -1, :]        # [B, vocab_size]
    last_token_targets = target_seq[:, -1, :]      # [B]
    last_token_targets = torch.argmax(last_token_targets, dim=-1)  #  now shape is [batch_size, seq_len]

    # print('last_token_logits',last_token_logits.shape)
    # print('last_token_targets',last_token_targets.shape)

    return criterion(last_token_logits, last_token_targets)

# Training function
def train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=1e-3, weight_decay=1e-4,
                           device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False, train=True):
    model.to(device)
    if not train:
        model.eval()

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_loss = float('inf')
    log_dir = f"runs-lstm-gen/AMP_LSTM_GEN_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    writer = SummaryWriter(log_dir=log_dir)

    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        for input_seq, target_seq, _ in train_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])      # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            
            # print('target_shape before reshape',target_seq.shape)
            # target_seq = target_seq.reshape(-1)
            # print(f"Output shape: {output.shape}, Target shape: {target_seq.shape}")

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        val_loss, acc, auc = evaluate_model_generation(model, val_loader, criterion, device, verbose)

        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Validation', val_loss, epoch)
        writer.add_scalar('Accuracy/Validation', acc, epoch)
        writer.add_scalar('AUC/Validation', auc if auc != "undefined" else 0.0, epoch)

        if verbose:
            print(f"Epoch [{epoch}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {acc:.4f}, AUC: {auc}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            if train:
                torch.save(model.state_dict(), 'best_model_lstm_generator.pt')

    writer.close()
    return best_val_loss

# Evaluation function
def evaluate_model_generation(model, data_loader, criterion, device='cuda' if torch.cuda.is_available() else 'cpu', verbose=False):
    model.eval()
    all_labels = []
    all_preds = []
    total_loss = 0.0
    with torch.no_grad():
        for input_seq, target_seq, _ in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            output = model(input_seq)
            # output = output.view(-1, output.shape[-1])
            # output = output.reshape(-1, output.shape[-1])  # [B*L, vocab]
            # output = torch.argmax(output, dim=-1)  # now shape is [batch_size, seq_len]

            # # target_seq = target_seq.view(-1)
            if isinstance(target_seq, torch.nn.utils.rnn.PackedSequence):
                target_seq, _ = pad_packed_sequence(target_seq, batch_first=True)
            # # target_seq = target_seq.view(-1)
            # # target_seq = target_seq.reshape(-1)
            # # target_seq = target_seq.reshape(-1, target_seq.shape[-1])
            # target_seq = torch.argmax(target_seq, dim=-1)  #  now shape is [batch_size, seq_len]

            # assert output.size(0) == target_seq.size(0), f"Mismatch: {output.size(0)} vs {target_seq.size(0)}"

            # loss = criterion(output, target_seq)
            loss = compute_last_token_loss(output, target_seq, criterion)
            # print('loss done')
            total_loss += loss.item()
            
            
            preds = output[:, -1, :]        # shape: [B, vocab_size]
            preds = torch.argmax(preds, dim=1)  # shape: [B]

            targets = target_seq[:, -1, :]      # shape: [B, vocab_size]
            targets = torch.argmax(targets, dim=-1)  # shape: [B]
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(all_labels, all_preds)
    try:
        auc = roc_auc_score(
            torch.nn.functional.one_hot(torch.tensor(all_labels), num_classes=20),
            torch.nn.functional.one_hot(torch.tensor(all_preds), num_classes=20),
            multi_class='ovr', average='macro'
        )
    except:
        auc = "undefined"

    return avg_loss, acc, auc

# Objective for Optuna tuning
def objective_generation(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3)

    model = GenerativeLSTM(hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout)
    val_loss = train_model_generation(model, train_loader, val_loader, num_epochs=10, lr=lr, weight_decay=weight_decay, verbose=False)
    return val_loss

study = optuna.create_study(direction="minimize")
study.optimize(objective_generation, n_trials=20)

lstm_gen_best_params = study.best_trial.params

[I 2025-04-21 00:07:02,076] A new study created in memory with name: no-name-8c2f498f-e0ce-40b2-bf7a-81b25e572e07


In [91]:
!tensorboard --logdir runs-lstm-gen

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.19.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
