In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier

import random
import numpy as np
import torch

def set_seed(seed_value=42):
    """Set seed for reproducibility across numpy, torch, and random."""
    random.seed(seed_value)  # Python's built-in random module
    np.random.seed(seed_value)  # NumPy random seed

    torch.manual_seed(seed_value)  # PyTorch random seed for CPU
    torch.cuda.manual_seed(seed_value)  # PyTorch random seed for CUDA
    torch.cuda.manual_seed_all(seed_value)  # PyTorch seed for all GPUs

    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior in cuDNN
    torch.backends.cudnn.benchmark = False  # Disables cuDNN auto-optimization

# Usage
set_seed(42)  # Call this function before running models or data splits to ensure reproducibility

device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [None]:
# Re-import necessary libraries after execution state reset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split


df = pd.read_csv('all_seq722.csv')

# Define One-Hot Encoding Function for DNA Sequences in PyTorch
def one_hot_torch(seq: str, dtype=torch.float32):
    seq_bytes = torch.ByteTensor(list(bytes(seq, "utf-8")))
    acgt_bytes = torch.ByteTensor(list(bytes("ACGT", "utf-8")))
    arr = torch.zeros(4, (len(seq_bytes)), dtype=dtype)
    arr[0, seq_bytes == acgt_bytes[0]] = 1  # 'A'
    arr[1, seq_bytes == acgt_bytes[1]] = 1  # 'C'
    arr[2, seq_bytes == acgt_bytes[2]] = 1  # 'G'
    arr[3, seq_bytes == acgt_bytes[3]] = 1  # 'T'
    return arr

# Define custom dataset class with transformation

# Updating the Dataset class with the OneHotEncoder function at the end
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels, one_hot_dtype=torch.int8):
        self.sequences = sequences  # Raw sequences
        self.labels = labels  # Labels
        self.one_hot_dtype = one_hot_dtype  # Data type for one-hot encoding

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seqs_comb = self.sequences[idx]  # Get sequence
        amp_label = self.labels[idx]  # Get corresponding label

        # Apply one-hot encoding transformation at the end
        return one_hot_torch(seqs_comb, dtype=self.one_hot_dtype), torch.tensor(amp_label)  # Return transformed sequence and label


# Convert dataset into PyTorch Dataset

X = df["Sequences"]
y = df["AMP"]
# Split into train (70%), validation (15%), test (15%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Step 2: Split train+val into train and val (stratified)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)  # 0.1765 to maintain 15% of original dataset

# Convert back to PyTorch datasets
train_dataset = SequenceDataset(X_train, y_train)
val_dataset = SequenceDataset(X_val, y_val)
test_dataset = SequenceDataset(X_test, y_test)

# Define DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Display dataset sizes
dataset_sizes = {
    "Train": len(train_dataset),
    "Validation": len(val_dataset),
    "Test": len(test_dataset)
}


In [14]:
X[0]
one_hot_torch(X[0], dtype=torch.int8)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int8)

In [10]:
for x,y in train_loader:
    print(x)
    print(y)
    break

KeyError: 332

## basic lstm

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix
import numpy as np

# Define the Basic LSTM Model
class BasicLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BasicLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return self.sigmoid(out)

# Function to calculate specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

# Training and Evaluation Function
def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_,num_epochs=100):
    writer = SummaryWriter(tensorboard_)
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)

        # Evaluation
        model.eval()
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for sequences, labels in val_loader:
                outputs = model(sequences)
                preds = (outputs > 0.5).float()
                all_labels.extend(labels.numpy())
                all_preds.extend(preds.numpy())
        
        all_labels = np.array(all_labels)
        all_preds = np.array(all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        auc = roc_auc_score(all_labels, all_preds)
        sensitivity = recall_score(all_labels, all_preds)
        specificity = specificity_score(all_labels, all_preds)

        writer.add_scalar('Metrics/Accuracy', accuracy, epoch)
        writer.add_scalar('Metrics/AUC', auc, epoch)
        writer.add_scalar('Metrics/Sensitivity', sensitivity, epoch)
        writer.add_scalar('Metrics/Specificity', specificity, epoch)
        if epoch % 5 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Sensitivity: {sensitivity:.4f}, Specificity: {specificity:.4f}')
    
    writer.close()

# Hyperparameters
input_size = 4       # For one-hot encoded DNA sequences (A, C, G, T)
hidden_size = 128    # Number of features in the hidden state
num_layers = 2       # Number of stacked LSTM layers
output_size = 1      # Binary classification (AMP or not)
batch_size = 16
num_epochs = 10
learning_rate = 0.001
tensorboard_ = 'runs/basic_lstm'


# Model, Loss, Optimizer
model = BasicLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train and Evaluate
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, tensorboard_,num_epochs)



KeyError: 214