# PHASE 2
This code incorporates the modifications to the SequenceDataset class and tokenization process, enabling the model to handle combined inputs of sequence and property vectors, with the [L] token separating them.

In [1]:
%%capture
!pip install transformers
!pip install datasets
# install this module for extracting info from fas file instead of doing by hand
!pip install biopython

In [None]:
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.optim as optim
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer

In [None]:
# Load the sequence table.
# NOTE(review): the path points under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs -- confirm.
df = pd.read_csv('/content/drive/MyDrive/Sequence DataFrame.csv')

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Pretrained tokenizer shared by both splits
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def _encode_sequences(frame):
    """Tokenize the 'seq' column of `frame`, truncating and padding the batch."""
    return tokenizer(frame['seq'].tolist(), truncation=True, padding=True)


def _property_vectors(frame):
    """Return the 'property' column of `frame` as a plain Python list."""
    # NOTE(review): the dataset later concatenates these values with token ids,
    # so each entry presumably has to be a list of integers -- confirm the CSV schema.
    return frame['property'].tolist()


# Token ids / attention masks for each split
train_encodings = _encode_sequences(train_df)
val_encodings = _encode_sequences(val_df)

# Per-row property vectors for each split
train_property_vectors = _property_vectors(train_df)
val_property_vectors = _property_vectors(val_df)


In [None]:
# Prepare the labels.
# The encoder is fitted on the labels of the whole DataFrame rather than only the
# training split: the split above is not stratified, so a class can end up only in
# the validation split, and LabelEncoder.transform raises ValueError on labels it
# was not fitted on. When every class occurs in the training split the resulting
# integer codes are unchanged.
label_encoder = LabelEncoder()
label_encoder.fit(df['class or file name'].tolist())
train_labels = label_encoder.transform(train_df['class or file name'].tolist())
val_labels = label_encoder.transform(val_df['class or file name'].tolist())

In [None]:
# Define the dataset
class SequenceDataset(torch.utils.data.Dataset):
    """Dataset that joins each tokenized sequence with its property vector.

    Every item is laid out as ``<sequence ids> <separator> <property ids> <padding>``
    with an attention mask of the same length.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask', each indexable
            by sample position and yielding a list of ints.
        labels: Integer class label per sample.
        property_vectors: One property vector per sample.
            NOTE(review): the values are used directly as token ids, so they are
            assumed to be integers valid for the model's embedding table, and all
            vectors are assumed to have the same length so the default DataLoader
            collation can stack them -- confirm against the CSV.
        separator_id: Token id placed between sequence and properties. When
            omitted it is looked up as '[L]' in the module-level ``tokenizer``;
            if '[L]' is not in the vocabulary (the lookup yields the unknown-token
            id) the tokenizer's [SEP] id is used, so the id is always one the
            pretrained embedding table already contains.
    """

    def __init__(self, encodings, labels, property_vectors, separator_id=None):
        self.encodings = encodings
        self.labels = labels
        self.property_vectors = property_vectors

        if separator_id is None:
            # Transformers tokenizers expose convert_tokens_to_ids, not token_to_id.
            separator_id = tokenizer.convert_tokens_to_ids('[L]')
            if separator_id is None or separator_id == tokenizer.unk_token_id:
                separator_id = tokenizer.sep_token_id
        self.separator_id = separator_id

    def __getitem__(self, idx):
        input_ids = list(self.encodings['input_ids'][idx])
        attention_mask = list(self.encodings['attention_mask'][idx])
        property_vector = list(self.property_vectors[idx])

        # Number of real (non-padding) tokens.
        # Assumes right-padding, i.e. the mask is a run of 1s followed by 0s -- TODO confirm.
        n_real = sum(attention_mask)

        # Insert separator + properties before the padding, and extend the mask by
        # the same amount so input_ids and attention_mask stay aligned.
        extra = [self.separator_id] + property_vector
        combined_input = input_ids[:n_real] + extra + input_ids[n_real:]
        combined_mask = attention_mask[:n_real] + [1] * len(extra) + attention_mask[n_real:]

        # NOTE(review): the combined length can exceed the length the tokenizer
        # truncated to; verify it stays within the model's maximum position count.
        item = {
            'input_ids': torch.tensor(combined_input, dtype=torch.long),
            'attention_mask': torch.tensor(combined_mask, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
# Build one SequenceDataset per split from its encodings, labels and property vectors
train_dataset, val_dataset = (
    SequenceDataset(split_encodings, split_labels, split_properties)
    for split_encodings, split_labels, split_properties in (
        (train_encodings, train_labels, train_property_vectors),
        (val_encodings, val_labels, val_property_vectors),
    )
)

In [None]:
# Define the model
class MyModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (size of the logits' last dimension).
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
            Defaults to 'bert-base-uncased', the value previously hard-coded.
        dropout: Dropout probability applied to the pooled output before the
            linear layer. Defaults to 0.1, the value previously hard-coded.
    """

    def __init__(self, num_classes, model_name='bert-base-uncased', dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head maps the encoder's hidden size to one logit per class.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the linear head applied to the encoder's ``pooler_output``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.linear(pooled_output)

In [None]:
# Tunable settings for this cell
SEED = 42             # same value as the random_state used for the train/validation split
BATCH_SIZE = 32
# Fine-tuning a pretrained BERT encoder is normally done with learning rates in the
# 2e-5 .. 5e-5 range; the previous 1e-3 is large enough to wreck the pretrained weights.
LEARNING_RATE = 2e-5

# Seed torch's RNG so the classification head initialisation, dropout masks and
# DataLoader shuffling are repeatable across Restart & Run All.
torch.manual_seed(SEED)

# Set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an instance of the model and move it to the device
model = MyModel(num_classes=len(label_encoder.classes_)).to(device)

# Create data loaders (only the training data is shuffled)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop
def run_epoch(loader, training):
    """Run one pass over `loader` and return ``(mean_loss, accuracy)``.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: Iterable of batches; each batch is a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.
        training: When True the model is put in train mode and the optimizer is
            stepped after every batch; when False the model is put in eval mode
            and gradients are disabled.

    Returns:
        Tuple of the per-sample mean loss and the fraction of correct predictions.
        Both are NaN when the loader yields no samples.
    """
    model.train(training)  # train(False) is equivalent to eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # The criterion returns the batch mean, so weight it by the batch size:
            # averaging batch means directly over-weights a smaller final batch.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    if n_seen == 0:
        # Empty loader: report NaN instead of raising ZeroDivisionError.
        return float('nan'), float('nan')
    return loss_sum / n_seen, n_correct / n_seen


num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_accuracy = run_epoch(train_loader, training=True)
    val_loss, val_accuracy = run_epoch(val_loader, training=False)

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.2%}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Acc: {val_accuracy:.2%}")
