Switch to GPU

In [None]:
#Installs for NVIDIA GeForce RTX 3080
#pip install transformers
#pip install scikit-learn
#pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
#separately download cuda_12.3.1_546.12_windows & follow express install instructions

import torch

# Report the name of CUDA device 0 when one is visible, otherwise note the CPU fallback.
print(
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "No GPU available, using CPU."
)

Hugging Face implementation of [10.34133/research.0004](https://doi.org/10.1101/2020.07.12.199554) by Rostlab/prot_bert 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the aggregated dataset from the Excel workbook.
df = pd.read_excel("Aggregated.xlsx")

# Length of every entry in the 'sequence' column.
sequence_lengths = df['sequence'].map(len)

# Histogram of those lengths (30 bins).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sequence_lengths, bins=30, color='skyblue', edgecolor='black')
ax.set(
    title='Distribution of Sequence Lengths',
    xlabel='Length of Sequences',
    ylabel='Frequency',
)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertModel, TrainingArguments, AutoTokenizer, TrainingArguments, AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import torch.nn.init as init


class ProteinSequenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sequence per item on access.

    Args:
        sequences: Indexable collection of sequences. Each item is converted
            with ``str()`` before being handed to the tokenizer.
        targets: Indexable collection of labels, aligned with ``sequences``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoding is truncated / padded to.
    """

    def __init__(self, sequences, targets, tokenizer, max_length):
        self.sequences = sequences
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Tokenize the sequence at ``idx`` and return it with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer tensors flattened to 1-D) and ``'labels'`` (the target
            as a ``torch.long`` tensor).
        """
        sequence = str(self.sequences[idx])
        target = self.targets[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which takes the same keyword
        # arguments for a single text input.
        encoding = self.tokenizer(
            sequence,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a leading batch dimension; flatten it away
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long)
        }

# Hugging Face checkpoint loaded as the BERT backbone of ProteinClassifier.
PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd_localization'

class ProteinClassifier(nn.Module):
    """Pretrained BERT encoder followed by a dropout + linear head.

    ``forward`` returns raw, unbounded logits of shape determined by the
    linear layer (``n_classes`` outputs per input). The head deliberately has
    no output activation: this notebook trains with ``nn.BCEWithLogitsLoss``
    and applies ``torch.sigmoid`` to the outputs, both of which expect
    unbounded logits. A trailing ``nn.Tanh`` would squash them into [-1, 1],
    capping the sigmoid probabilities to roughly [0.27, 0.73] and keeping the
    loss from going below about 0.313.

    Args:
        n_classes: Number of output units of the linear head.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        # Index 1 stays the Linear layer, so parameter names
        # (classifier.1.weight / classifier.1.bias) are unchanged.
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.4),
            nn.Linear(self.bert.config.hidden_size, n_classes),
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform weights and zero bias for the linear head."""
        init.xavier_uniform_(self.classifier[1].weight)
        init.constant_(self.classifier[1].bias, 0)

    def forward(self, input_ids, attention_mask):
        """Encode the batch with BERT and classify its pooled output.

        Args:
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.

        Returns:
            Logits produced by the classification head from
            ``pooler_output``.
        """
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return self.classifier(output.pooler_output)


# --- Tokenizer and classifier -------------------------------------------
pretrained_model_name = "Rostlab/prot_bert"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = ProteinClassifier(n_classes=1)

# --- Features / labels and hold-out split --------------------------------
X, y = df['sequence'].tolist(), df['label'].values

test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

# Insert a space between every character of each sequence.
X_train_preprocessed = list(map(' '.join, X_train))
X_test_preprocessed = list(map(' '.join, X_test))

# --- Batch tokenization ----------------------------------------------------
max_length = 60  # picked from the sequence-length distribution
# NOTE(review): these two encodings are not referenced again in the visible
# cells; the datasets below tokenize each item themselves.
train_encodings, valid_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    for texts in (X_train_preprocessed, X_test_preprocessed)
)

# --- Torch datasets --------------------------------------------------------
train_dataset, valid_dataset = (
    ProteinSequenceDataset(
        sequences=seqs,
        targets=labels_for_split,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    for seqs, labels_for_split in (
        (X_train_preprocessed, y_train),
        (X_test_preprocessed, y_test),
    )
)

In [None]:
# Run the tokenizer on one space-separated example sequence.
sequence = "S D P K I G D G C F G L P L D H I G S V S G L G C N R P V Q N R P K K"
tokenized_sequence = tokenizer(sequence)

# Show the input next to what the tokenizer produced for it.
for caption, shown in (
    ("Original sequence:", sequence),
    ("Tokenized sequence:", tokenized_sequence),
):
    print(caption, shown)

In [None]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt

# Mini-batch loaders; both are created with shuffle=True.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True)

# Device, loss and optimizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.2)

model.to(device)

num_epochs = 10

# Per-epoch history, consumed by the plots at the end of this cell.
train_losses, test_losses = [], []
train_accuracies, test_accuracies = [], []

for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    epoch_train_loss, correct_train, total_train = 0.0, 0, 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Add a second dimension to the labels before comparing with the outputs.
        labels = batch['labels'].to(device).unsqueeze(1)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        # The loss needs the labels in the same dtype as the model outputs.
        labels = labels.to(outputs.dtype)

        loss = criterion(outputs, labels)
        loss.backward()
        # Gradient clipping is currently disabled:
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Hard predictions: sigmoid of the outputs thresholded at 0.5.
        probabilities = torch.sigmoid(outputs)
        predictions = (probabilities > 0.5).float()

        epoch_train_loss += loss.item()
        correct_train += int((predictions == labels).sum())
        total_train += len(labels)

    # Loss is averaged over batches, accuracy over samples.
    avg_train_loss = epoch_train_loss / len(train_loader)
    train_accuracy = correct_train / total_train
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # ---------------- evaluation pass ----------------
    model.eval()
    epoch_test_loss, correct_test, total_test = 0.0, 0, 0

    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).unsqueeze(1)

            outputs = model(input_ids, attention_mask)
            labels = labels.to(outputs.dtype)

            loss = criterion(outputs, labels)
            epoch_test_loss += loss.item()

            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > 0.5).float()

            correct_test += int((predictions == labels).sum())
            total_test += len(labels)

    avg_test_loss = epoch_test_loss / len(test_loader)
    test_accuracy = correct_test / total_test
    test_losses.append(avg_test_loss)
    test_accuracies.append(test_accuracy)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Testing Loss: {avg_test_loss:.4f}, Testing Accuracy: {test_accuracy:.4f}')


# One row, two panels: loss on the left, accuracy on the right.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
epoch_numbers = range(1, num_epochs + 1)

# Loss curves
axs[0].plot(epoch_numbers, train_losses, label='Training Loss')
axs[0].plot(epoch_numbers, test_losses, label='Testing Loss')
axs[0].set(xlabel='Epoch', ylabel='Loss')
axs[0].legend()

# Accuracy curves
axs[1].plot(epoch_numbers, train_accuracies, label='Training Accuracy')
axs[1].plot(epoch_numbers, test_accuracies, label='Testing Accuracy')
axs[1].set(xlabel='Epoch', ylabel='Accuracy')
axs[1].legend()

plt.show()

In [None]:
# Qualitative check: for every test batch, print the first sequence together
# with its ground-truth and predicted label.

# Put the model in eval mode explicitly so dropout is off even when this cell
# is run on its own rather than straight after the training cell.
model.eval()

# Start the running totals from zero; otherwise each run of this cell would
# keep adding onto whatever values a previous pass left behind.
epoch_test_loss = 0.0
correct_test = 0
total_test = 0

# Inference only: skip building the autograd graph.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels = labels.unsqueeze(1)

        outputs = model(input_ids, attention_mask)
        # The loss needs the labels in the same dtype as the model outputs.
        labels = labels.to(outputs.dtype)

        loss = criterion(outputs, labels)
        epoch_test_loss += loss.item()

        # Hard predictions: sigmoid of the outputs thresholded at 0.5.
        probabilities = torch.sigmoid(outputs)
        predictions = (probabilities > 0.5).float()

        correct_test += (predictions == labels).sum().item()
        total_test += labels.size(0)

        # Print the decoded first sequence of the batch, its ground truth
        # label, and the predicted label.
        original_sequence = tokenizer.decode(input_ids[0], skip_special_tokens=True)
        ground_truth_label = labels[0].item()
        predicted_label = predictions[0].item()
        print(f"Original Sequence: {original_sequence}")
        print(f"Ground Truth Label: {ground_truth_label}")
        print(f"Predicted Label: {predicted_label}")
