## BERT with BPE Tokenizer

In [28]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

import torch.nn as nn
import torch
from torch.optim import Adam

import time

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from sklearn.metrics import matthews_corrcoef, accuracy_score

from DeepLibrary.datasets import *
from DeepLibrary.models import *

results_path = "./BPE_BERT_Pipeline_results"
os.makedirs(results_path, exist_ok=True)

**hyperparameters**

In [39]:
# Tokenizer parameters
voc_size = 1000
max_length = 512
batch_size = 10000

# BERT Model parameters
hidden_size = 128
layers_num = 5
attention_heads_num = 8
epochs = 50 # 300
loss_fn = nn.CrossEntropyLoss()

**Classification metrics**

In [5]:
def calc_metrics_classification(model, generator):
    """
    Calculate classification metrics including loss, Matthews correlation coefficient (MCC),
    and accuracy for a given model and data generator.

    Parameters:
    - model (nn.Module): The PyTorch model for classification.
    - generator: Data generator yielding batches of input features and labels.

    Returns:
    - loss (float): Average loss over the entire dataset.
    - mcc (float): Matthews correlation coefficient.
    - accuracy (float): Classification accuracy.
    """
    
    loss = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for x,y in generator:
            outputs = model(x)  
            y_pred.append(torch.argmax(outputs, dim=1).int().item())
            loss += loss_fn(outputs, y).item()
            y_true.append(y.int().item())
        loss = loss / len(generator)
        mcc = matthews_corrcoef(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
    return loss, mcc, accuracy

**GPU Memory Management and Device Setup**


In [6]:
# Release unoccupied GPU memory
torch.cuda.empty_cache()

# Check if a GPU (CUDA) is available on the machine
# If available, set the device to "cuda"; otherwise, set it to "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device = {device}')

# Creating result folder
os.makedirs(results_path, exist_ok=True)

device = cpu


**Data**

Numeric labels

In [7]:
# Read CSV files into pandas DataFrames
df_train  = pd.read_csv("../random_split/train.csv", index_col=0)
df_dev  = pd.read_csv("../random_split/dev.csv", index_col=0)
df_test  = pd.read_csv("../random_split/test.csv", index_col=0)

# Factorize the 'family_accession' column and create a new 'label_numeric' column with numeric labels
df_train['label_numeric'] = pd.factorize(df_train['family_accession'], sort=True)[0]
df_dev['label_numeric'] = pd.factorize(df_dev['family_accession'], sort=True)[0]
df_test['label_numeric'] = pd.factorize(df_test['family_accession'], sort=True)[0]

# Convert label columns to integer lists
y_train = df_train['label_numeric'].astype(int).tolist()
y_dev = df_dev['label_numeric'].astype(int).tolist()
y_test = df_test['label_numeric'].astype(int).tolist()

# Count the number of different classes
num_of_classes = len(list(set(y_train))) # counts the number different classes

# Extract sequence as string lists
X_train = df_train['sequence'].astype(str).tolist()
X_dev = df_dev['sequence'].astype(str).tolist()
X_test = df_test['sequence'].astype(str).tolist()

Train tokenizer

In [34]:
# Create a Byte Pair Encoding (BPE) tokenizer with an unknown token ("<UNK>")
tokenizer = Tokenizer(BPE(unk_token = "<UNK>"))
# Configure BPE trainer with special tokens and vocabulary size
trainer = BpeTrainer(special_tokens = ["<UNK>"], vocab_size=voc_size)

# Set pre-tokenizer to Whitespace
tokenizer.pre_tokenizer = Whitespace()

if Path(results_path, "tokenizer.json").exists():
    tokenizer = Tokenizer.from_file(str(Path(results_path, "tokenizer.json")))
    print(f"Tokenizer loaded from {Path(results_path, 'tokenizer.json')}.")
else:
    # Train the tokenizer on the training data using batch iterator and trainer
    tokenizer.train_from_iterator(batch_iterator(X_train, batch_size = batch_size), trainer)

# Enable padding for sequences with a specified maximum length
tokenizer.enable_padding(length=max_length)

# Print and save the trained tokenizer
print(f'saving tokenizer to {results_path}...')
tokenizer.save(os.path.join(results_path, "tokenizer.json"))

Tokenizer loaded from Results BPE Bert/tokenizer.json.
saving tokenizer to ./Results BPE Bert...


Encode data

In [40]:
# Encode protein sequences into numerical tokens using the trained tokenizer
X_train_ids = encode(X_train, tokenizer, max_length = max_length)
X_dev_ids = encode(X_dev, tokenizer, max_length = max_length)
X_test_ids = encode(X_test, tokenizer, max_length = max_length)

# Convert encoded sequences and labels to PyTorch tensors
X_train_ids = [torch.tensor(item).to(device) for item in X_train_ids]
y_train = [torch.tensor(item).to(device) for item in y_train]
print(f'loaded train data to device')

# Similar steps for the validation dataset
X_dev_ids = [torch.tensor(item).to(device) for item in X_dev_ids]
y_dev = [torch.tensor(item).to(device) for item in y_dev]
print(f'loaded dev data to device')

# Similar steps for the test dataset
X_test_ids = [torch.tensor(item).to(device) for item in X_test_ids]
y_test = [torch.tensor(item).to(device) for item in y_test]
print(f'loaded test data to device')

# Create PyTorch datasets for training, validation, and testing
train_dataset = Dataset(X_train_ids, y_train)
dev_dataset = Dataset(X_dev_ids, y_dev)
test_dataset = Dataset(X_test_ids, y_test)

  y_train = [torch.tensor(item).to(device) for item in y_train]


loaded train data to device


  y_dev = [torch.tensor(item).to(device) for item in y_dev]


loaded dev data to device


  y_test = [torch.tensor(item).to(device) for item in y_test]


loaded test data to device


**BERT Model**

Model initialization

In [41]:
# Create an instance of the BioBERTModel
model = BioBERTModel(hidden_size, layers_num, attention_heads_num, num_of_classes)

# Move the model to the specified device (CPU or GPU)
model.to(device)
print(f'Model loaded to device')
print(f'Device is {device}')

# Count and print the total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters())
print(f'Number of parameters: {total_params}')

Model loaded to device
Device is cpu
Number of parameters: 9460489


Prepare data

In [42]:
# Create a torch random number generator with a fixed seed for reproducibility
g = torch.Generator()
g.manual_seed(0)

# Create DataLoader instances for training, validation, and testing datasets
# DataLoader is used to efficiently load data in batches for training neural networks
train_generator = torch.utils.data.DataLoader(train_dataset, shuffle=True, num_workers=0, batch_size=8, generator=g)
dev_generator = torch.utils.data.DataLoader(dev_dataset, shuffle=True, num_workers=0, batch_size=1, generator=g)
test_generator = torch.utils.data.DataLoader(test_dataset, shuffle=True, num_workers=0, batch_size=1, generator=g)

Training model

In [43]:
# Initialize the Adam optimizer for updating the model parameters during training
optimizer = Adam(model.parameters(), lr=0.0001, weight_decay=0.00002)

# Initialize an empty list to store results for each epoch
list_of_rows = []

for epoch in range(1, epochs + 1):
    print(f'----- starting epoch = {epoch} -----')
    
    epoch_loss = 0.0
    running_loss = 0.0
    
    # Training
    start_time = time.time()
    model.train()
    
    # Loop through batches in the training generator
    for idx, (x, y) in enumerate(train_generator):
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(x) #attention_mask = ...
        # Compute the loss
        loss = loss_fn(outputs, y)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Print statistics (every 10 batch)
        running_loss += loss.item()
        if idx % 10 == 9:
            end_time = time.time()
            print('[%d, %5d] time: %.3f loss: %.3f' %
                  (epoch, idx + 1, end_time - start_time, running_loss / 10))
            running_loss = 0.0
            start_time = time.time()
        
    # Evaluation on validation and test sets
    model.eval()
    dev_loss, dev_mcc, accuracy_dev = calc_metrics_classification(model, dev_generator)
    test_loss, test_mcc, accuracy_test = calc_metrics_classification(model, test_generator)
    
    # Print and store results for the current epoch
    print(f'epoch = {epoch}, dev_loss = {dev_loss}, val_mcc = {dev_mcc}, test_loss = {test_loss}, test_mcc = {test_mcc}, accuracy_dev = {accuracy_dev}, accuracy_test = {accuracy_test}')
    list_of_rows.append({'epoch': epoch, 'dev_loss': dev_loss, 'dev_mcc': dev_mcc, 'test_loss': test_loss, 'test_mcc': test_mcc, 'accuracy_dev': accuracy_dev, 'accuracy_test': accuracy_test})
    
    # Save the model checkpoint
    torch.save(model.state_dict(), os.path.join(results_path, f"checkpoint_{epoch}.pt"))

# Create a DataFrame from the list of results and save it as a CSV file
df_loss = pd.DataFrame(list_of_rows)
df_loss.to_csv(os.path.join(results_path, f"results.csv"), index=False)


----- starting epoch = 1 -----
[1,    10] time: 57.519 loss: 9.859
[1,    20] time: 933.892 loss: 9.853
[1,    30] time: 935.802 loss: 9.854
[1,    40] time: 904.764 loss: 9.843
[1,    50] time: 59.275 loss: 9.873
[1,    60] time: 63.940 loss: 9.802
[1,    70] time: 68.292 loss: 9.861
[1,    80] time: 70.460 loss: 9.841
[1,    90] time: 71.979 loss: 9.877
[1,   100] time: 73.057 loss: 9.815
[1,   110] time: 72.639 loss: 9.822
[1,   120] time: 74.030 loss: 9.865
[1,   130] time: 74.690 loss: 9.859
[1,   140] time: 75.470 loss: 9.925
[1,   150] time: 78.560 loss: 9.830
[1,   160] time: 76.150 loss: 9.829
[1,   170] time: 80.558 loss: 9.802
[1,   180] time: 79.805 loss: 9.767
[1,   190] time: 76.032 loss: 9.831
[1,   200] time: 77.845 loss: 9.832
[1,   210] time: 1048.439 loss: 9.826
[1,   220] time: 50.697 loss: 9.764
[1,   230] time: 882.498 loss: 9.835
[1,   240] time: 872.550 loss: 9.764
[1,   250] time: 902.252 loss: 9.879
[1,   260] time: 50.634 loss: 9.774
[1,   270] time: 3580.194