# BERT sequence classification

## Load libraries

In [None]:
!pip install transformers

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim

## Data preprocessing

In [3]:
# Define the sequences and labels
x = [
    [[1, 2], [3, 4], [5, 6]],
    [[0, -2], [3, 4]],
    []
]
y = [0, 1, 0]

In [4]:
# Make more data (fake) for example
x = x*100
y = y*100

In [5]:
# Initialize a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
# Get Max length of the sequence. Each element contains 4 tokens
max_seq_length = max(len(seq) for seq in x) * 4

# List of tokenized representations of the input sequences.
input_ids = []
# Listof attention masks (binary values (0 or 1))
attention_masks = []

for seq in x:
    input_seq = []
    attention_mask = []
    for item in seq:
        tokens = [str(num) for num in item]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        input_seq.extend(tokens)
        attention_mask.extend([1] * len(tokens))

    # Pad sequences to the same length
    while len(input_seq) < max_seq_length:
        input_seq.append('[PAD]')
        attention_mask.append(0)

    input_ids.append(tokenizer.convert_tokens_to_ids(input_seq))
    attention_masks.append(attention_mask)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Convert lists to PyTorch tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(y)

## Data loader

In [7]:
# Create a DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
batch_size = 16  # Adjustable

### Train-Validation split

In [8]:
def train_test_split(dataset, train_ratio):
  """Train-test split"""
  train_size = int(train_ratio * len(dataset))
  val_size = len(dataset) - train_size
  train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
  return train_dataset, val_dataset

In [9]:
# Divide the dataset by randomly selecting samples.
torch.manual_seed(0)
train_dataset, val_dataset = train_test_split(dataset, train_ratio=0.9)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Fine-Tuning BERT Model

In [10]:
# Initialize a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and learning rate
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning
num_epochs = 3  # Adjustable
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = batch

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    # # Calculate average loss for this epoch
    # avg_loss = total_loss / len(dataloader)
    # print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')
        # Calculate average training loss for this epoch
        avg_train_loss = total_loss / len(train_dataloader)

        # Validation loop
        model.eval()
        val_loss = 0
        num_val_steps = 0

        for batch in val_dataloader:
            input_ids_batch, attention_masks_batch, labels_batch = batch

            with torch.no_grad():
                outputs = model(input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch)
                val_loss += outputs.loss.item()
                num_val_steps += 1

        avg_val_loss = val_loss / num_val_steps
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Epoch 1/3, Train Loss: 0.0374, Validation Loss: 0.6439
Epoch 1/3, Train Loss: 0.0761, Validation Loss: 0.5557
Epoch 1/3, Train Loss: 0.1106, Validation Loss: 0.4577
Epoch 1/3, Train Loss: 0.1376, Validation Loss: 0.3860
Epoch 1/3, Train Loss: 0.1604, Validation Loss: 0.3523
Epoch 1/3, Train Loss: 0.1807, Validation Loss: 0.3252
Epoch 1/3, Train Loss: 0.1999, Validation Loss: 0.2932
Epoch 1/3, Train Loss: 0.2168, Validation Loss: 0.2664
Epoch 1/3, Train Loss: 0.2320, Validation Loss: 0.2441
Epoch 1/3, Train Loss: 0.2446, Validation Loss: 0.2249
Epoch 1/3, Train Loss: 0.2580, Validation Loss: 0.2098
Epoch 1/3, Train Loss: 0.2713, Validation Loss: 0.1885
Epoch 1/3, Train Loss: 0.2821, Validation Loss: 0.1754
Epoch 1/3, Train Loss: 0.2919, Validation Loss: 0.1629
Epoch 1/3, Train Loss: 0.3009, Validation Loss: 0.1479
Epoch 1/3, Train Loss: 0.3078, Validation Loss: 0.1365
Epoch 1/3, Train Loss: 0.3144, Validation Loss: 0.1289
Epoch 2/3, Train Loss: 0.0091, Validation Loss: 0.1191
Epoch 2/3,

In [13]:
# Save or use the fine-tuned model
torch.save(model.state_dict(), 'bert_sequence_classification.pth')