<a href="https://colab.research.google.com/github/hamagami/is2024/blob/main/13_TransformerPracticalExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Practical Example

This example uses a small but meaningful dataset to demonstrate the training process of a Transformer. While training a Transformer from scratch usually requires a large amount of data, the dataset size has been intentionally kept small here to simplify the learning process. Specifically, we will use the TREC dataset, which is designed for a question classification task. However, please note that this example might not run successfully on the free plan of Google Colab due to resource limitations.

In [None]:
!pip install datasets
# There may be errors at this point, but there should be no issues in the subsequent processing steps.



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# Load the TREC dataset
data = load_dataset("trec")
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# While this cell is running, you may see the prompt "Do you wish to run the custom code?". Type "y" to continue.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Preprocess the data
def preprocess_function(examples):
    """Tokenize TREC examples into fixed-length model inputs.

    Intended to be passed to `Dataset.map`; uses the module-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry (the question text, or a list of
            texts when mapped in batches) and a "coarse_label" entry holding
            the matching label(s).

    Returns:
        dict with "input_ids" and "attention_mask" (padded/truncated to 50
        tokens per example) and "labels" (the coarse labels, passed through).
    """
    # Keep the tokenizer output as plain Python lists. Requesting "pt" tensors
    # and calling .squeeze() collapses the batch axis whenever a mapped batch
    # contains exactly one example, which leaves 50 token "rows" next to a
    # single label and breaks the column lengths.
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "labels": examples["coarse_label"],
    }

# Run the tokenization over both splits in batched mode (train first, then test).
train_data, test_data = (
    data[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
class TrecDataset(Dataset):
    """Map-style torch dataset over column-oriented tokenized data.

    Args:
        data: Object indexable by column name that provides "input_ids",
            "attention_mask" and "labels" columns of equal length (for
            example a tokenized Hugging Face dataset or a dict of lists).
    """

    def __init__(self, data):
        self.data = data
        # Look every column up exactly once. Indexing the container by column
        # name on each __getitem__ call may re-materialise the whole column
        # for every single sample, which makes an epoch quadratic in the
        # dataset size.
        self._input_ids = data["input_ids"]
        self._attention_mask = data["attention_mask"]
        self._labels = data["labels"]

    def __len__(self):
        """Return the number of examples (rows of the "input_ids" column)."""
        return len(self._input_ids)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        "input_ids" and "labels" are int64 tensors; "attention_mask" is a
        float tensor.
        """
        return {
            "input_ids": torch.tensor(self._input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self._attention_mask[idx], dtype=torch.float),
            "labels": torch.tensor(self._labels[idx], dtype=torch.long),
        }


In [None]:
# Wrap the tokenized splits so DataLoader can batch them.
train_dataset, test_dataset = TrecDataset(train_data), TrecDataset(test_data)

# Only the training batches are shuffled; evaluation keeps the stored order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder text classifier with masked mean pooling.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Embedding size, also the encoder's model dimension.
        num_heads: Number of attention heads per encoder layer.
        hidden_dim: Width of the feed-forward sub-layer in each encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Number of output logits.

    NOTE(review): no positional encoding is added to the embeddings, so the
    encoder cannot see token order. Consider adding one.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            # Previously hidden_dim was accepted but ignored, leaving the
            # feed-forward width at the library default.
            dim_feedforward=hidden_dim,
            # forward() feeds [batch, seq, dim]; without batch_first=True the
            # layer reads the batch axis as the sequence axis and attends
            # across different examples.
            batch_first=True,
        )
        # The nested-tensor path is disabled so the encoder output always keeps
        # the full padded sequence length that the pooling below relies on.
        self.transformer = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute class logits.

        Args:
            input_ids: Integer tensor of token ids, [batch_size, sequence_length].
            attention_mask: Tensor of the same shape, non-zero for real tokens
                and 0 for padding.

        Returns:
            Tensor of logits with shape [batch_size, num_classes].
        """
        padding_mask = attention_mask == 0  # True where the token is padding
        embedded = self.embedding(input_ids)  # [batch_size, sequence_length, embed_dim]
        # Exclude padded positions as attention keys instead of merely zeroing
        # their embeddings (zeroed tokens would still be attended to).
        encoded = self.transformer(embedded, src_key_padding_mask=padding_mask)
        # Average over real tokens only, so the amount of padding does not
        # dilute the pooled representation.
        encoded = encoded.masked_fill(padding_mask.unsqueeze(-1), 0.0)
        token_counts = (~padding_mask).sum(dim=1, keepdim=True).clamp(min=1)
        pooled_output = encoded.sum(dim=1) / token_counts
        return self.fc(pooled_output)  # [batch_size, num_classes]


In [None]:
# Hyperparameters of the classifier
vocab_size = tokenizer.vocab_size  # embedding table covers the tokenizer vocabulary
embed_dim, hidden_dim = 128, 256
num_heads, num_layers = 4, 2
num_classes = 6  # number of output logits

# Build the network from the settings above
model = TransformerClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
)

# Objective and parameter update rule
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=5e-4)



In [None]:
# Training loop
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5, device=None):
    """Train `model` and report loss and accuracy after every epoch.

    Args:
        model: Module called as `model(input_ids, attention_mask)` that returns
            per-class scores of shape [batch, num_classes].
        train_loader: Sized iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask" and "labels".
        test_loader: Iterable of batches in the same format, used for accuracy.
        criterion: Loss callable taking `(outputs, labels)`.
        optimizer: Optimizer that updates the parameters of `model`.
        epochs: Number of passes over `train_loader`.
        device: Optional device to move the model to before training. When
            omitted, the model stays where it is and batches are moved to the
            device of its parameters.

    Returns:
        None. One summary line is printed per epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        model.to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            input_ids = _batch_tensor(batch["input_ids"], device)
            attention_mask = _batch_tensor(batch["attention_mask"], device)
            labels = _batch_tensor(batch["labels"], device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation step
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = _batch_tensor(batch["input_ids"], device)
                attention_mask = _batch_tensor(batch["attention_mask"], device)
                labels = _batch_tensor(batch["labels"], device)
                outputs = model(input_ids, attention_mask)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard the averages so an empty loader does not raise ZeroDivisionError.
        mean_loss = train_loss / max(len(train_loader), 1)
        accuracy = 100 * correct / total if total else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {mean_loss:.4f}, Accuracy: {accuracy:.2f}%")


def _batch_tensor(value, device):
    """Return `value` as a tensor on `device`.

    `torch.as_tensor` reuses an existing tensor instead of copying it, which
    avoids the "copy construct from a tensor" UserWarning that
    `torch.tensor(tensor)` emits, while still accepting lists and arrays.
    """
    return torch.as_tensor(value).to(device)


In [None]:
# Train the model. This can take a very long time: on a CPU it may run for more than 5 hours.
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5)

  input_ids = torch.tensor(batch["input_ids"])  # Convert to Tensor
  attention_mask = torch.tensor(batch["attention_mask"])  # Convert to Tensor
  labels = torch.tensor(batch["labels"])  # Convert to Tensor
  input_ids = torch.tensor(batch["input_ids"])  # Convert to Tensor
  attention_mask = torch.tensor(batch["attention_mask"])  # Convert to Tensor
  labels = torch.tensor(batch["labels"])  # Convert to Tensor


Epoch 1/5, Train Loss: 1.6138, Accuracy: 45.00%
Epoch 2/5, Train Loss: 1.4500, Accuracy: 52.20%
Epoch 3/5, Train Loss: 1.3101, Accuracy: 50.60%
Epoch 4/5, Train Loss: 1.1101, Accuracy: 63.80%
Epoch 5/5, Train Loss: 0.9137, Accuracy: 62.80%
