<a href="https://colab.research.google.com/github/kla55/transformer/blob/main/simple_transformer_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

In [19]:
def model_prediction(input_text):
  # Load pretrained BERT model and tokenizer
  model_name = 'bert-base-uncased'
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForMaskedLM.from_pretrained(model_name)

  model.eval()

  # Tokenized the input text
  inputs = tokenizer(input_text, return_tensors='pt')

  # Get the logits from the model
  with torch.no_grad():
    outputs = model(**inputs)

  # Extract logits and fine the predition for the [MASK] token
  logits = outputs.logits
  mask_token_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

  predicted_token_id = logits[0, mask_token_index].argmax(dim=-1).item()
  prediction_token = tokenizer.decode(predicted_token_id)
  return prediction_token




In [22]:
input_text = "The capital of France is [MASK]."
predicted_token = model_prediction(input_text)
print(f"Original sentence: {input_text}")
print(f"Predicted sentence: The capital of Malaysia is {predicted_token}.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original sentence: The capital of France is [MASK].
Predicted sentence: The capital of Malaysia is paris.


In [23]:
input_text = "1 + 1 = [MASK]."
predicted_token = model_prediction(input_text)
print(f"Original sentence: {input_text}")
print(f"Predicted sentence: 1 + 1 =  {predicted_token}.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original sentence: 1 + 1 = [MASK].
Predicted sentence: 1 + 1 =  0.


In [38]:
import pandas as pd

# Custom dataset: country-capital pairs
# data = [
#     {"text": "The capital of Malaysia is [MASK].", "label": "Kuala Lumpur"},
#     {"text": "The capital of France is [MASK].", "label": "Paris"},
#     {"text": "The capital of Japan is [MASK].", "label": "Tokyo"},
#     {"text": "The capital of India is [MASK].", "label": "New Delhi"},
# ]
data = [
    {"text": "The capital of Malaysia is [MASK].", "label": "Kuala Lumpur"},
    {"text": "Kuala Lumpur is the capital city of Malaysia.", "label": "Kuala Lumpur"},
    {"text": "Malaysia's capital is [MASK].", "label": "Kuala Lumpur"},
    {"text": "Paris is the capital of France. The capital of Malaysia is [MASK].", "label": "Kuala Lumpur"},
]
# Save as a DataFrame (for simplicity)
df = pd.DataFrame(data)


In [49]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# Custom PyTorch Dataset
class MaskedLMDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]

        # Tokenize the input text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Tokenize the label
        label_id = self.tokenizer.convert_tokens_to_ids(label)

        # Find the index of the [MASK] token
        mask_token_index = (encoding["input_ids"] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

        # Set up labels
        labels = torch.full_like(encoding["input_ids"], -100)  # Ignore all tokens except [MASK]
        labels[0, mask_token_index] = label_id  # Assign the label token ID to [MASK]

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }


# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.mask_token_id)
print(tokenizer.mask_token)
# Create the dataset
dataset = MaskedLMDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


103
[MASK]


In [50]:
from transformers import AutoModelForMaskedLM

# Load the model
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

### Information on the transformer:
A **scheduler** in the context of deep learning and training models refers to a mechanism that adjusts the learning rate during training. The learning rate (LR) determines how much the model’s weights are updated during each step of training. A learning rate scheduler helps manage this rate over time, usually starting with a higher rate and gradually decreasing it to stabilize training.

**get_scheduler()**: This function creates a learning rate scheduler. The scheduler takes the optimizer and adjusts the learning rate over training steps.

**"linear"**: This is the type of learning rate schedule. In this case, it's a linear scheduler. A **linear scheduler** gradually decreases the learning rate from a maximum value to a minimum value over a set number of training steps (typically during the course of the entire training). The learning rate decreases linearly.**

**optimizer=optimizer**: This refers to the optimizer object that was used to train the model (e.g., Adam, AdamW). The scheduler modifies the learning rate that the optimizer uses during training.

# Why AdamW
Decoupling Weight decay\
**In Adam:** The weight decay term is applied to the gradient during the update:\
gradients
=
original gradients
+
weight decay term
gradients=original gradients+weight decay term

**In AdamW:** The weight decay term is applied separately from the gradient update:\
new parameter
=
old parameter
−
learning rate
×
(
gradient
+
weight decay term
)
new parameter=old parameter−learning rate×(gradient+weight decay term)

In [51]:
from torch.optim import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler (optional, e.g., linear decay)
from transformers import get_scheduler

num_training_steps = len(dataloader) * 3  # 3 epochs
num_warmup_steps = 1
#
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)


In [52]:
from torch.nn import CrossEntropyLoss

# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0

    for batch in dataloader:
        # Move data to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Average loss: {avg_loss}")


Epoch 1/3
Average loss: 10.95367431640625
Epoch 2/3
Average loss: 10.622824668884277
Epoch 3/3
Average loss: 0.5249819159507751


In [53]:
# Test the fine-tuned model
model.eval()
input_text = "The capital of Malaysia is [MASK]."

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Make prediction
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1).item()
predicted_token = tokenizer.decode(predicted_token_id)

print(f"Original sentence: {input_text}")
print(f"Predicted sentence: The capital of Malaysia is {predicted_token}.")


Original sentence: The capital of Malaysia is [MASK].
Predicted sentence: The capital of Malaysia is [UNK].


# Embedding kuala lumpur into the vocab tokenizer

### Key Takeaways
- Why Random Initialization?: New tokens are not part of the pre-trained vocabulary, so their embeddings need to be initialized randomly before they can be trained.
- How It Works: We resize the embedding layer, then manually initialize the embeddings for the new tokens.
- Why Slicing: The slicing operation ensures that we only modify the embeddings for the new tokens added to the vocabulary.

In [91]:
# Add "Kuala Lumpur" to tokenizer vocabulary
new_tokens = ["Kuala Lumpur"]
tokenizer.add_tokens(new_tokens)

# Resize the model embedding to match the new vocabulary size
model.resize_token_embeddings(len(tokenizer))
# manual initialization of the embedding weights for the newly added tokens.
# model.get_input_embeddings() - This function returns the input embedding layer of the model.
# weight.data This refers to the actual weights of the embedding layer (i.e., the embedding vectors for each token). .data accesses the raw tensor containing these weights.
# [-len(new_tokens):] - This slice operation selects the weights corresponding to the new tokens that were added to the tokenizer.
# The slice [-len(new_tokens):] selects the last len(new_tokens) rows of the embedding weight matrix. This corresponds to the newly added tokens, as they are typically added at the end of the vocabulary.
# torch.randn(len(new_tokens), model.config.hidden_size) - This creates a tensor of shape (len(new_tokens), model.config.hidden_size) filled with random numbers drawn from a normal distribution with mean 0 and variance 1.
# model.config.hidden_size refers to the dimensionality of the token embeddings (e.g., for BERT-base, this is 768).

#When you add new tokens to the tokenizer, the model’s embedding layer needs to have embeddings for those tokens. Since these tokens were not part of the original vocabulary (and hence were not part of the pretraining), their embeddings are not initialized by the pretrained weights. Instead, they are initialized randomly.
# By setting the embeddings to random values using torch.randn(), we give the model the opportunity to learn useful embeddings for these tokens during fine-tuning. This helps the model adjust its representations of the new tokens during training.
model.get_input_embeddings().weight.data[-len(new_tokens):] = torch.randn(len(new_tokens), model.config.hidden_size)

# Now "Kuala Lumpur" will not be tokenized as [UNK]
print(tokenizer.tokenize("Kuala Lumpur"))

dataset = MaskedLMDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

['kuala lumpur']


In [92]:
print(f"Original vocabulary size: {len(tokenizer)}")
print(f"Embedding size: {model.get_input_embeddings().weight.shape}")

Original vocabulary size: 30523
Embedding size: torch.Size([30523, 768])


In [93]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30523, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [95]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(dataloader) * 3 # 3 epochs
num_warmup_steps = 1
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Fine-tuning loop
model.train()
for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    total_loss = 0

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {total_loss / len(dataloader)}")


Epoch 1
Epoch 1 Loss: 8.025346755981445
Epoch 2
Epoch 2 Loss: 15.467495918273926
Epoch 3
Epoch 3 Loss: 0.0


In [96]:
input_text = "The capital of Malaysia is [MASK]."
inputs = tokenizer(input_text, return_tensors="pt").to(device)

model.eval()
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1).item()
predicted_token = tokenizer.decode(predicted_token_id)

print(f"Original sentence: {input_text}")
print(f"Predicted sentence: The capital of Malaysia is {predicted_token}.")


Original sentence: The capital of Malaysia is [MASK].
Predicted sentence: The capital of Malaysia is Kuala Lumpur.
