Masked LM from scratch

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [4]:
sentences = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "i love to code in python",
    "we are learning about masked language models",
    "the quick brown fox jumps over the lazy dog"
]

In [5]:
words = " ".join(sentences).split()
vocab = list(set(words))
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)
window_size = 2

In [6]:
data = []
for sentence in sentences:
    tokenized_sentence = sentence.split()
    for i, word in enumerate(tokenized_sentence):
        target = word_to_ix[word]

        context = []
        for j in range(i - window_size, i):
            if j >= 0:
                context.append(word_to_ix[tokenized_sentence[j]])
            else:
                context.append(vocab_size)
        for j in range(i + 1, i + 1 + window_size):
            if j < len(tokenized_sentence):
                context.append(word_to_ix[tokenized_sentence[j]])
            else:
                context.append(vocab_size)

        data.append((context, target))

In [7]:
embedding_dim = 100
n_hidden = 64

In [8]:
class SimpleMLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden):
        super(SimpleMLM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size + 1, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * window_size * 2, n_hidden)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(n_hidden, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        out = self.linear1(embeds)
        out = self.activation(out)
        out = self.linear2(out)
        return out

In [9]:
model = SimpleMLM(vocab_size, embedding_dim, n_hidden)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [10]:
for epoch in range(50):
    total_loss = 0
    for context, target in data:
        context_tensor = torch.tensor([context], dtype=torch.long)

        model.zero_grad()

        log_probs = model(context_tensor)

        loss = loss_function(log_probs, torch.tensor([target], dtype=torch.long))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(data):.4f}")

Epoch 10, Loss: 0.2026
Epoch 20, Loss: 0.0408
Epoch 30, Loss: 0.0212
Epoch 40, Loss: 0.0140
Epoch 50, Loss: 0.0103


In [11]:
print("\n--- Inference ---")
test_context = [word_to_ix['the'], word_to_ix['cat'], word_to_ix['on'], word_to_ix['the']]
test_tensor = torch.tensor([test_context], dtype=torch.long)

output = model(test_tensor)
predicted_index = torch.argmax(output, 1).item()
predicted_word = ix_to_word[predicted_index]

print(f"Context: ['the', 'cat', [MASK],'on', 'the','mat']")
print(f"Predicted word for MASK: {predicted_word}")


--- Inference ---
Context: ['the', 'cat', [MASK],'on', 'the','mat']
Predicted word for MASK: sat


Masked LM using Transformer

In [12]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [13]:
model_name = 'distilbert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.eval()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForMaskedLM were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.w

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [14]:
text = f"I went to the store to buy some [MASK] for dinner."
inputs = tokenizer(text, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
print(f"Original sentence: {text}")

Original sentence: I went to the store to buy some [MASK] for dinner.


In [15]:
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = outputs.logits

masked_token_logits = predictions[0, mask_token_index, :]
top_5_tokens = torch.topk(masked_token_logits, 5, dim=1).indices[0].tolist()

In [16]:
print("\nTop 5 predictions for the [MASK] token:")
for token_id in top_5_tokens:
    predicted_token = tokenizer.decode([token_id])
    print(f"-> {predicted_token}")

text_2 = f"The capital of France is [MASK]."
inputs_2 = tokenizer(text_2, return_tensors='pt')
mask_token_index_2 = torch.where(inputs_2['input_ids'] == tokenizer.mask_token_id)[1]

with torch.no_grad():
    outputs_2 = model(inputs_2['input_ids'], attention_mask=inputs_2['attention_mask'])
    predictions_2 = outputs_2.logits

masked_token_logits_2 = predictions_2[0, mask_token_index_2, :]
top_5_tokens_2 = torch.topk(masked_token_logits_2, 5, dim=1).indices[0].tolist()


Top 5 predictions for the [MASK] token:
-> 信
-> ##cans
-> ##gny
-> ւ
-> tehran


In [17]:
print(f"\nOriginal sentence: {text_2}")
print("Top 5 predictions for the [MASK] token:")
for token_id in top_5_tokens_2:
    predicted_token = tokenizer.decode([token_id])
    print(f"-> {predicted_token}")


Original sentence: The capital of France is [MASK].
Top 5 predictions for the [MASK] token:
-> ##gny
-> ##good
-> 見
-> scott
-> ւ


 Text Classification by Fine tuning BERT model.

In [18]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [19]:
texts = [
    "I loved the movie, it was fantastic!",
    "The food was terrible, I would not recommend.",
    "What a brilliant performance, truly amazing.",
    "This is the worst experience I have ever had.",
    "I am very happy with the result.",
    "The service was slow and the staff were rude.",
    "An outstanding achievement in filmmaking.",
    "I was completely disappointed.",
    "This product exceeded all my expectations.",
    "A total waste of time and money."
]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

In [20]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [22]:
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3

In [23]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Training Loss: {avg_train_loss:.4f}")

Epoch 1/3 | Training Loss: 0.7626
Epoch 2/3 | Training Loss: 0.5198
Epoch 3/3 | Training Loss: 0.3053


In [24]:
model.eval()
correct_predictions = 0
total_predictions = 0

In [25]:
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += labels.shape[0]

In [26]:
accuracy = correct_predictions.double() / total_predictions
print(f"\nValidation Accuracy: {accuracy:.4f}")


Validation Accuracy: 1.0000


In [27]:
test_text = "This movie was absolutely wonderful!"
inputs = tokenizer(test_text, return_tensors='pt', padding=True, truncation=True).to(device)

with torch.no_grad():
    logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()

sentiment = "Positive" if predicted_class_id == 1 else "Negative"
print(f"\nSentence: '{test_text}'")
print(f"Predicted Sentiment: {sentiment}")


Sentence: 'This movie was absolutely wonderful!'
Predicted Sentiment: Positive


#CONCLUSION

This assignment successfully demonstrated the fundamental principles and practical applications of modern transformer-based language models. The initial implementation of a Masked Language Model (MLM) from scratch provided foundational insights into the core mechanics of bidirectional context learning on a small scale. Transitioning to a pre-trained Transformer model for the same MLM task highlighted the significant leap in performance and contextual understanding offered by large-scale pre-training. The final exercise, fine-tuning a BERT model for text classification, served as a practical capstone, proving the effectiveness of the transfer learning paradigm. By adapting a model with general linguistic knowledge to a specific downstream task, we validated that the pre-train and fine-tune workflow is a highly efficient and powerful approach for solving real-world Natural Language Processing problems.