<a href="https://colab.research.google.com/github/human-ai2025/NLP-Codes/blob/master/tokenizerAndMLMfromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
!pip install datasets evaluate transformers[sentencepiece] -q
!pip install accelerate -q

# Libraries

In [2]:
import datasets
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    ByteLevelBPETokenizer
)
import os
import math

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments



# Build Model

## Load the dataset

In [3]:
# Load the "imdb" dataset through the `datasets` library.
imdb_dataset = datasets.load_dataset("imdb")
# Bare last expression so the notebook displays the summary of the splits.
imdb_dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Tokenizer

In [4]:
# Dump the training reviews to a plain-text file, one review per line, so the
# tokenizer trainer can read the corpus from disk.
with open("imdb_dataset.txt", "w", encoding="utf-8") as f:
    # Iterate over the "text" column directly instead of indexing the dataset
    # row by row, which materialises a full row for every single access.
    for review in imdb_dataset["train"]["text"]:
        f.write(review + "\n")

RoBERTa uses byte-level Byte-Pair Encoding (BPE) for tokenization, so we train a byte-level BPE tokenizer here.

In [5]:

# Train a byte-level BPE tokenizer from scratch with a 10k-token vocabulary
# (this has to match `vocab_size` in the RobertaConfig built for the model).
tokenizer = ByteLevelBPETokenizer()

# Customize training. The corpus is read from the same relative path it was
# written to, instead of a Colab-only absolute "/content/..." path.
tokenizer.train(
    files=["imdb_dataset.txt"],
    vocab_size=10000,
    min_frequency=2,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

In [6]:
# Sanity-check the freshly trained tokenizer on a short sentence and print its sub-word tokens.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['Let', "'s", 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'iz', 'er', '.']


In [8]:
# Persist the trained tokenizer files (vocab.json and merges.txt) under ./la.
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the directory is already there from a previous run.
os.makedirs('./la', exist_ok=True)
tokenizer.save_model('la')

['la/vocab.json', 'la/merges.txt']

## MLM

In [9]:
# Configuration for a small RoBERTa model that is trained from scratch.
config = RobertaConfig(
    vocab_size=10000,  # same vocabulary size the tokenizer was trained with
    max_position_embeddings=514,  # NOTE(review): presumably 512 tokens plus RoBERTa's position offset of 2 — confirm
    num_attention_heads=6,
    num_hidden_layers=2,
    type_vocab_size=1,
)

In [10]:
# Reload the saved vocab/merges from ./la as a Transformers fast tokenizer.
# `model_max_length` is the documented keyword for the length limit; the
# previous `max_len` spelling is only a legacy alias.
tokenizer = RobertaTokenizerFast.from_pretrained("./la", model_max_length=512, truncation=True)

In [11]:
# Build the masked-LM model directly from the config (constructed here, not loaded via from_pretrained).
model = RobertaForMaskedLM(config=config)

In [12]:
def tokenize_function(examples):
    """Tokenize a batch of examples and attach per-token word ids.

    Args:
        examples: batch mapping with a "text" entry, as handed over by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer's batch encoding. When the tokenizer is a fast one, a
        "word_ids" entry is added holding `word_ids(i)` for every sequence
        in the batch.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded
    num_sequences = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(num_sequences)))
    return encoded


# Tokenize every split batch-wise (batched=True hands the tokenizer many texts
# per call) and drop the raw columns so only tokenizer outputs remain.
tokenized_datasets = imdb_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "label"],
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [13]:
# Number of tokens per chunk when the concatenated reviews are split up below.
chunk_size = 128

In [14]:
# Slicing the split yields, for each feature, a list with one entry per review
tokenized_samples = tokenized_datasets["train"][:3]

# Report how many tokens each of the sampled reviews has.
for idx in range(len(tokenized_samples["input_ids"])):
    sample = tokenized_samples["input_ids"][idx]
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 387'
'>>> Review 1 length: 324'
'>>> Review 2 length: 128'


In [15]:
# Glue the per-review lists of every feature together into one long list.
concatenated_examples = {
    feature: sum(per_review, [])
    for feature, per_review in tokenized_samples.items()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 839'


In [16]:
# Cut every concatenated feature into consecutive pieces of chunk_size tokens;
# the final piece simply holds whatever is left over.
chunk_starts = range(0, total_length, chunk_size)
chunks = {
    k: [t[start : start + chunk_size] for start in chunk_starts]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 71'


In [17]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and split it into fixed-size chunks.

    Args:
        examples: mapping from feature name to a list of per-example lists;
            it must contain an "input_ids" entry.
        block_size: number of items per chunk. When omitted, the notebook-level
            `chunk_size` is used, so `Dataset.map(group_texts, batched=True)`
            keeps working unchanged.

    Returns:
        A dict with the same features, each a list of `block_size`-long
        chunks, plus a "labels" entry that is a copy of the "input_ids" chunk
        list. Items left over after the last full chunk are dropped.
    """
    if block_size is None:
        block_size = chunk_size
    # Concatenate all texts in a single pass; sum(lists, []) re-copies the
    # growing list on every addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
# Apply group_texts batch-wise to every split, so each row becomes one
# fixed-size chunk with a matching "labels" column.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 63370
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 62200
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 127758
    })
})

In [19]:
# Collator for masked language modeling; 0.20 is passed as the masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.20)

In [20]:
# Take the first two training chunks and strip their "word_ids" entry before
# handing them to the collator.
# NOTE(review): presumably the collator cannot batch that column — confirm.
samples = []
for i in range(2):
    sample = lm_datasets["train"][i]
    del sample["word_ids"]
    samples.append(sample)

# Decode the collated input ids to show what the masked inputs look like.
masked_batch = data_collator(samples)
for chunk in masked_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s>I<mask> I<mask><mask> CURIOUS-YELLOW from my video store because<mask> all the controversy that surrounded it<mask> it was first released in 1967. I also heard that<mask><mask> it was seized<mask> U.S. customs if<mask> ever tried to enter this country<mask> therefore being a fan of films considered "controvers<mask>" I really had to see this for myself.<br /><<mask><mask>The<mask> is centered<mask><mask> young<mask> drama student named<mask> who wants to learn everything she can about life. In particular she wants to<mask> her attentions<mask><mask> some sort of<mask><mask>'

'>>>  what the average Swed<mask> thought about<mask><mask> issues such asiting Vietnam War and race issues in<mask> United<mask><mask><mask> between asking politicians and<mask> denizens<mask> Stockholm about their opin exec on politics, she has sex<mask> her drama teacher, classmates, and<mask> men.<br /><br />What kills me about<mask> AM CURIOUS-YELLOW<mask> that 40<mask> ago, this was considered<mask>

In [21]:
# Down-sample the chunked training split into a small train/test pair;
# the test part is a tenth of the training part.
train_size = 10000
test_size = int(0.1 * train_size)

split_kwargs = dict(train_size=train_size, test_size=test_size, seed=42)
downsampled_dataset = lm_datasets["train"].train_test_split(**split_kwargs)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [22]:
# TrainingArguments is already imported in the Libraries cell, so it is not
# re-imported here.
batch_size = 64
# Show the training loss roughly once per epoch. max(1, ...) keeps the value
# valid even when the training set is smaller than a single batch.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir="RobertA-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): mixed precision presumably requires a CUDA GPU runtime —
    # confirm before running this cell on CPU.
    fp16=True,
    logging_steps=logging_steps,
)

In [23]:
# Trainer is already imported in the Libraries cell, so it is not re-imported
# here. Wire the model, arguments, down-sampled splits and MLM collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [24]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,7.838,7.222185
2,7.1219,7.023505
3,6.9991,6.990989


TrainOutput(global_step=471, training_loss=7.317707134659883, metrics={'train_runtime': 56.1305, 'train_samples_per_second': 534.469, 'train_steps_per_second': 8.391, 'total_flos': 340517560320000.0, 'train_loss': 7.317707134659883, 'epoch': 3.0})

In [25]:
# Save the trained model under ./la_model.
trainer.save_model("./la_model")

In [26]:
# Evaluate on the held-out split and report perplexity, computed as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 1062.44
