<a href="https://colab.research.google.com/github/human-ai2025/NLP-Codes/blob/master/train_a_new_language_model_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
!pip install datasets -q
!pip install datasets evaluate transformers[sentencepiece] -q
!pip install accelerate -q

# Libraries

In [2]:
import datasets
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    ByteLevelBPETokenizer
)
import os
import math

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments



# Build Model

## Load the dataset

In [3]:
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_la', split="train")



## Tokenizer

In [4]:
with open("la_files.txt", "w", encoding="utf-8") as f:
    for i in range(len(dataset)):
        f.write(dataset[i]["text"] + "\n")

Roberta used BPE tokenization method

In [5]:
# Here we keep the vocab size of 20k
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=["/content/la_files.txt"], vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [6]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding)

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [8]:
# os.mkdir('./la')
tokenizer.save_model('la') 

['la/vocab.json', 'la/merges.txt']

## MLM

In [50]:
config = RobertaConfig(
    vocab_size=20000,
    max_position_embeddings=514,
    num_attention_heads=6,
    num_hidden_layers=2,
    type_vocab_size=1,
)

In [51]:
tokenizer = RobertaTokenizerFast.from_pretrained("./la", max_len=512, truncation=True)

In [52]:
model = RobertaForMaskedLM(config=config)

In [58]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if len(result) > 512:
      print(result)
      result = result[:512]
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["id", "text"]
)
tokenized_datasets

Map:   0%|          | 0/18808 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 18808
})

In [59]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [60]:
train_size = 10000
test_size = int(0.1 * train_size)

downsampled_dataset = dataset.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'text'],
        num_rows: 1000
    })
})

In [61]:
batch_size = 8
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size

training_args = TrainingArguments(
    output_dir=f"la_out",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [62]:
%%time
trainer.train()

  return table.fast_gather(key % table.num_rows)


In [None]:
trainer.save_model("./la_model")