In [None]:
import transformers

# Record the installed transformers version alongside the notebook output.
print(transformers.__version__)

In [14]:
from datasets import load_dataset

# Identifier handed to load_dataset; its last path component is reused later
# when naming the training output directory.
dataset_name = "uestc-swahili/swahili"

# Load every available split of the corpus.
datasets = load_dataset(path=dataset_name)

In [None]:
# Display the loaded dataset object.
datasets

In [None]:
# Inspect a single record (index 10) of the "train" split.
datasets["train"][10]

In [None]:
from transformers import AutoTokenizer


def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the module-level ``tokenizer``.

    ``examples`` is indexed by the key "text"; whatever is stored there is
    passed to the tokenizer unchanged, and the tokenizer's output is returned
    as-is.
    """
    texts = examples["text"]
    return tokenizer(texts)


# Checkpoint whose tokenizer is used to encode the corpus.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Tokenize in batches across 4 worker processes and drop the raw "text"
# column so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
)

In [None]:
# Display the tokenized dataset object.
tokenized_datasets

In [None]:
# Inspect the tokenizer output for the record at index 10 of the "train" split.
tokenized_datasets["train"][10]

In [20]:
# Sanity check: the rest of the notebook expects the tokenizer to report a
# maximum length of 512.
assert tokenizer.model_max_length == 512

# Length of the fixed-size chunks produced by group_texts.
block_size = tokenizer.model_max_length

In [21]:
def group_texts(examples, chunk_size=None):
    """Concatenate tokenized sequences and re-split them into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example
            sequences (a batch, as passed by ``map(..., batched=True)``).
            Must contain an "input_ids" column.
        chunk_size: Length of every output chunk. When omitted, the
            module-level ``block_size`` is used.

    Returns:
        A dict with the same columns as ``examples``, each holding a list of
        chunks of exactly ``chunk_size`` items, plus a "labels" column that
        is a shallow copy of the "input_ids" list. Trailing items that do
        not fill a whole chunk are dropped.

    Raises:
        ValueError: If the chunk size is less than 1.
    """
    if chunk_size is None:
        chunk_size = block_size
    if chunk_size < 1:
        raise ValueError(f"chunk size must be >= 1, got {chunk_size}")

    # Flatten each column in linear time. ``sum(lists, [])`` re-copies the
    # accumulator on every addition, which is quadratic in the batch length.
    concatenated = {
        name: [item for sequence in examples[name] for item in sequence]
        for name in examples.keys()
    }

    # Use the first column's length, rounded down to a whole number of chunks.
    first_column = next(iter(concatenated.values()), [])
    usable_length = (len(first_column) // chunk_size) * chunk_size

    result = {
        name: [
            flat[start : start + chunk_size]
            for start in range(0, usable_length, chunk_size)
        ]
        for name, flat in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [22]:
# Regroup the tokenized corpus into fixed-size chunks. group_texts receives
# batches of 1000 rows, and the work is spread over 4 worker processes.
lm_datasets = tokenized_datasets.map(
    function=group_texts,
    batched=True,
    batch_size=1_000,
    num_proc=4,
)

In [None]:
# Display the chunked dataset object.
lm_datasets

In [None]:
from transformers import AutoModelForMaskedLM

# Same checkpoint name that was used to load the tokenizer.
model_checkpoint = "xlm-roberta-base"
# Load the pretrained checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Report the model size, rounded to the nearest million parameters.
print(
    f"{model_checkpoint} number of parameters: {round(model.num_parameters() / 1_000_000)}M"
)

In [26]:
from transformers import Trainer, TrainingArguments

In [27]:
# Training configuration. The output directory is named after the checkpoint
# and the last path component of the dataset identifier.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-finetuned-{dataset_name.split('/')[-1]}",
    # Evaluate once at the end of every epoch.
    eval_strategy="epoch",
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # 4 samples per device per step, with gradients accumulated over 4 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
)

In [28]:
from transformers import DataCollatorForLanguageModeling

# Collator that applies masked-language-modeling masking with a 15% masking
# probability when batches are assembled.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [29]:
# Wire the model, the training configuration, the collator and the
# train/validation splits of the chunked corpus into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
import math

# Baseline: evaluate on the validation split before trainer.train() is called.
eval_results = trainer.evaluate()
# Perplexity is reported as exp(eval_loss).
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
import math

# Re-evaluate after training so the result can be compared with the baseline.
eval_results = trainer.evaluate()
# Perplexity is reported as exp(eval_loss).
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Write the fine-tuned model to training_args.output_dir.
trainer.save_model()

# The Trainer was constructed without the tokenizer, so save_model() does not
# write tokenizer files. Save them next to the model so the output directory
# can be reloaded on its own. The trailing ";" suppresses the returned file
# list in the cell output.
tokenizer.save_pretrained(training_args.output_dir);