In [1]:
import math
import os
import random
from datetime import datetime

import pandas as pd
import transformers
from datasets import ClassLabel, load_dataset
from IPython.display import HTML, display
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

import wandb

In [None]:
# Log in to Weights & Biases; the Trainer configured below reports to it
# (report_to="wandb").
wandb.login()

In [3]:
# Configure the W&B integration through environment variables.
# WANDB_PROJECT names the project the run is filed under.
# NOTE(review): WANDB_LOG_MODEL="checkpoint" presumably makes the integration
# upload saved checkpoints as artifacts — confirm against the installed
# wandb/transformers versions, since this can use a lot of storage.
os.environ["WANDB_PROJECT"] = "LLMs and African Language"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [4]:
# Timestamp of this run (YYYY-MM-DD_HH-MM-SS, from the naive local clock); used
# at the end of the notebook to name the directory the model is saved into.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [5]:
# Load the Swahili corpus; the result is indexed by split name in later cells.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to run locally — keep it only for repositories you trust.
# NOTE: the variable `datasets` has the same name as the package the imports
# come from; the package itself is never bound to that name in this notebook.
dataset_name = "uestc-swahili/swahili"
datasets = load_dataset(dataset_name, trust_remote_code=True)

In [None]:
# Display the loaded dataset object as the cell output.
datasets

In [None]:
# Peek at the first record of the train split.
datasets["train"][0]

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Render distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()`, indexing with a list of row
            indices (returning column-oriented data accepted by
            `pd.DataFrame`), and a `.features` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` exceeds the number of rows.
        ValueError: If `num_examples` is negative.
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a retry loop or a linear membership test per draw.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show the class names instead of the integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


# Show 10 random raw training examples (the function's default count).
show_random_elements(datasets["train"])

In [None]:
def tokenize_function(examples):
    """Run the module-level `tokenizer` over the "text" column of a batch.

    Args:
        examples: Mapping with a "text" entry holding the texts to tokenize.

    Returns:
        Whatever `tokenizer` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)


# Checkpoint shared by the tokenizer here and the model loaded further down.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# Tokenize every split in batches using 4 worker processes; the raw "text"
# column is dropped so only the tokenizer's output columns remain.
tokenized_datasets = datasets.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=["text"]
)

In [None]:
# Display the tokenized dataset object as the cell output.
tokenized_datasets

In [None]:
# Peek at the tokenized record at index 10 of the train split.
tokenized_datasets["train"][10]

In [None]:
# Show 10 random tokenized training examples (the function's default count).
show_random_elements(tokenized_datasets["train"])

In [None]:
# Pick the length, in tokens, of the fixed-size blocks the corpus is cut into.
# The tokenizer's maximum length is printed for reference; setting
# `block_size = tokenizer.model_max_length` would use full-length blocks.
print(f"Model maximum block size: {tokenizer.model_max_length}")
block_size = 128
# Examples per `map` batch; matches the batch size of the grouping step below.
batch_size = 1000
print(f"Block size: {block_size}")

# Report the token count of each split and roughly how many blocks it yields.
# The loop variable is deliberately not called `type`: a module-level loop
# variable stays bound afterwards and would shadow the builtin in later cells.
for split_name in ["train", "test", "validation"]:
    num_tokens = sum(
        len(input_ids) for input_ids in tokenized_datasets[split_name]["input_ids"]
    )
    print(
        f"The number of tokens in {split_name}: {num_tokens}, this will be ~{round(num_tokens / block_size)} blocks."
    )

In [14]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and cut it into equal blocks.

    Every column of the batch is flattened into one long list and split into
    consecutive chunks. The trailing remainder that does not fill a whole
    chunk is dropped. A "labels" column is added as a copy of "input_ids".

    Args:
        examples: Mapping of column name to a list of per-example sequences.
            Must contain "input_ids"; the usable length is derived from the
            first column.
        chunk_size: Length of each block. Defaults to the module-level
            `block_size` when omitted.

    Returns:
        dict mapping each input column, plus "labels", to a list of blocks.
    """
    size = block_size if chunk_size is None else chunk_size
    # Flatten in linear time; `sum(list_of_lists, [])` re-copies the growing
    # accumulator on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [token for sequence in examples[k] for token in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a multiple of the block size, discarding the remainder.
    total_length = (total_length // size) * size
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
# Regroup the tokenized examples of every split into fixed-size blocks.
# Grouping happens independently per batch of `batch_size` examples, and each
# batch drops its own incomplete trailing block. The batch size comes from the
# `batch_size` constant defined alongside `block_size` rather than a repeated
# literal, so the two cannot drift apart.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=batch_size,
    num_proc=4,
)

In [None]:
# Display the block-grouped dataset object as the cell output.
lm_datasets

In [None]:
# Show 10 random fixed-size training blocks (the function's default count).
show_random_elements(lm_datasets["train"])

In [None]:
# Load the pretrained masked-LM weights from the same checkpoint the tokenizer
# was loaded from. `model_checkpoint` is defined once, in the tokenization cell
# above; re-assigning it here would let the model and tokenizer drift apart.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Report the model size, rounded to the nearest million parameters.
print(
    f"{model_checkpoint} number of parameters: {round(model.num_parameters() / 1_000_000)}M"
)

In [20]:
# Training configuration. The first positional argument is built from the
# checkpoint name and the last path component of the dataset name, giving
# "xlm-roberta-base-finetuned-swahili" for the values set above.
# NOTE(review): presumably that argument is the output directory, and settings
# not listed here (batch size, epochs, seed) fall back to library defaults —
# confirm against the installed transformers version.
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-{dataset_name.split('/')[-1]}",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="wandb",
)

In [21]:
# Collator for masked language modelling, built on the shared tokenizer.
# NOTE(review): mlm_probability=0.15 is presumably the fraction of tokens
# selected for masking in each batch — confirm in the transformers docs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

In [22]:
# Use a small, reproducibly shuffled subset (at most 1000 blocks per split) so
# this first fine-tuning pass is quick. The count is clamped to the split size
# because selecting indices beyond the end of a split raises an error;
# shuffling keeps the row count, so the unshuffled length is the right bound.
small_train_dataset = (
    lm_datasets["train"]
    .shuffle(seed=42)
    .select(range(min(1000, len(lm_datasets["train"]))))
)
small_validation_dataset = (
    lm_datasets["validation"]
    .shuffle(seed=42)
    .select(range(min(1000, len(lm_datasets["validation"]))))
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_validation_dataset,
    data_collator=data_collator,
)

In [None]:
# Baseline: evaluate on the validation subset before any fine-tuning.
# Perplexity is reported as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Fine-tune the model on the small training subset.
trainer.train()

In [None]:
# Evaluate again after fine-tuning, for comparison with the baseline
# perplexity printed before training. Perplexity is exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [26]:
# Save the fine-tuned model into a directory named after this run's timestamp.
# NOTE(review): the Trainer was constructed without the tokenizer, so
# presumably only the model is written here — confirm whether the tokenizer
# needs to be saved alongside it.
trainer.save_model(f"./{current_time}_first_fine_tuning")

In [None]:
# Close the W&B run now that training and evaluation are done.
wandb.finish()