In [1]:
!pip install -Uq apache_beam mwparserfromhell

In [2]:
import math
from itertools import chain

import datasets
from datasets import load_dataset, load_metric

import transformers
import transformers.adapters.composition as ac
from transformers import (
    AdapterTrainer,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    MultiLingAdapterArguments,
    TrainingArguments,
    set_seed,
)
from transformers.adapters.configuration import AdapterConfig
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

NOTE: Redirects are currently not supported in Windows or MacOs.


In [3]:
# Seed the random number generators once, before any model or adapter is created.
SEED = 42069
set_seed(SEED)

In [23]:
# --- Experiment configuration -------------------------------------------------
valid_percentage = 20               # percentage of the dump held out as the validation split
language_code = "am"                # Wikipedia language edition to train on
task_name = language_code + "_mlm"  # name under which the adapter is registered on the model
date = "20221120"                   # Wikipedia dump date

training_args = TrainingArguments(
    language_code + '_output',      # output directory for checkpoints and metrics
    learning_rate=1e-4,
    num_train_epochs=10.0,
    report_to="all",
    # `TrainingArguments.seed` defaults to 42 and `Trainer` re-seeds from it when it is
    # constructed, which would silently override the `set_seed(42069)` call made earlier
    # in the notebook. Pass the same value so the whole run uses one seed.
    seed=42069,
)

PyTorch: setting up devices


In [5]:
# Every load below targets the same Wikipedia dump; only the requested split differs.
wikipedia_kwargs = {"language": language_code, "date": date}

# Load without a split first: this gives the dict-like container that the two
# percentage-based slices are stored into.
raw_datasets = load_dataset("wikipedia", **wikipedia_kwargs)

# The first `valid_percentage` percent of "train" becomes the validation split,
# the remainder replaces "train".
for split_name, split_slice in (
    ("validation", f"train[:{valid_percentage}%]"),
    ("train", f"train[{valid_percentage}%:]"),
):
    raw_datasets[split_name] = load_dataset(
        "wikipedia", split=split_slice, **wikipedia_kwargs
    )

Using custom data configuration 20221120.am-date=20221120,language=am
Reusing dataset wikipedia (/Users/thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/0.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration 20221120.am-date=20221120,language=am
Reusing dataset wikipedia (/Users/thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/0.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Using custom data configuration 20221120.am-date=20221120,language=am
Reusing dataset wikipedia (/Users/thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/0.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


In [7]:
# Tokenizer and masked-LM model are loaded from the same pretrained checkpoint.
checkpoint_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

# Resize the embedding matrix to the tokenizer's length. Kept as the last expression
# so the resulting embedding module is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(250002, 768, padding_idx=1)

In [8]:
# Load the named adapter configuration and register a new adapter under `task_name`.
adapter_config_name = "pfeiffer+inv"
adapter_config = AdapterConfig.load(adapter_config_name)
model.add_adapter(task_name, config=adapter_config)

In [9]:
# Put the model into adapter-training mode for the adapter registered above, then
# make that adapter the active one.
# NOTE(review): presumably `train_adapter` freezes the base model weights — confirm
# against the adapter-transformers documentation.
adapters_to_train = [task_name]
model.train_adapter(adapters_to_train)
model.set_active_adapters(task_name)

In [10]:
# Column that holds the raw article text.
text_column_name = "text"
# Columns of the raw training split; they are passed as `remove_columns` when tokenizing.
column_names = raw_datasets["train"].column_names
# Chunk length used when grouping tokens into fixed-size training examples.
max_seq_length = tokenizer.model_max_length

In [11]:
def tokenize_function(examples):
    """Tokenize one batch of raw examples.

    Reads the `text_column_name` column of `examples` and runs the notebook-level
    `tokenizer` on it, also requesting the special-tokens mask.
    """
    texts = examples[text_column_name]
    return tokenizer(texts, return_special_tokens_mask=True)


# Tokenize every split in batches, dropping the raw columns listed in `column_names`.
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True, remove_columns=column_names
)

  0%|          | 0/11 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1122 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/3 [00:00<?, ?ba/s]

In [12]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of token sequences and re-cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of sequences (one per example),
            as passed by a batched `map` call.
        block_size: Length of each output chunk. Defaults to the notebook-level
            `max_seq_length` when omitted, so `map(group_texts, batched=True)` keeps
            working unchanged; pass it explicitly (e.g. through `fn_kwargs`) to avoid
            relying on notebook state.

    Returns:
        A dict with the same keys, each holding a list of chunks of `block_size` items.
        If the batch contains at least `block_size` items, the trailing remainder is
        dropped; a shorter batch is returned as one short chunk. An empty batch
        yields an empty dict instead of raising.
    """
    if block_size is None:
        block_size = max_seq_length

    # Flatten every column into a single long sequence.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # All columns share the same length, so the first one is representative.
    # The default of [] makes a batch without columns produce an empty result.
    total_length = len(next(iter(concatenated_examples.values()), []))

    # Drop the remainder that does not fill a whole chunk, unless the batch is
    # shorter than one chunk, in which case it is kept as it is.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

# Regroup the tokenized splits into fixed-length chunks, one batch at a time.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [13]:
# Pull the two splits handed to the trainer out of the processed dataset dict.
train_dataset, eval_dataset = (
    tokenized_datasets[split] for split in ("train", "validation")
)
def preprocess_logits_for_metrics(logits, labels):
    """Reduce model logits to predicted ids before metrics are computed.

    If `logits` is a tuple, only its first element is used. `labels` is accepted
    for interface compatibility and is not read. Returns the argmax over the
    last dimension.
    """
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)

# Accuracy metric used by `compute_metrics` below.
metric = load_metric("accuracy")


def compute_metrics(eval_preds):
    """Compute accuracy over the positions whose label is not -100.

    `eval_preds` unpacks into `(predictions, labels)`. Both are flattened, and
    positions where the label equals -100 are excluded before scoring.
    """
    predictions, references = eval_preds
    flat_references = references.reshape(-1)
    flat_predictions = predictions.reshape(-1)
    scored_positions = flat_references != -100
    return metric.compute(
        predictions=flat_predictions[scored_positions],
        references=flat_references[scored_positions],
    )

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [14]:
# Collator for masked-language-model batches; 0.15 is the masking probability passed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [24]:
# Collect everything the trainer needs in one place, then build it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    # Metric hooks: logits are reduced to predicted ids before accuracy is computed.
    "compute_metrics": compute_metrics,
    "preprocess_logits_for_metrics": preprocess_logits_for_metrics,
}
trainer = AdapterTrainer(**trainer_kwargs)


In [None]:
# Run training and save the resulting model through the trainer.
train_result = trainer.train()
trainer.save_model()

# Record the training-set size alongside the metrics reported by the trainer.
train_result.metrics["train_samples"] = len(train_dataset)
metrics = train_result.metrics

# Save the training metrics and the trainer state.
trainer.save_metrics("train", metrics)
trainer.save_state()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5151
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6440


Step,Training Loss


In [None]:
# Evaluate with the trainer and record the evaluation-set size.
metrics = trainer.evaluate()
metrics["eval_samples"] = len(eval_dataset)

# Perplexity is exp(eval loss). `math` is provided by the notebook's import cell;
# without that import this cell raised NameError on a fresh kernel.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    # The loss is too large for exp() to be represented as a float.
    perplexity = float("inf")
metrics["perplexity"] = perplexity

trainer.save_metrics("eval", metrics)

In [None]:
# `save_adapter(save_directory, adapter_name)` takes the name of an adapter that is
# registered on the model. The adapter was added as `task_name`, so that is what must
# be passed; the perplexity-tagged label becomes part of the save directory instead.
# NOTE(review): assumes the `<language_code>_output` directory already exists (it is
# the trainer's output directory) — confirm the adapter loader creates only the leaf.
adapter_save_dir = f"{language_code}_output/lm_{language_code}_{perplexity:.3}"
model.save_adapter(adapter_save_dir, task_name)