In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())	

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.


In [2]:
class Model:
    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        self.model = model

    def to_pipeline(self):
        return pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device=0)

    def get_collator(self):
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=True, mlm_probability=0.15)

In [3]:
# load models
models = {
    "de" : Model("uklfr/gottbert-base"),
    "nl" : Model("pdelobelle/robbert-v2-dutch-base"),
}

for language, model in tqdm(models.items()):
    # get model & tokenizer from huggingface
    model.set_tokenizer(AutoTokenizer.from_pretrained(model.name))
    model.set_model(AutoModelForMaskedLM.from_pretrained(model.name).to("cuda"))


100%|██████████| 2/2 [00:11<00:00,  5.60s/it]


In [4]:
# create pipelines

pipelines = {k: v.to_pipeline() for k, v in models.items()}

In [5]:
%%capture
# load datasets

datasets = {
    "de" : load_dataset("xquad", "xquad.de"),
    "es" : load_dataset("xquad", "xquad.es"),
    "en" : load_dataset("xquad", "xquad.en"),
}

In [21]:
results = {}
# tokenize each dataset for each model
for model_language, pipeline in tqdm(pipelines.items()):

    for dataset_language, dataset in datasets.items():
        # raw data
        raw_data = dataset["validation"]["context"]
        # split into sentences
        sentences = [sentence.split(" ") for text in raw_data for sentence in text.split(".")]       
        # mask one word in each sentence
        for sentence in sentences:
            # get random index
            index = random.randint(0, len(sentence) - 1)
            # mask word
            sentence[index] = pipeline.tokenizer.mask_token

        sentences = [" ".join(sentence) for sentence in sentences]
        
        predictions = pipeline(sentences)

        # get average score
        scores = [prediction[0]["score"] for prediction in predictions]
        average_score = np.mean(scores)
        results[(model_language, dataset_language)] = average_score


100%|██████████| 2/2 [04:50<00:00, 145.01s/it]


Unnamed: 0,key,value


In [22]:
results

{('de', 'de'): 0.4575018730614422,
 ('de', 'es'): 0.36086595123453896,
 ('de', 'en'): 0.40376126086599906,
 ('nl', 'de'): 0.24470338059054128,
 ('nl', 'es'): 0.2522425835616527,
 ('nl', 'en'): 0.30160213439215805}

In [28]:
# %%capture

def tokenize_function(examples):
    tokenizer = models["de"].tokenizer
    return tokenizer(examples["text"]["validation"]["context"], max_length = 384, padding="max_length", truncation=True)

tokenized_datasets = {k: v.map(tokenize_function, batched=True) for k, v in datasets.items()}

# tokenized_datasets = test_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

  0%|          | 0/2 [00:00<?, ?ba/s]


KeyError: 'text'

In [16]:
tokenized_datasets.keys()

dict_keys(['test', 'train', 'validation'])

In [17]:
trainer = Trainer(
    model=models["de"].model,
    args=TrainingArguments(
        f"{models['de'].name}-wikitext",
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        weight_decay=0.01),
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
trainer.train()

***** Running training *****
  Num examples = 1801350
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 675507


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
for model_language, pipeline in pipelines.items():
    for test_language, dataset in datasets.items():
        scores = []

        # randomly mask 10% of tokens in data
        masked_data = ["".join([token if np.random.rand() > 0.1 else pipeline.tokenizer.mask_token for token in context.split(" ")]) for context in dataset["validation"]["context"]]
        tokenized_data = [pipeline.tokenizer(token_list, return_tensors="pt", padding=True) for token_list in masked_data]
        print(f"tokenized data: {tokenized_data[0]}")
        res = pipeline(tokenized_data[0])
        
        for elem in res:
            scores.append(elem["score"])
        print(f"model language: {model_language}, test language: {test_language}, average score: {np.average(scores)}, min score: {np.min(scores)}, max score: {np.max(scores)}")
            