In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering
from transformers import pipeline
from tqdm import tqdm
import json

In [2]:
class Model:
    """Bundle a checkpoint name with its (optionally loaded) tokenizer and model.

    Attributes are plain, mutable fields; `tokenizer` and `model` start as
    ``None`` unless supplied and can be attached later via the setters.
    """

    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        """Store the checkpoint name and any already-loaded tokenizer/model."""
        self.name, self.tokenizer, self.model = name, tokenizer, model

    def __repr__(self):
        """Return a short representation that shows only the checkpoint name."""
        return "Model(name={})".format(self.name)

    def set_tokenizer(self, tokenizer: object):
        """Attach (or replace) the tokenizer object."""
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        """Attach (or replace) the model object."""
        self.model = model

In [6]:
# Checkpoints to evaluate, keyed by language code.
# NOTE(review): the captured load log reports `qa_outputs.*` as newly initialized
# for uklfr/gottbert-base, i.e. the QA head is untrained; confirm whether a
# QA-fine-tuned checkpoint was intended before interpreting any scores.
models = {
    "de": Model("uklfr/gottbert-base"),
    "nl": Model("pdelobelle/robbert-v2-dutch-base"),
}

# Fetch tokenizer and QA model for every entry from the Hugging Face hub.
for language, model in tqdm(models.items()):
    checkpoint = model.name
    model.set_tokenizer(AutoTokenizer.from_pretrained(checkpoint))
    model.set_model(RobertaForQuestionAnswering.from_pretrained(checkpoint))


  0%|          | 0/2 [00:00<?, ?it/s]Some weights of the model checkpoint at uklfr/gottbert-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at uklfr/gottbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
Yo

In [18]:
%%capture

from datasets import load_dataset

# XQuAD configurations to load, keyed by language code ("xquad.<code>").
datasets = {
    code: load_dataset("xquad", f"xquad.{code}")
    for code in ("de", "es", "en")
}


In [21]:
# Display the German XQuAD entry to inspect its splits, features and row count.
datasets["de"]

DatasetDict({
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1190
    })
})

In [9]:
# One question-answering pipeline per loaded model, keyed by the same language
# codes as `models`.
pipelines = {
    code: pipeline("question-answering", model=entry.model, tokenizer=entry.tokenizer)
    for code, entry in models.items()
}

In [25]:
# Smoke test: run every QA pipeline on the first validation example of every
# loaded XQuAD dataset. Output is model-major: for each entry of `pipelines`
# (in insertion order), one result line per entry of `datasets`.
#
# The loop variables deliberately avoid the name `pipeline` (the transformers
# factory imported at the top; rebinding it would break a re-run of the cell
# that builds `pipelines`) and use distinct names for the model language and
# the data language so the inner loop no longer clobbers the outer variable.
for model_language, qa_pipeline in pipelines.items():
    for data_language, dataset in datasets.items():
        # Index the row first so a single example is fetched instead of
        # materializing the whole "question" and "context" columns.
        example = dataset["validation"][0]
        res = qa_pipeline({"question": example["question"], "context": example["context"]})
        print(res)
            

{'score': 2.438506817270536e-05, 'start': 1063, 'end': 1075, 'answer': 'Karrierehoch'}
{'score': 2.9517095754272304e-05, 'start': 1283, 'end': 1298, 'answer': 'también jugador'}
{'score': 3.348920654389076e-05, 'start': 414, 'end': 420, 'answer': 'bowler'}
{'score': 5.6197415688075125e-05, 'start': 903, 'end': 931, 'answer': 'Tackles anführte (118), zwei'}
{'score': 2.9597707907669246e-05, 'start': 1283, 'end': 1290, 'answer': 'también'}
{'score': 2.5089304472203366e-05, 'start': 421, 'end': 445, 'answer': "who was the NFL's active"}
