In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())	

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.


In [2]:
class Model:
    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        self.model = model

    def to_pipeline(self):
        return pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device=0)

    def get_collator(self):
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=True, mlm_probability=0.15)

In [3]:
# load models
models = {
    "de" : Model("uklfr/gottbert-base"),
    "nl" : Model("pdelobelle/robbert-v2-dutch-base"),
    "en" : Model("bert-base"),
    "es" : Model("bertin-project/bertin-roberta-base-spanish"),
    "tr" : Model("dbmdz/bert-base-turkish-cased")
}

for language, model in tqdm(models.items()):
    # get model & tokenizer from huggingface
    model.set_tokenizer(AutoTokenizer.from_pretrained(model.name))
    model.set_model(AutoModelForMaskedLM.from_pretrained(model.name).to("cuda"))


Downloading: 100%|██████████| 481/481 [00:00<00:00, 204kB/s]
Downloading: 100%|██████████| 878k/878k [00:00<00:00, 1.76MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 1.09MB/s]
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 2.61MB/s]
Downloading: 100%|██████████| 478M/478M [00:06<00:00, 72.7MB/s]
Downloading: 100%|██████████| 1.09k/1.09k [00:00<00:00, 656kB/s]
Downloading: 100%|██████████| 831k/831k [00:00<00:00, 1.98MB/s]
Downloading: 100%|██████████| 497k/497k [00:00<00:00, 1.20MB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.72MB/s]
Downloading: 100%|██████████| 772/772 [00:00<00:00, 407kB/s]
Downloading: 100%|██████████| 674/674 [00:00<00:00, 320kB/s]
Downloading: 100%|██████████| 476M/476M [00:04<00:00, 112MB/s]
Downloading: 100%|██████████| 60.0/60.0 [00:00<00:00, 35.1kB/s]
Downloading: 100%|██████████| 385/385 [00:00<00:00, 127kB/s]
Downloading: 100%|██████████| 245k/245k [00:00<00:00, 747kB/s]
Downloading: 100%|██████████| 424M/424M [00:04<00

In [4]:
# create pipelines

pipelines = {k: v.to_pipeline() for k, v in models.items()}

In [9]:
%%capture
# load datasets

datasets = {
    "de" : load_dataset("xquad", "xquad.de"),
    "es" : load_dataset("xquad", "xquad.es"),
    "en" : load_dataset("xquad", "xquad.en"),
    "tr" : load_dataset("xquad", "xquad.tr"),
    "ro" : load_dataset("xquad", "xquad.ro"),
}

In [10]:
results = {}
# tokenize each dataset for each model
for model_language, pipeline in tqdm(pipelines.items()):

    for dataset_language, dataset in datasets.items():
        # raw data
        raw_data = dataset["validation"]["context"]
        # split into sentences
        # TODO: split at all sentence separators (not just .)
        sentences = [sentence.split(" ") for text in raw_data for sentence in text.split(".")]       
        # mask one word in each sentence
        for sentence in sentences:
            # get random index
            index = random.randint(0, len(sentence) - 1)
            # mask word
            sentence[index] = pipeline.tokenizer.mask_token

        sentences = [" ".join(sentence) for sentence in sentences]
        
        try:
            predictions = pipeline(sentences)
        except RuntimeError:
            print(f"Error with {model_language} and {dataset_language}")
            continue

        # get average score
        scores = [prediction[0]["score"] for prediction in predictions]
        average_score = np.mean(scores)
        print(f"{model_language} - {dataset_language}: {average_score}")
        results[(model_language, dataset_language)] = average_score


  0%|          | 0/5 [00:00<?, ?it/s]

de - de: 0.46071538940542683
de - es: 0.3567187782143202




de - en: 0.39938523550053423
de - tr: 0.28283499372129534


 20%|██        | 1/5 [04:25<17:42, 265.66s/it]

de - ro: 0.3047445067640461
nl - de: 0.25299656951255545
nl - es: 0.2479078366528343
nl - en: 0.3053643304479345
nl - tr: 0.20374616370099918


 40%|████      | 2/5 [08:39<12:55, 258.63s/it]

nl - ro: 0.19068813664510717
en - de: 0.5488932316252907
en - es: 0.609008788265456
en - en: 0.6592554890597058
en - tr: 0.4299370428742694


 60%|██████    | 3/5 [13:02<08:41, 260.55s/it]

en - ro: 0.5608771198478245
es - de: 0.26342204932808266
es - es: 0.48206794604227443
es - en: 0.35937982430666865
es - tr: 0.21336857984569754


 80%|████████  | 4/5 [17:29<04:23, 263.40s/it]

es - ro: 0.21655181313657607
tr - de: 0.3109657022356408
tr - es: 0.31926372437494843
tr - en: 0.37528451141533153
tr - tr: 0.4052198801610779


100%|██████████| 5/5 [21:24<00:00, 256.84s/it]

tr - ro: 0.2733037169898011





In [12]:
to_df = []
for (model_language, dataset_language), score in results.items():
    to_df.append({
        "model_language" : model_language,
        "dataset_language" : dataset_language,
        "score" : score
    })

df = pd.DataFrame(to_df)

In [13]:
df.head()

Unnamed: 0,model_language,dataset_language,score
0,de,de,0.460715
1,de,es,0.356719
2,de,en,0.399385
3,de,tr,0.282835
4,de,ro,0.304745


In [16]:
from lets_plot import *
LetsPlot.setup_html()

ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='score'), data=df) + scale_fill_gradient(low='white', high='blue') + ggsize(500, 500)