In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())	

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.


In [2]:
class Model:
    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        self.model = model

    def to_pipeline(self):
        return pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device=0, top_k = 1)

    def get_collator(self):
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=True, mlm_probability=0.15)

In [3]:
# load models
models = {
    "de" : Model("uklfr/gottbert-base"),
    # "nl" : Model("pdelobelle/robbert-v2-dutch-base"),
    # "es" : Model("bertin-project/bertin-roberta-base-spanish"),
    # "tr" : Model("dbmdz/bert-base-turkish-cased")
}

for language, model in tqdm(models.items()):
    # get model & tokenizer from huggingface
    model.set_tokenizer(AutoTokenizer.from_pretrained(model.name))
    model.set_model(AutoModelForMaskedLM.from_pretrained(model.name).to("cuda"))


100%|██████████| 1/1 [00:06<00:00,  6.18s/it]


In [4]:
# create pipelines

pipelines = {k: v.to_pipeline() for k, v in models.items()}

In [5]:
%%capture
# load datasets

datasets = {
    "de" : load_dataset("xquad", "xquad.de"),
    "es" : load_dataset("xquad", "xquad.es"),
    # "tr" : load_dataset("xquad", "xquad.tr"),
    # "ro" : load_dataset("xquad", "xquad.ro"),
    "en" : load_dataset("xquad", "xquad.en"),
}

In [6]:
perplexity = load("perplexity", module_type="metric")

In [34]:
results = {}
# tokenize each dataset for each model
for model_language, pipeline in tqdm(pipelines.items()):

    for dataset_language, dataset in datasets.items():
        # raw data
        raw_data = dataset["validation"]["context"]
        # split into sentences
        # TODO: permorm additional data cleaning
        sentences = [sentence.split(" ") for text in raw_data for sentence in text.split(".")]
        masked_words = []     
        # mask one word in each sentence
        for sentence in sentences:
            # get random index
            index = random.randint(0, len(sentence) - 1)
            # mask word
            masked_words.append(sentence[index])
            sentence[index] = pipeline.tokenizer.mask_token

        sentences = [" ".join(sentence) for sentence in sentences]
        
        try:
            predictions = pipeline(sentences)
        except RuntimeError:
            print(f"Error with {model_language} and {dataset_language}")
            continue

        # get average score
        softmax_scores = [prediction[0]["score"] for prediction in predictions]
        predicted_sentences = [prediction[0]["sequence"] for prediction in predictions]
        prediced_words = [prediction[0]["token_str"] for prediction in predictions]
        # get vector for masked and predicted word
        masked_index = [pipeline.tokenizer.encode(token, add_special_tokens=False) for token in masked_words]
        predict_index = [pipeline.tokenizer.encode(token, add_special_tokens=False) for token in prediced_words]
        # calculate cosine similarity between predicted words and masked words
        cosine_similarities = []
        for i in range(len(masked_index)):
            if masked_index[i] == [] or predict_index[i] == []:
                cosine_similarities.append(np.array([0]))
                continue
            # get vector for masked word
            masked_vector = pipeline.model.roberta.embeddings.word_embeddings(torch.tensor(masked_index[i]).to("cuda"))
            # get vector for predicted word
            predict_vector = pipeline.model.roberta.embeddings.word_embeddings(torch.tensor(predict_index[i]).to("cuda"))

            # calculate cosine similarity using torch
            cos_sim = torch.nn.functional.cosine_similarity(masked_vector, predict_vector, dim=1).cpu().detach().numpy()
            cosine_similarities.append(cos_sim)

        # mean cosine similarity
        # flatten cos sim
        cosine_similarities = [abs(item) for sublist in cosine_similarities for item in sublist]
        mean_cosine_similarity = np.mean(cosine_similarities)

        average_score = np.mean(softmax_scores)
        median_score = np.median(softmax_scores)
        print(f"{model_language} - {dataset_language}: {average_score}, {mean_cosine_similarity}")
        results[(model_language, dataset_language)] = [average_score, median_score, mean_cosine_similarity]


  0%|          | 0/1 [00:00<?, ?it/s]

de - de: 0.4654309966866726, 0.16268255323756378
de - es: 0.3568663840078348, 0.08487572332219087


100%|██████████| 1/1 [02:29<00:00, 149.72s/it]

de - en: 0.3958673913783324, 0.09840186539969191





In [None]:
idx = random.choice(range(0, len(sentences)))
masked_words[idx], sentences[idx], predicted_sentences[idx], softmax_scores[idx]

In [None]:
to_df = []
for (model_language, dataset_language), score in results.items():
    to_df.append({
        "model_language" : model_language,
        "dataset_language" : dataset_language,
        "avg_score" : score[0],
        "median_score" : score[1],
        "perplexity" : score[2]
    })

df = pd.DataFrame(to_df)

In [None]:
df.head()

In [None]:
from lets_plot import *
LetsPlot.setup_html()

ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='perplexity'), data=df) + scale_fill_gradient(low='orange', high='blue') + ggsize(500, 500)