In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random
import math

# Pick the compute device once. `device` is always bound, so later cells can
# reference it on CPU-only machines instead of hitting a NameError.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

In [None]:
class Model:
    """Bundle a checkpoint name with the tokenizer and model loaded for it.

    The tokenizer and model are optional at construction time and can be
    attached later through `set_tokenizer` / `set_model`.
    """

    # Checkpoint identifier passed to `from_pretrained` by the loading cell.
    name: str
    # Tokenizer object handed to the pipeline / data collator.
    tokenizer: object
    # Model object handed to the pipeline.
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        """Store the checkpoint name and, optionally, already-loaded objects."""
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        """Return a short representation showing only the checkpoint name."""
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        """Attach (or replace) the tokenizer."""
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        """Attach (or replace) the model."""
        self.model = model

    def to_pipeline(self, device=0, top_k=1):
        """Build a fill-mask pipeline from the stored model and tokenizer.

        Args:
            device: Device argument forwarded to `pipeline`. Defaults to 0
                (the previous hard-coded value) so existing callers are
                unaffected; pass another value to run elsewhere.
            top_k: Number of candidates the pipeline returns per mask.
                Defaults to 1, the previous hard-coded value.

        Returns:
            Whatever `pipeline("fill-mask", ...)` returns.
        """
        return pipeline(
            "fill-mask",
            model=self.model,
            tokenizer=self.tokenizer,
            device=device,
            top_k=top_k,
        )

    def get_collator(self, mlm_probability=0.15):
        """Build a masked-language-modeling data collator for the tokenizer.

        Args:
            mlm_probability: Masking probability forwarded to the collator.
                Defaults to 0.15, the previous hard-coded value.
        """
        return DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=mlm_probability,
        )

In [None]:
class Perplexity:
	"""Running perplexity: exp of the mean of the losses passed to `score`."""

	def __init__(self):
		self.sum = 0
		self.number = 0
		self.str = 'perplexity'
		self.metric_scores = {}

	# cross entropy loss
	def score(self, loss):
		"""Accumulate one loss value and count it as one scored item."""
		self.sum += loss
		# should be number of sentences
		self.number += 1

	def reset(self):
		"""Clear the accumulated loss and the item count."""
		self.sum = 0
		self.number = 0

	def get_score(self):
		"""Return the metric dictionary for the losses accumulated so far.

		Returns:
			dict with key 'perplexity' holding exp(sum / number), or 0.0 when
			nothing has been scored yet, and key 'sum' holding the metric
			name stored in `self.str`.
		"""
		if self.number == 0:
			# Nothing scored: report 0.0 instead of dividing by zero.
			self.metric_scores[self.str] = 0.0
		else:
			self.metric_scores[self.str] = math.exp(self.sum / self.number)
		self.metric_scores["sum"] = self.str
		return self.metric_scores

In [None]:
# Registry of the checkpoints under evaluation, keyed by language code.
models = dict(
    de=Model("uklfr/gottbert-base"),
    nl=Model("pdelobelle/robbert-v2-dutch-base"),
    se=Model("birgermoell/roberta-swedish"),
    dk=Model("DDSC/roberta-base-danish"),
    # es=Model("bertin-project/bertin-roberta-base-spanish"),
)

# Fetch tokenizer and masked-LM weights for every registered checkpoint and
# attach them to the corresponding Model entry.
for language, model in tqdm(models.items()):
    checkpoint = model.name
    model.set_tokenizer(AutoTokenizer.from_pretrained(checkpoint, model_max_length=512))
    masked_lm = AutoModelForMaskedLM.from_pretrained(checkpoint)
    model.set_model(masked_lm.to("cuda"))


In [None]:
# One fill-mask pipeline per language code, built from the loaded models.
pipelines = dict(
    (language_code, language_model.to_pipeline())
    for language_code, language_model in models.items()
)

In [None]:
%%capture
# load datasets

def load_dataset_local(dataset_name, data_dir="wiki_data", encoding="utf-8"):
    """Read `<data_dir>/<dataset_name>.txt` and return its lines.

    Args:
        dataset_name: File stem of the text file (without the `.txt` suffix).
        data_dir: Directory containing the file; defaults to "wiki_data",
            the location that was previously hard-coded.
        encoding: Text encoding used to decode the file. Set explicitly so
            non-ASCII text is decoded the same way on every platform
            instead of depending on the locale default.

    Returns:
        list[str]: The file content split with `str.splitlines()` (line
        terminators removed).
    """
    with open(f"{data_dir}/{dataset_name}.txt", encoding=encoding) as f:
        return f.read().splitlines()

# File stem of the sentence dump for each language code (insertion order is
# the load order).
wiki_sentence_files = {
    "nl": "nlwiki_sentences",
    "dk": "dawiki_sentences",
    "se": "svwiki_sentences",
    "de": "dewiki_sentences",
    # "es": "eswiki_sentences",
}

# Language code -> list of lines read from the matching local text file.
datasets = {
    language_code: load_dataset_local(file_stem)
    for language_code, file_stem in wiki_sentence_files.items()
}

In [None]:
# Filled by the evaluation loop: "<model>-<dataset>" -> [masked words, predicted words].
word_pairs = dict()

In [None]:
# Load the "perplexity" metric module through `load` (imported from `evaluate`);
# the evaluation loop calls `perplexity.compute(...)` on it.
perplexity = load("perplexity", module_type="metric")

In [None]:
# Seed for the word-masking RNG. A fresh generator with this seed is created
# for every (model, dataset) pair, so every model is evaluated on the same
# masked positions of a dataset and reruns reproduce the same numbers.
MASK_SEED = 0


def _pad_to_equal_length(first_ids, second_ids, filler_ids):
    """Extend the shorter id list with cycled `filler_ids` so both lengths match."""
    gap = abs(len(first_ids) - len(second_ids))
    if gap == 0:
        return first_ids, second_ids
    # Repeat the filler often enough to cover the gap, then cut to size.
    filler = (filler_ids * (gap // len(filler_ids) + 1))[:gap]
    if len(first_ids) < len(second_ids):
        return first_ids + filler, second_ids
    return first_ids, second_ids + filler


results = {}
# Evaluate every model's fill-mask pipeline on every language's dataset.
# The loop variable is deliberately NOT called `pipeline`: that name is the
# transformers factory used by Model.to_pipeline and must not be shadowed.
for model_language, fill_mask in tqdm(pipelines.items()):
    mask_token = fill_mask.tokenizer.mask_token

    for dataset_language, dataset in datasets.items():
        rng = random.Random(MASK_SEED)

        # Split every line into "."-delimited fragments and mask one random
        # word in each. Fragments without any word (e.g. the empty string
        # after a trailing ".") are skipped so no empty "word" gets masked.
        # TODO: perform additional data cleaning
        masked_words = []
        sentences = []
        for text in dataset:
            for fragment in text.split("."):
                words = fragment.split()
                if not words:
                    continue
                index = rng.randrange(len(words))
                masked_words.append(words[index])
                words[index] = mask_token
                sentences.append(" ".join(words))

        if not sentences:
            print(f"No sentences for {dataset_language}")
            continue

        try:
            predictions = fill_mask(sentences)
        except RuntimeError as error:
            print(f"Error with {model_language} and {dataset_language}: {error}")
            continue

        # Top-1 prediction for each masked sentence.
        softmax_scores = [prediction[0]["score"] for prediction in predictions]
        predicted_sentences = [prediction[0]["sequence"] for prediction in predictions]
        predicted_words = [prediction[0]["token_str"] for prediction in predictions]

        # Masked and predicted words are compared in the embedding space of
        # the model that belongs to the dataset's own language.
        reference = pipelines[dataset_language]
        reference_tokenizer = reference.tokenizer
        embedding_layer = reference.model.roberta.embeddings.word_embeddings
        # Build index tensors on whatever device the embedding weights live on.
        embedding_device = embedding_layer.weight.device

        masked_index = [reference_tokenizer.encode(token, add_special_tokens=True) for token in masked_words]
        predict_index = [reference_tokenizer.encode(token, add_special_tokens=True) for token in predicted_words]
        # Filler ids used to pad the shorter of the two id sequences
        # (the encoded pad token, including the special tokens).
        unknown_index = reference_tokenizer.encode(reference_tokenizer.pad_token, add_special_tokens=True)
        word_pairs[f"{model_language}-{dataset_language}"] = [masked_words, predicted_words]

        # Mean per-position cosine similarity between the embeddings of the
        # masked word and of the predicted word.
        cosine_similarities = []
        with torch.no_grad():  # inference only; no autograd graph needed
            for masked_ids, predicted_ids in zip(masked_index, predict_index):
                masked_ids, predicted_ids = _pad_to_equal_length(masked_ids, predicted_ids, unknown_index)
                if not masked_ids or not predicted_ids:
                    cosine_similarities.append(0.0)
                    continue
                masked_vector = embedding_layer(torch.tensor(masked_ids, device=embedding_device))
                predict_vector = embedding_layer(torch.tensor(predicted_ids, device=embedding_device))
                cos_sim = torch.nn.functional.cosine_similarity(masked_vector, predict_vector, dim=1)
                cosine_similarities.append(cos_sim.mean().item())

        # calculate perplexity
        # NOTE(review): `references` is presumably not consumed by the
        # perplexity metric - confirm against the evaluate documentation.
        perplexity_scores = perplexity.compute(predictions=predicted_words, references=masked_words, model_id=models[dataset_language].name)
        # NOTE(review): this divides an already-averaged value by the sample
        # count; kept as in the original computation - confirm it is intended.
        perplexity_score = np.mean(perplexity_scores["perplexities"]) / len(masked_words)

        mean_cosine_similarity = np.mean(cosine_similarities)
        average_score = np.mean(softmax_scores)
        median_score = np.median(softmax_scores)
        print(f"{model_language} - {dataset_language}: {average_score}, {mean_cosine_similarity}, {perplexity_score}")
        results[(model_language, dataset_language)] = [average_score, median_score, mean_cosine_similarity, perplexity_score]


In [None]:
# Spot-check one random example from the most recent evaluation pass:
# (masked word, masked sentence, filled-in sentence, top-1 score).
idx = random.randrange(len(sentences))
(
    masked_words[idx],
    sentences[idx],
    predicted_sentences[idx],
    softmax_scores[idx],
)

In [None]:
import json

# Persist the collected masked/predicted word lists as JSON.
serialized_pairs = json.dumps(word_pairs)
with open("results.json", "w") as handle:
    handle.write(serialized_pairs)

In [None]:
# Flatten the (model, dataset) -> [scores] mapping into one record per pair.
to_df = [
    {
        "model_language": pair[0],
        "dataset_language": pair[1],
        "avg_score": score[0],
        "median_score": score[1],
        "cos_simil": score[2],
        "perplexity": score[3],
    }
    for pair, score in results.items()
]

df = pd.DataFrame(to_df)

In [None]:
# Rank the models within each dataset language by cosine similarity
# (ascending), then order rows by dataset and model language.
cos_simil_rank = df.groupby("dataset_language")["cos_simil"].rank(ascending=True)
df = df.assign(rank=cos_simil_rank).sort_values(by=["dataset_language", "model_language"])

In [None]:
# Display the summary table (one row per model/dataset pair, including `rank`).
df

In [None]:
from lets_plot import *
LetsPlot.setup_html()


def _language_heatmap(fill_column, title, high_color):
    """Build a 500x500 model-vs-dataset tile plot of `df` coloured by `fill_column`."""
    tiles = geom_tile(aes(x='model_language', y='dataset_language', fill=fill_column), data=df)
    gradient = scale_fill_gradient(low='white', high=high_color)
    return (
        ggplot()
        + tiles
        + gradient
        + ggsize(500, 500)
        + ggtitle(title)
        + scale_x_discrete_reversed()
        + scale_y_discrete()
    )


# Two panels side by side: cosine-similarity rank (left), mean top-1 score (right).
bunch = GGBunch()
bunch.add_plot(_language_heatmap('rank', "Cosine Similarity", 'blue'), 0, 0)
bunch.add_plot(_language_heatmap('avg_score', "Average Score", 'red'), 500, 0)

bunch.show()

In [None]:
# Export the two-panel figure held in `bunch` under the file name "results.html".
ggsave(bunch, "results.html")