In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random
import math

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())	

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.


In [2]:
class Model:
    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        self.model = model

    def to_pipeline(self):
        return pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device=0, top_k = 1)

    def get_collator(self):
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=True, mlm_probability=0.15)

In [3]:
class Perplexity:
	def __init__(self):
		self.sum = 0
		self.number =0
		self.str = 'perplexity'
		self.metric_scores = {}

	# cross entropy loss
	def score(self, loss):
		self.sum += loss
		# should be number of sentences
		self.number += 1

	def reset(self):
		self.sum = 0
		self.number = 0

	def get_score(self):
		if self.sum ==0:
			self.metric_scores[self.str] = 0.0
		self.metric_scores[self.str] = math.exp(self.sum / self.number)
		self.metric_scores["sum"] = self.str
		return self.metric_scores

In [4]:
# load models
models = {
    "de" : Model("uklfr/gottbert-base"),
    "nl" : Model("pdelobelle/robbert-v2-dutch-base"),
    "es" : Model("bertin-project/bertin-roberta-base-spanish"),
    "se" : Model("birgermoell/roberta-swedish"),
    "dk" : Model("DDSC/roberta-base-danish"),
}

for language, model in tqdm(models.items()):
    # get model & tokenizer from huggingface
    model.set_tokenizer(AutoTokenizer.from_pretrained(model.name, model_max_length=512))
    model.set_model(AutoModelForMaskedLM.from_pretrained(model.name).to("cuda"))


100%|██████████| 5/5 [00:25<00:00,  5.01s/it]


In [5]:
# create pipelines

pipelines = {k: v.to_pipeline() for k, v in models.items()}

In [6]:
%%capture
# load datasets

def load_dataset_local(dataset_name):
    with open(f"wiki_data/{dataset_name}.txt") as f:
        return f.read().splitlines()

datasets = {
    "dk" : load_dataset_local("dawiki_sentences"),
    "nl" : load_dataset_local("nlwiki_sentences"),
}

In [15]:
perplexity = load("perplexity", module_type="metric")

In [18]:
results = {}
# tokenize each dataset for each model
for model_language, pipeline in tqdm(pipelines.items()):

    for dataset_language, dataset in datasets.items():
        # raw data
        raw_data = dataset
        # split into sentences
        # TODO: permorm additional data cleaning
        sentences = [sentence.split(" ") for text in raw_data for sentence in text.split(".")]
        masked_words = []     
        # mask one word in each sentence
        for sentence in sentences:
            # get random index
            index = random.randint(0, len(sentence) - 1)
            # mask word
            masked_words.append(sentence[index])
            sentence[index] = pipeline.tokenizer.mask_token

        sentences = [" ".join(sentence) for sentence in sentences]
        
        try:
            predictions = pipeline(sentences)
        except RuntimeError:
            print(f"Error with {model_language} and {dataset_language}")
            continue

        # get average score
        softmax_scores = [prediction[0]["score"] for prediction in predictions]
        predicted_sentences = [prediction[0]["sequence"] for prediction in predictions]
        predicted_words = [prediction[0]["token_str"] for prediction in predictions]
        # get vector for masked and predicted word
        masked_index = [pipelines[dataset_language].tokenizer.encode(token, add_special_tokens=False) for token in masked_words]
        predict_index = [pipelines[dataset_language].tokenizer.encode(token, add_special_tokens=False) for token in predicted_words]
        # calculate cosine similarity between predicted words and masked words
        cosine_similarities = []
        for i in range(len(masked_index)):
            if masked_index[i] == [] or predict_index[i] == []:
                cosine_similarities.append(np.array([0]))
                continue
            # get vector for masked word
            masked_vector = pipelines[dataset_language].model.roberta.embeddings.word_embeddings(torch.tensor(masked_index[i]).to("cuda"))
            # get vector for predicted word
            predict_vector = pipelines[dataset_language].model.roberta.embeddings.word_embeddings(torch.tensor(predict_index[i]).to("cuda"))
            # fill the smaller tensor with zeros so that masked_vector and predict_vector have the same shape
            if masked_vector.shape[0] < predict_vector.shape[0]:
                masked_vector = torch.nn.functional.pad(masked_vector, (0, 0, 0, predict_vector.shape[0] - masked_vector.shape[0]))
            elif masked_vector.shape[0] > predict_vector.shape[0]:
                predict_vector = torch.nn.functional.pad(predict_vector, (0, 0, 0, masked_vector.shape[0] - predict_vector.shape[0]))

            # calculate cosine similarity using torch
            cos_sim = torch.nn.functional.cosine_similarity(masked_vector, predict_vector, dim=1).cpu().detach().numpy()
            cosine_similarities.append(cos_sim)

        # calculate perplexity
        perplexity_scores = perplexity.compute(predictions=predicted_words, references=masked_words, model_id=models[dataset_language].name)
        perplexity_score = np.mean(perplexity_scores["perplexities"]) / len(masked_words)
        # mean cosine similarity
        # flatten cos sim
        cosine_similarities = [item for sublist in cosine_similarities for item in sublist]
        mean_cosine_similarity = np.mean(cosine_similarities)

        average_score = np.mean(softmax_scores)
        median_score = np.median(softmax_scores)
        print(f"{model_language} - {dataset_language}: {average_score}, {mean_cosine_similarity}, {perplexity_score}")
        results[(model_language, dataset_language)] = [average_score, median_score, mean_cosine_similarity, perplexity_score]


  0%|          | 0/5 [00:00<?, ?it/s]If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:07<00:00, 100.54it/s]


de - dk: 0.2492884303535838, 0.1815131002795862, 12433.975734226316


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:05<00:00, 131.07it/s]
 20%|██        | 1/5 [02:41<10:46, 161.56s/it]

de - nl: 0.28904147550884396, 0.027586013242075533, 123330429.02641025


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:05<00:00, 139.98it/s]


nl - dk: 0.17430659572121546, 0.17302126693456454, 729.0852189691547


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:02<00:00, 267.83it/s]
 40%|████      | 2/5 [05:02<07:28, 149.57s/it]

nl - nl: 0.5049835043971108, 0.12403929017687482, 341670703.0492279


Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


Error with es and dk


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:04<00:00, 157.21it/s]
 60%|██████    | 3/5 [07:07<04:36, 138.24s/it]

es - nl: 0.25532941688059496, 0.029381318419371947, 65217211.173626855


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:03<00:00, 182.88it/s]


se - dk: 0.31381282833459184, 0.21487827390014422, 883890.0907943965


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:04<00:00, 148.60it/s]
 80%|████████  | 4/5 [09:35<02:22, 142.02s/it]

se - nl: 0.25506704479875, 0.02644637501273703, 102829901.5122894


 80%|████████  | 4/5 [09:40<02:25, 145.10s/it]


KeyboardInterrupt: 

In [20]:
masked_vector, predict_vector

(tensor([[ 2.0848e-02, -1.3256e-01,  7.0148e-02, -6.4970e-02,  3.9045e-02,
          -1.6804e-01,  4.8733e-02, -1.3285e-02,  9.7881e-02, -1.2284e-01,
           1.2004e-01, -1.3028e-01, -4.7270e-02,  7.0994e-03, -1.3049e-01,
           2.9180e-03, -2.1014e-03,  5.3837e-03, -8.0522e-02, -9.2563e-03,
          -2.4973e-03,  2.6751e-02, -4.8266e-02, -2.9054e-02, -3.5251e-02,
           3.6251e-02,  1.2799e-02,  5.3483e-02, -7.4835e-03,  9.3621e-02,
           1.1026e-02,  4.2390e-02,  2.5491e-03, -7.8009e-02,  3.2994e-02,
           2.6747e-02, -5.3774e-02,  7.7423e-02,  2.1110e-02, -1.4736e-01,
          -3.7681e-02, -1.5344e-01,  1.0016e-01, -1.7052e-01, -2.1911e-01,
          -8.6054e-02,  8.1558e-02, -6.2331e-02, -2.3602e-02, -3.4057e-02,
           2.3995e-02, -1.8137e-02, -8.1614e-03, -3.5421e-02,  9.7237e-02,
          -1.0874e-01, -7.5366e-02,  3.4162e-02, -2.9242e-02, -5.2934e-02,
           1.2119e-02, -1.0767e-01, -8.3322e-02, -3.4783e-02,  2.8931e-02,
          -4.3873e-02, -2

In [None]:
idx = random.choice(range(0, len(sentences)))
masked_words[idx], sentences[idx], predicted_sentences[idx], softmax_scores[idx]

In [None]:
to_df = []
for (model_language, dataset_language), score in results.items():
    to_df.append({
        "model_language" : model_language,
        "dataset_language" : dataset_language,
        "avg_score" : score[0],
        "median_score" : score[1],
        "cos_simil" : score[2]
    })

df = pd.DataFrame(to_df)

In [None]:
df.head()

In [None]:
from lets_plot import *
LetsPlot.setup_html()

bunch = GGBunch()
bunch.add_plot(ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='cos_dist '), data=df.query("model_language != 'es'")) + scale_fill_gradient(low='white', high='blue') + ggsize(500, 500) + ggtitle("Cosine Similarity")
, 0, 0)
bunch.add_plot(ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='avg_score'), data=df.query("model_language != 'es'")) + scale_fill_gradient(low='white', high='red') + ggsize(500, 500) + ggtitle("Average Score")
, 500, 0)

bunch.show()

In [None]:
ggsave(bunch, "results.html")