In [14]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaForQuestionAnswering, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import random
import math

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())	

There are 1 GPU(s) available.


In [15]:
class Model:
    name: str
    tokenizer: object
    model: object

    def __init__(self, name: str, tokenizer: object = None, model: object = None):
        self.name = name
        self.tokenizer = tokenizer
        self.model = model

    def __repr__(self):
        return f"Model(name={self.name})"

    def set_tokenizer(self, tokenizer: object):
        self.tokenizer = tokenizer

    def set_model(self, model: object):
        self.model = model

    def to_pipeline(self):
        return pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer, device=0, top_k = 1)

    def get_collator(self):
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=True, mlm_probability=0.15)

In [16]:
class Perplexity:
	def __init__(self):
		self.sum = 0
		self.number =0
		self.str = 'perplexity'
		self.metric_scores = {}

	# cross entropy loss
	def score(self, loss):
		self.sum += loss
		# should be number of sentences
		self.number += 1

	def reset(self):
		self.sum = 0
		self.number = 0

	def get_score(self):
		if self.sum ==0:
			self.metric_scores[self.str] = 0.0
		self.metric_scores[self.str] = math.exp(self.sum / self.number)
		self.metric_scores["sum"] = self.str
		return self.metric_scores

In [17]:
# load models
models = {
    "de" : Model("uklfr/gottbert-base"),
    "nl" : Model("pdelobelle/robbert-v2-dutch-base"),
    "se" : Model("birgermoell/roberta-swedish"),
    "dk" : Model("DDSC/roberta-base-danish"),
}

for language, model in tqdm(models.items()):
    # get model & tokenizer from huggingface
    model.set_tokenizer(AutoTokenizer.from_pretrained(model.name, model_max_length=512))
    model.set_model(AutoModelForMaskedLM.from_pretrained(model.name).to("cuda"))


100%|██████████| 4/4 [00:19<00:00,  4.89s/it]


In [18]:
# create pipelines

pipelines = {k: v.to_pipeline() for k, v in models.items()}

In [19]:
%%capture
# load datasets

def load_dataset_local(dataset_name):
    with open(f"wiki_data/{dataset_name}.txt") as f:
        return f.read().splitlines()

datasets = {
    "nl" : load_dataset_local("nlwiki_sentences"),
    "dk" : load_dataset_local("dawiki_sentences"),
    "se" : load_dataset_local("svwiki_sentences"),
    "de" : load_dataset_local("dewiki_sentences"),
}

In [20]:
word_pairs = {}

In [21]:
perplexity = load("perplexity", module_type="metric")

In [22]:
results = {}
# tokenize each dataset for each model
for model_language, pipeline in tqdm(pipelines.items()):

    for dataset_language, dataset in datasets.items():
        # raw data
        raw_data = dataset
        # split into sentences
        # TODO: permorm additional data cleaning
        sentences = [sentence.split(" ") for text in raw_data for sentence in text.split(".")]
        masked_words = []     
        # mask one word in each sentence
        for sentence in sentences:
            # get random index
            index = random.randint(0, len(sentence) - 1)
            # mask word
            masked_words.append(sentence[index])
            sentence[index] = pipeline.tokenizer.mask_token

        sentences = [" ".join(sentence) for sentence in sentences]
        
        try:
            predictions = pipeline(sentences)
        except RuntimeError:
            print(f"Error with {model_language} and {dataset_language}")
            continue

        # get average score
        softmax_scores = [prediction[0]["score"] for prediction in predictions]
        predicted_sentences = [prediction[0]["sequence"] for prediction in predictions]
        predicted_words = [prediction[0]["token_str"] for prediction in predictions]
        # get vector for masked and predicted word
        masked_index = [pipelines[dataset_language].tokenizer.encode(token, add_special_tokens=True) for token in masked_words]
        predict_index = [pipelines[dataset_language].tokenizer.encode(token, add_special_tokens=True) for token in predicted_words]
        unknown_index = pipelines[dataset_language].tokenizer.encode(pipelines[dataset_language].tokenizer.unk_token, add_special_tokens=True)
        word_pairs[f"{model_language}-{dataset_language}"] = [masked_words, predicted_words]
        nr_unks = []
        ul = len(unknown_index)
        for i in range(len(masked_index)):
            ml = len(masked_index[i])
            pl = len(predict_index[i])
            diff = abs(ml - pl)
            repeats = diff // ul
            nr_unks.append(repeats + 1)
            if ml < pl and diff % ul == 0:
                # diff is a multiple of ul
                masked_index[i] = masked_index[i] + unknown_index * repeats
            elif ml < pl and diff % ul != 0 and diff < ul:
                # diff is smaller than ul
                masked_index[i] = masked_index[i] + unknown_index[:diff]
            elif ml < pl and diff % ul != 0 and diff >= ul:
                # diff is larger than ul and not a multiple of ul
                masked_index[i] = masked_index[i] + unknown_index * repeats + unknown_index[:diff % ul]
            elif ml > pl and diff % ul == 0:
                predict_index[i] = predict_index[i] + unknown_index * repeats
            elif ml > pl and diff % ul != 0 and diff < ul:
                predict_index[i] = predict_index[i] + unknown_index[:diff]
            elif ml > pl and diff % ul != 0 and diff >= ul:
                predict_index[i] = predict_index[i] + unknown_index * repeats + unknown_index[:diff % ul]

        # calculate cosine similarity between predicted words and masked words
        cosine_similarities = []
        for i in range(len(masked_index)):
            if masked_index[i] == [] or predict_index[i] == []:
                cosine_similarities.append(np.array(0))
                continue
            # get vector for masked word
            masked_vector = pipelines[dataset_language].model.roberta.embeddings.word_embeddings(torch.tensor(masked_index[i]).to("cuda"))
            # get vector for predicted word
            predict_vector = pipelines[dataset_language].model.roberta.embeddings.word_embeddings(torch.tensor(predict_index[i]).to("cuda"))
            # if masked_vector.shape[0] < predict_vector.shape[0]:
            #     # extend with unk_vector until shape is the same
            #     masked_vector = torch.nn.functional.pad(masked_vector, (0, 0, 0, predict_vector.shape[0] - masked_vector.shape[0]))
            # elif masked_vector.shape[0] > predict_vector.shape[0]:
            #     predict_vector = torch.nn.functional.pad(predict_vector, (0, 0, 0, masked_vector.shape[0] - predict_vector.shape[0]))

            # calculate cosine similarity using torch
            cos_sim = torch.nn.functional.cosine_similarity(masked_vector, predict_vector, dim=1).cpu().detach().numpy()
            cosine_similarities.append(np.mean(cos_sim) / nr_unks[i])

        # calculate perplexity
        perplexity_scores = perplexity.compute(predictions=predicted_words, references=masked_words, model_id=models[dataset_language].name)
        perplexity_score = np.mean(perplexity_scores["perplexities"]) / len(masked_words)
        # mean cosine similarity
        # flatten cos sim
        # cosine_similarities = [item for sublist in cosine_similarities for item in sublist]
        mean_cosine_similarity = np.mean(cosine_similarities)

        average_score = np.mean(softmax_scores)
        median_score = np.median(softmax_scores)
        print(f"{model_language} - {dataset_language}: {average_score}, {mean_cosine_similarity}, {perplexity_score}")
        results[(model_language, dataset_language)] = [average_score, median_score, mean_cosine_similarity, perplexity_score]


  0%|          | 0/4 [00:00<?, ?it/s]If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:05<00:00, 133.19it/s]


de - nl: 0.28799366936014364, 0.49485372149553464, 131126265.43179055


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:05<00:00, 139.79it/s]


de - dk: 0.25232107056116804, 0.24082141665132953, 2898.096486008697


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 743/743 [00:07<00:00, 100.36it/s]


de - se: 0.23813699464233723, 0.128770391361013, 4200610.145795178


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 694/694 [00:02<00:00, 257.51it/s]
 25%|██▌       | 1/4 [05:12<15:37, 312.34s/it]

de - de: 0.49317091358430865, 0.5455624932683708, 89853317078.90202


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:02<00:00, 267.34it/s]


nl - nl: 0.5185370977312649, 0.5437241566452584, 398850193.39156175


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:04<00:00, 147.86it/s]


nl - dk: 0.17897887501138462, 0.23401108517735555, 14397.090109044244


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 743/743 [00:04<00:00, 182.82it/s]


nl - se: 0.18017199604387116, 0.10809311741740346, 26868733.755516145


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 694/694 [00:05<00:00, 138.39it/s]
 50%|█████     | 2/4 [10:05<10:02, 301.13s/it]

nl - de: 0.2577283655412955, 0.4970116746180915, 4037656924.3197265


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:04<00:00, 154.85it/s]


se - nl: 0.25600145236898453, 0.493979878364008, 111043148.30228892


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:03<00:00, 182.70it/s]


se - dk: 0.31182136101284147, 0.27232591309601284, 450140.72381914913


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 743/743 [00:02<00:00, 258.81it/s]


se - se: 0.42099222592509317, 0.23732886433232087, 226911764.1768161


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 694/694 [00:04<00:00, 146.61it/s]
 75%|███████▌  | 3/4 [15:01<04:58, 298.95s/it]

se - de: 0.2643633002816717, 0.4949022586074477, 2916742796.5731316


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 697/697 [00:04<00:00, 148.81it/s]


dk - nl: 0.2923883924571594, 0.4865246465280186, 99989013.2068993


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 721/721 [00:02<00:00, 257.51it/s]


dk - dk: 0.4362164365570731, 0.3069264163017941, 185597.7234408888


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 743/743 [00:05<00:00, 139.67it/s]


dk - se: 0.29689900109596135, 0.20132672174656221, 84420212.9207603


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 694/694 [00:04<00:00, 146.85it/s]
100%|██████████| 4/4 [19:53<00:00, 298.46s/it]

dk - de: 0.27977260160608414, 0.4847021962209376, 2065837084.773247





In [45]:
idx = random.choice(range(0, len(sentences)))
masked_words[idx], sentences[idx], predicted_sentences[idx], softmax_scores[idx]

('Molnár,',
 'Nino Albarosa, Nino Antonellini, József <mask> Elsa Respighi, Nino Rota, Monika Ryba und Carlo Zecchi',
 'Nino Albarosa, Nino Antonellini, József, Elsa Respighi, Nino Rota, Monika Ryba und Carlo Zecchi',
 0.7408052086830139)

In [37]:
import json
with open("results.json", "w") as f:
    json.dump(word_pairs, f)

In [24]:
to_df = []
for (model_language, dataset_language), score in results.items():
    to_df.append({
        "model_language" : model_language,
        "dataset_language" : dataset_language,
        "avg_score" : score[0],
        "median_score" : score[1],
        "cos_simil" : score[2],
        "perplexity" : score[3]
    })

df = pd.DataFrame(to_df)

In [31]:
# plot ranking
df["rank"] = df.groupby("dataset_language")["cos_simil"].rank(ascending=True)
df.sort_values(by=["dataset_language", "model_language"], inplace=True)

In [38]:
df

Unnamed: 0,model_language,dataset_language,avg_score,median_score,cos_simil,perplexity,rank
3,de,de,0.493171,0.424997,0.545562,89853320000.0,4.0
15,dk,de,0.279773,0.218365,0.484702,2065837000.0,1.0
7,nl,de,0.257728,0.177668,0.497012,4037657000.0,3.0
11,se,de,0.264363,0.200517,0.494902,2916743000.0,2.0
1,de,dk,0.252321,0.186145,0.240821,2898.096,2.0
13,dk,dk,0.436216,0.363415,0.306926,185597.7,4.0
5,nl,dk,0.178979,0.111888,0.234011,14397.09,1.0
9,se,dk,0.311821,0.230629,0.272326,450140.7,3.0
0,de,nl,0.287994,0.202455,0.494854,131126300.0,3.0
12,dk,nl,0.292388,0.221076,0.486525,99989010.0,1.0


In [36]:
from lets_plot import *
LetsPlot.setup_html()

bunch = GGBunch()
bunch.add_plot(ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='rank'), data=df) + scale_fill_gradient(low='white', high='blue') + ggsize(500, 500) + ggtitle("Cosine Similarity") + scale_x_discrete_reversed() + scale_y_discrete()
, 0, 0)
bunch.add_plot(ggplot() + geom_tile(aes(x='model_language', y='dataset_language', fill='avg_score'), data=df) + scale_fill_gradient(low='white', high='red') + ggsize(500, 500) + ggtitle("Average Score") + scale_x_discrete_reversed() + scale_y_discrete()
, 500, 0)

bunch.show()

In [None]:
ggsave(bunch, "results.html")

'/home/inkompotato/itu/nlp-project/lets-plot-images/results.html'