In [1]:
from __future__ import division
from itertools import repeat
import torch
from transformers import AutoModel, AutoTokenizer
import datasets
from tokenisemt import MTWordTokenizer
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import sys, codecs, numpy

# Instantiate the word tokeniser imported from the `tokenisemt` module.
# NOTE(review): `tok` is not referenced by any other cell visible in this notebook — confirm it is still needed.
tok = MTWordTokenizer()


class autovivify_list(dict):
    """Plain ``dict`` subclass that creates an empty list for any missing key.

    Intended as a pickleable stand-in for ``collections.defaultdict(list)``.
    """

    def __missing__(self, key):
        """Store a fresh empty list under ``key`` and return that same list."""
        self[key] = fresh = []
        return fresh

    def __add__(self, x):
        """Return ``x`` if this mapping is empty and ``x`` is numeric, else raise ``ValueError``."""
        if self or not isinstance(x, Number):
            raise ValueError
        return x

    def __sub__(self, x):
        """Return ``-1 * x`` if this mapping is empty and ``x`` is numeric, else raise ``ValueError``."""
        if self or not isinstance(x, Number):
            raise ValueError
        return -1 * x


# def build_word_vector_matrix(vector_file, n_words):


def find_word_clusters(labels_array, cluster_labels):
    """Group words by the cluster each of them was assigned to.

    ``cluster_labels[k]`` is the cluster id of ``labels_array[k]``. The result is an
    ``autovivify_list`` mapping every cluster id to its words, in input order.
    """
    grouped = autovivify_list()
    for position, cluster_id in enumerate(cluster_labels):
        member = labels_array[position]
        grouped[cluster_id].append(member)
    return grouped


def get_embeddings(model, tokeniser, sentences):
    """Return the words of ``sentences`` and one contextual embedding per word.

    Every sentence is tokenised, padded/truncated to the model's number of position
    embeddings and passed through ``model``. Word pieces (tokens starting with ``##``)
    are merged back into the word they continue, and a word's embedding is the mean of
    the hidden states of its pieces. Words made only of symbols or only of digits, and
    the ``[CLS]``/``[SEP]``/``[PAD]`` tokens, are dropped.

    Args:
        model: callable accepting the encoded batch as keyword arguments and returning a
            sequence whose first element holds the hidden states; assumed to be shaped
            (batch, sequence, hidden) — TODO confirm for models other than BERT. Must
            expose ``embeddings.position_embeddings.num_embeddings``.
        tokeniser: object providing ``batch_encode_plus`` and ``get_vocab``.
        sentences: list of sentence strings.

    Returns:
        ``(words, embeddings)``: two parallel lists covering all sentences in order;
        ``embeddings[k]`` is the tensor for ``words[k]``.
    """
    if not sentences:
        return [], []

    encoded = tokeniser.batch_encode_plus(sentences,
                                          max_length=model.embeddings.position_embeddings.num_embeddings,
                                          padding="max_length",
                                          truncation=True,
                                          # return_overflowing_tokens=True,
                                          return_tensors="pt",
                                          )
    # Inference only: without this every call keeps its autograd graph alive.
    with torch.no_grad():
        hidden_states = model(**encoded)[0]

    id_to_token = {token_id: token for token, token_id in tokeniser.get_vocab().items()}
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    words = []
    embeddings = []

    def _emit(pieces, vectors):
        """Store the word built from ``pieces`` unless it is only symbols or only digits."""
        if not pieces:
            return
        word = "".join(pieces)
        # The filter runs on the whole word, so the pieces of a dropped word (e.g. the
        # "##4" of "2034") can no longer be glued onto the preceding, unrelated word.
        if all(not c.isalnum() for c in word) or all(c.isdigit() for c in word):
            return
        words.append(word)
        embeddings.append(torch.stack(vectors).mean(dim=0))

    # Walk the batch row by row so positions index the right sentence; a position
    # flattened across the whole batch is only valid for a single sentence.
    for row, token_ids in enumerate(encoded["input_ids"].tolist()):
        pieces, vectors = [], []
        for position, token_id in enumerate(token_ids):
            token = id_to_token[token_id]
            if token in special_tokens:
                _emit(pieces, vectors)
                pieces, vectors = [], []
                continue

            is_continuation = token.startswith("##")
            if is_continuation and pieces:
                pieces.append(token[2:])
            else:
                # Start of a new word (a continuation with nothing to continue is
                # treated as a word of its own, without the "##" marker).
                _emit(pieces, vectors)
                pieces, vectors = [token[2:] if is_continuation else token], []
            vectors.append(hidden_states[row, position])
        _emit(pieces, vectors)

    return words, embeddings



In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import os
import tensorflow as tf
import tensorflow_datasets as tfds
# Load the pretrained checkpoint and the tokeniser published under the same name.
model_name = "MLRS/mBERTu"
model = AutoModel.from_pretrained(model_name)
tokeniser = AutoTokenizer.from_pretrained(model_name)

# Move into the dataset folder (script form of the `%cd test_dataset` magic) and run
# the dataset build there; the command's exit status is the value shown by the cell.
os.chdir("test_dataset")
os.system("tfds build")


Some weights of the model checkpoint at MLRS/mBERTu were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at MLRS/mBERTu and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably

0

In [3]:

# Load the `test_dataset` splits together with the dataset info object.
load_options = dict(name="test_dataset", with_info=True, as_supervised=True,
                    download=True, data_dir="test_dataset")
examples, metadata = tfds.load(**load_options)
train_examples = examples['train']
val_examples = examples['validation']


# Input-pipeline settings.
BUFFER_SIZE = 20000
BATCH_SIZE = 256
# Maximum sentence length
MAX_LENGTH = 50


def tokenize_pairs(en, mt):
    """Pass a sentence pair through untouched, returned as the tuple ``(en, mt)``."""
    pair = (en, mt)
    return pair




def make_batches(ds):
    """Build the input pipeline for ``ds``: cache, shuffle, batch, map, prefetch.

    Shuffling uses ``BUFFER_SIZE``, batching uses ``BATCH_SIZE`` and every batch is
    passed through ``tokenize_pairs``; parallelism and prefetch depth are left to
    ``tf.data.AUTOTUNE``.
    """
    cached = ds.cache()
    shuffled = cached.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    mapped = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
    return mapped.prefetch(tf.data.AUTOTUNE)


# Run both splits through the same input pipeline (training split first).
train_batches, val_batches = (
    make_batches(split) for split in (train_examples, val_examples)
)


INFO:absl:Load dataset info from test_dataset\test_dataset\1.0.9
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Reusing dataset test_dataset (test_dataset\test_dataset\1.0.9)
INFO:absl:Constructing tf.data.Dataset test_dataset for split None, from test_dataset\test_dataset\1.0.9


# Step 1: Getting a list of multi-sense words

In [13]:

# Pretrained checkpoint and the tokeniser published under the same name.
model_name = "MLRS/mBERTu"
model = AutoModel.from_pretrained(model_name)
tokeniser = AutoTokenizer.from_pretrained(model_name)

# Flat accumulators filled by the embedding-extraction cell: one entry per word.
total_words_en, total_words_mt = [], []
total_embeddings_en, total_embeddings_mt = [], []

mt_examples = datasets.load_dataset("MLRS/korpus_malti", "shuffled", split="train")
# NOTE(review): `en_examples` currently loads exactly the same corpus as `mt_examples`;
# the commented-out line below suggests an English source was intended — confirm.
en_examples = datasets.load_dataset("MLRS/korpus_malti", "shuffled", split="train")
# en_examples = datasets.load_dataset("wikipedia", "20220301.en")


Some weights of the model checkpoint at MLRS/mBERTu were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at MLRS/mBERTu and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably

In [14]:

with open("../embeddings_mt.tsv", "w+", encoding="utf-8") as embeddings_mt_file, \
        open("../words_mt.tsv", "w+", encoding="utf-8") as words_mt_file, \
        open("../embeddings_en.tsv", "w+", encoding="utf-8") as embeddings_en_file, \
        open("../words_en.tsv", "w+", encoding="utf-8") as words_en_file:
    # Header row of the two word-index files.
    for header_target in (words_en_file, words_mt_file):
        header_target.write("word\tindex\n")

    # Same procedure for both corpora: (corpus, word file, embedding file, accumulators).
    passes = (
        (en_examples, words_en_file, embeddings_en_file, total_words_en, total_embeddings_en),
        (mt_examples, words_mt_file, embeddings_mt_file, total_words_mt, total_embeddings_mt),
    )
    for examples, words_file, embeddings_file, all_words, all_embeddings in passes:
        for i, sentence in enumerate(examples):
            words, embeddings = get_embeddings(model, tokeniser, [sentence["text"].strip()])
            assert len(words) == len(embeddings)

            # One tab-separated row of floats per word embedding.
            for embedding in embeddings:
                values = embedding.tolist()
                embeddings_file.write("\t".join(str(x) for x in values) + "\n")
                all_embeddings.append(values)
            # One row per word, indexed as "<sentence number>_<word number>".
            for j, word in enumerate(words):
                words_file.write(f"{word}\t{i}_{j}\n")
                all_words.append(word)


{'text': "Interessat ħafna f'dak kollu li hu relatat mal-midja u x-xandir.\n"}
{'text': "Billi l-arranġamenti in kwistjoni huma maħsuba biex inaqqsu l-ispejjeż ta' produzzjoni u l-prezzijiet tal-konsumatur;\n"}
{'text': "rapport ex post sal-31 ta' Diċembru 2034.\n"}
{'text': 'Dawn l-ispezzjonijiet huma ntiżi sabiex jiżviluppaw rikonoxximent komuni u l-interpretazzjoni tal-prassi u r-rekwiżiti.\n'}
{'text': "Informazzjoni fuq il-progress u r-riżultat ta' l-investigazzjonijiet għandhom ikunu provduti lill-Istati kollha li għandhom interess fi, jew ikunu affetwati minn, l-allegata vjolazzjoni.\n"}
{'text': 'Għandhom jingħataw ġustifikazzjonijiet raġunati minn min japplika għaliex ma jissodisfax/tissodisfax il-parametri kollha tal-eliġibbiltà u għaliex huma mistħoqqa kunsiderazzjonijiet speċjali.\n'}
{'text': "benefiċċji applikabbli, kundizzjonijiet u regoli/regolamenti; bdil raġonevoli għall-persuni rreġistrati b'diżabbiltà;\n"}
{'text': 'Hu stmat li fl-Ewropa biss mal-5,000 tifel u tifla

KeyboardInterrupt: 

In [10]:
def _group_embeddings_by_word(words, embeddings):
    """Map every distinct word to the embeddings of all its occurrences (first-seen order)."""
    grouped = {}
    for word, embedding in zip(words, embeddings):
        grouped.setdefault(word, []).append(embedding)
    return grouped


def _write_multisense_words(words, embeddings, out_file, eps=17, min_samples=2):
    """Cluster the occurrences of each word and write the words that get more than one label.

    Args:
        words: flat list of words, one entry per occurrence.
        embeddings: list parallel to ``words`` holding each occurrence's embedding.
        out_file: writable text file; receives one multi-sense word per line.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.
    """
    # A single grouping pass replaces the per-word rescan of the rest of the list, which
    # was quadratic and indexed the embeddings with the slice offset instead of the
    # absolute position (so unrelated embeddings were clustered).
    for word, occurrences in _group_embeddings_by_word(words, embeddings).items():
        if len(occurrences) < 2:
            continue

        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(occurrences)

        # NOTE(review): DBSCAN labels noise points -1, so one cluster plus noise also
        # yields two distinct labels — confirm that this should count as multi-sense.
        if len(set(labels)) > 1:
            print(labels)
            out_file.write(word + "\n")


with open("../listOfMultiSenseWords_mt.txt", "w+", encoding="utf-8") as multisensewords_mt, \
        open("../listOfMultiSenseWords_en.txt", "w+", encoding="utf-8") as multisensewords_en:
    # Each language is processed independently; previously the "already seen" list was
    # shared, so any word seen in the first corpus was skipped in the second one.
    _write_multisense_words(total_words_en, total_embeddings_en, multisensewords_en)
    _write_multisense_words(total_words_mt, total_embeddings_mt, multisensewords_mt)


# Step 2:  Mining multisense words

Loop over every word of every sentence, compute its embedding and save both the words and the embeddings to file.

In [17]:
import test_dataset
import tensorflow_text as text

with open("../embeddings_mt.tsv", "w+", encoding="utf-8") as embeddingsmt_file, \
        open("../words_mt.tsv", "w+", encoding="utf-8") as wordsmt_file, \
        open("../embeddings_en.tsv", "w+", encoding="utf-8") as embeddingsen_file, \
        open("../words_en.tsv", "w+", encoding="utf-8") as wordsen_file:
    # These are the same files the step-1 cell writes; emit the same header row so the
    # file format does not depend on which cell ran last.
    wordsen_file.write("word\tindex\n")
    wordsmt_file.write("word\tindex\n")

    # Same procedure for both corpora: (corpus, word file, embedding file).
    passes = (
        (en_examples, wordsen_file, embeddingsen_file),
        (mt_examples, wordsmt_file, embeddingsmt_file),
    )
    for examples, words_file, embeddings_file in passes:
        for i, example in enumerate(examples):
            words, embeddings = get_embeddings(model, tokeniser, [example["text"].strip()])
            # Rows of the two files are matched by position, so they must stay in step.
            assert len(words) == len(embeddings)

            # One tab-separated row of floats per word embedding.
            for embedding in embeddings:
                embeddings_file.write("\t".join(str(x) for x in embedding.tolist()) + "\n")
            # One row per word, indexed as "<sentence number>_<word number>".
            for j, word in enumerate(words):
                words_file.write(f"{word}\t{i}_{j}\n")

KeyboardInterrupt: 

Traverse each word in each sentence and check for multisense words. If one is found, get the embedding.

In [None]:
def _read_word_list(handle):
    """Return the non-empty lines of ``handle`` as a set, without surrounding whitespace."""
    # readlines() keeps the trailing "\n", which made every membership test fail.
    return {line.strip() for line in handle if line.strip()}


def _collect_multisense_occurrences(examples, multisense_words):
    """Find every occurrence of a multi-sense word in ``examples``.

    Args:
        examples: iterable of records with a ``"text"`` entry holding one sentence.
        multisense_words: set of words to look for.

    Returns:
        A list of dicts with the keys ``'word'``, ``'embedding'`` (the embedding of that
        occurrence as returned by ``get_embeddings``) and ``'sentence'`` (the record's text).
    """
    found = []
    if not multisense_words:
        return found

    for example in examples:
        sentence = example["text"]
        # Embeddings are computed once per sentence and matched against the word list
        # returned by get_embeddings itself: positions from a plain whitespace split do
        # not line up with that list (it drops symbol-only and digit-only tokens).
        words, embeddings = get_embeddings(model, tokeniser, [sentence.strip()])
        for word, embedding in zip(words, embeddings):
            if word in multisense_words:
                found.append({'word': word, 'embedding': embedding, 'sentence': sentence})
    return found


with open("../listOfMultiSenseWords_en.txt", "r", encoding="utf-8") as multisense_en, \
        open("../listOfMultiSenseWords_mt.txt", "r", encoding="utf-8") as multisense_mt:
    en_multisense_words = _read_word_list(multisense_en)
    mt_multisense_words = _read_word_list(multisense_mt)

found_multisense_en_info = _collect_multisense_occurrences(en_examples, en_multisense_words)
found_multisense_mt_info = _collect_multisense_occurrences(mt_examples, mt_multisense_words)


Get the word embeddings (and the corresponding words) of every sentence and store them.

In [14]:
# Per-sentence results: entry k holds the words / word embeddings of sentence k.
en_embeddings, en_words = [], []
mt_embeddings, mt_words = [], []

# Same procedure for both corpora: (corpus, word store, embedding store).
for examples, words_store, embeddings_store in (
        (en_examples, en_words, en_embeddings),
        (mt_examples, mt_words, mt_embeddings)):
    for example in examples:
        words, embeddings = get_embeddings(model, tokeniser, [example["text"].strip()])
        embeddings_store.append(embeddings)
        words_store.append(words)


KeyboardInterrupt



Compute the cosine similarity between each multi-sense word and every word of every sentence in the other corpus, and keep the closest matches.

In [None]:

def _flatten_word_embeddings(sentence_embeddings):
    """Stack all word embeddings into one matrix and remember where each row came from.

    Returns ``(matrix, positions)`` where ``positions[r]`` is the
    ``(sentence index, word index)`` of row ``r``; ``matrix`` is ``None`` when there
    are no words at all.
    """
    rows, positions = [], []
    for i, sentence_embedding in enumerate(sentence_embeddings):
        for j, word_embedding in enumerate(sentence_embedding):
            rows.append(word_embedding.detach().cpu().numpy().reshape(-1))
            positions.append((i, j))
    matrix = numpy.stack(rows) if rows else None
    return matrix, positions


def _closest_words(multisense_info, target_embeddings, target_words, top_k=5):
    """For every multi-sense occurrence, list the ``top_k`` most similar target words.

    Args:
        multisense_info: list of dicts with ``'word'``, ``'embedding'`` and ``'sentence'``.
        target_embeddings: per sentence, the list of its word embeddings.
        target_words: per sentence, the list of its words (parallel to ``target_embeddings``).
        top_k: number of matches kept per multi-sense occurrence.

    Returns:
        A flat list of dicts (``'multi_sense_word'``, ``'multi_sense_sentence'``,
        ``'target_word'``, ``'target_sentence'``); the matches of each occurrence are in
        ascending order of cosine similarity. ``'target_sentence'`` is the word list of
        the target sentence.
    """
    # The target corpus is converted once instead of once per multi-sense word.
    matrix, positions = _flatten_word_embeddings(target_embeddings)
    results = []
    if matrix is None:
        return results

    for multi in multisense_info:
        query = multi['embedding'].detach().cpu().numpy().reshape(1, -1)
        # One vectorised call gives the similarity of the query to every target word.
        similarities = cosine_similarity(matrix, query)[:, 0]
        # Stable ascending sort; the last ``top_k`` rows are the closest words.
        for index in numpy.argsort(similarities, kind="stable")[-top_k:]:
            i, j = positions[index]
            results.append({'multi_sense_word': multi['word'],
                            'multi_sense_sentence': multi['sentence'],
                            'target_word': target_words[i][j],
                            'target_sentence': target_words[i]})
    return results


# Multi-sense words of one corpus are matched against the words of the other corpus.
results_en = _closest_words(found_multisense_en_info, mt_embeddings, mt_words)
results_mt = _closest_words(found_multisense_mt_info, en_embeddings, en_words)

print(results_en)
print(results_mt)

# Step 3: Create sentence pairs for each multisense word and create data to be backtranslated


In [None]:
def _mask_target_word(sentence, target_word, mask="<MASK>"):
    """Return ``sentence`` as a string with every occurrence of ``target_word`` masked.

    ``sentence`` may be a string or a list of words; the similarity-search cell stores
    the target sentence as a list of words, on which ``str.replace`` does not exist.
    """
    if isinstance(sentence, str):
        return sentence.replace(target_word, mask)
    return " ".join(mask if word == target_word else word for word in sentence)


reloaded_mt_en = tf.saved_model.load('translator_mt_en')
reloaded_en_mt = tf.saved_model.load('translator_en_mt')

# NOTE(review): the target sentences of `results_mt` come from the `en` corpus but are
# fed to the mt->en translator (and vice versa for `results_en`) — confirm the direction.
for results, translator in ((results_mt, reloaded_mt_en), (results_en, reloaded_en_mt)):
    for result in results:
        source_sentence = _mask_target_word(result["target_sentence"], result["target_word"])
        target_sentence = translator(source_sentence).numpy().decode('utf-8')
        print(source_sentence)
        print(target_sentence)
    