In [1]:
import torch
import transformers
import itertools
from nltk import word_tokenize
import numpy as np

def align_sentence(src,
                   tgt,
                   model_name='UGARIT/grc-alignment',
                   match="softmax", # possible values ["softmax", "argmax"]
                   align_layer=8):
    
    # Load the model
    model = transformers.XLMRobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(model_name)

    # tokenizing, first phase
    sent_src = word_tokenize(src.strip())  
    sent_tgt = word_tokenize(tgt.strip())  

    token_src = [tokenizer.tokenize(word) for word in sent_src]
    token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
    
    wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
    wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

    ids_src = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt',
                                          model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']
    ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt',
                                          truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']

    sub2word_map_src = []
    for i, word_list in enumerate(token_src):
        sub2word_map_src += [i for x in word_list]

    sub2word_map_tgt = []
    for i, word_list in enumerate(token_tgt):
        sub2word_map_tgt += [i for x in word_list]

    # alignment
    threshold = 1e-3
    model.eval()

    with torch.no_grad():
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)["hidden_states"][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)["hidden_states"][align_layer][0, 1:-1]
        # Create Similarity Matrix With Dot Product
        sim_matrix = torch.matmul(out_src, out_tgt.transpose(-1, -2))  # dot_prod

        # Extraction Algorithms
        if match == "softmax":
            softmax_srctgt = torch.nn.Softmax(dim=-1)(sim_matrix)
            softmax_tgtsrc = torch.nn.Softmax(dim=-2)(sim_matrix)
            intersection = (softmax_srctgt > threshold) * (softmax_tgtsrc > threshold)
        else:
            sim_matrix = cosine_similarity(out_src, out_tgt)  # cosine_similarity
            m, n = sim_matrix.shape
            forward = np.eye(n)[sim_matrix.argmax(axis=1)]  # m x n
            backward = np.eye(m)[sim_matrix.argmax(axis=0)]  # n x m
            intersection = torch.from_numpy(forward * backward.transpose())

    align_subwords = torch.nonzero(intersection, as_tuple=False)

    # Extract alignments
    alignment = set()
    for i, j in align_subwords:
        alignment.add((sub2word_map_src[i], sub2word_map_tgt[j]))
    return {"src": sent_src, "tgt": sent_tgt, "alignment": list(alignment)} 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
src = 'Λητοῦς καὶ Διὸς υἱός'
tgt = 'The son of Leto and Zeus'

align_sentence(src, tgt) # Softmax is default

Some weights of the model checkpoint at UGARIT/grc-alignment were not used when initializing XLMRobertaForMaskedLM: ['psi_cls.bias', 'psi_cls.decoder.weight', 'psi_cls.transform.bias', 'psi_cls.transform.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'src': ['Λητοῦς', 'καὶ', 'Διὸς', 'υἱός'],
 'tgt': ['The', 'son', 'of', 'Leto', 'and', 'Zeus'],
 'alignment': [(3, 1), (0, 3), (2, 5), (1, 4)]}

In [3]:
align_sentence(src, tgt, match="argmax") # argmax

Some weights of the model checkpoint at UGARIT/grc-alignment were not used when initializing XLMRobertaForMaskedLM: ['psi_cls.bias', 'psi_cls.decoder.weight', 'psi_cls.transform.bias', 'psi_cls.transform.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'cosine_similarity' is not defined