In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.metrics.pairwise import cosine_similarity



Utility Functions

In [2]:
## utility functions 
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out tokens.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings (presumably shaped (batch, seq_len, hidden)
            -- TODO confirm against the model in use).
        attention_mask: Tensor with one entry per token; it is broadcast over
            the trailing embedding dimension and used as a multiplicative
            weight (presumably 1 for real tokens and 0 for padding).

    Returns:
        Tensor of mask-weighted sums divided by the mask totals, reduced over
        dimension 1.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts


def load_model(name, do_lower_case=False):
    """Load a pretrained model together with its tokenizer.

    Args:
        name: Model identifier or path passed to both ``from_pretrained`` calls.
        do_lower_case: Forwarded to ``AutoTokenizer.from_pretrained``.
            Defaults to False.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    # Forward the caller's choice instead of a hard-coded False, so the
    # parameter actually has an effect.
    tokenizer = AutoTokenizer.from_pretrained(name, do_lower_case=do_lower_case)
    model = AutoModel.from_pretrained(name)
    return model, tokenizer


def calc_embeddings(model, tokenizer, sentences, max_length=128):
    """Encode sentences and mean-pool the model output into one vector each.

    Args:
        model: Callable model invoked with the tokenizer output as keyword
            arguments.
        tokenizer: Callable tokenizer; invoked with padding and truncation
            enabled and ``return_tensors='pt'``. Its result must expose an
            ``'attention_mask'`` entry.
        sentences: The sentences to embed.
        max_length: Truncation length passed to the tokenizer. Defaults to 128.

    Returns:
        NumPy array produced by ``mean_pooling`` (presumably one row per
        sentence -- TODO confirm). No per-dimension standardisation is applied.
    """
    encoded_input = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded_input)
    pooled = mean_pooling(model_output, encoded_input['attention_mask'])
    # .cpu() is a no-op for CPU tensors and keeps .numpy() from failing when
    # the pooled tensor lives on another device.
    return pooled.cpu().numpy()

def plot_similarity(sentence_embeddings, labels=None):
    """Show an annotated heatmap of pairwise sentence similarity.

    Similarity is the cosine similarity rescaled as ``(1 + cos) / 2``, which
    maps the cosine range [-1, 1] onto [0, 1].

    Args:
        sentence_embeddings: Sequence or array with one embedding per row.
        labels: Tick labels, one per embedding. Defaults to the eight
            ``SentN_<lang>`` labels for two sentences in en/hi/ta/mr.

    Raises:
        ValueError: If the number of labels differs from the number of
            embeddings.
    """
    if labels is None:
        labels = ['Sent1_en', 'Sent1_hi', 'Sent1_ta', 'Sent1_mr',
                  'Sent2_en', 'Sent2_hi', 'Sent2_ta', 'Sent2_mr']
    # Validate before touching any plotting state so a mismatch fails with a
    # clear message instead of a mislabelled or broken figure.
    n_embeddings = len(sentence_embeddings)
    if len(labels) != n_embeddings:
        raise ValueError(
            'Expected {} labels (one per embedding), got {}'.format(
                n_embeddings, len(labels)))

    sns.set(font_scale=1.2)
    similarity = (1 + cosine_similarity(sentence_embeddings)) / 2
    _, ax = plt.subplots(figsize=(14, 14))
    # `linewidths` is seaborn's documented keyword for the cell separators.
    sns.heatmap(similarity, linewidths=0.5, annot=True,
                xticklabels=labels, yticklabels=labels, ax=ax)
    plt.show()




In [3]:
# Two sentences, each given in four languages. The order (en, hi, ta, mr)
# matches the tick labels and language codes used further down.
sentences = [
    # Sentence 1
    'Can you please help me here?',                # en
    'क्या आप कृपया यहाँ मेरी मदद कर सकते हैं?',                # hi
    'தயவுசெய்து எனக்கு இங்கு உதவ முடியுமா?',         # ta
    'तुम्ही मला इथे मदत करू शकता का?',                   # mr
    # Sentence 2
    'Sure, what help do you need?',                # en
    'ज़रूर, आपको क्या मदद चाहिए?',                      # hi
    'நிச்சயமாக, உங்களுக்கு என்ன உதவி தேவை?',        # ta
    'नक्कीच, तुम्हाला कोणती मदत हवी आहे?',                # mr
]



LaBSE Embeddings

In [4]:
# LaBSE checkpoint; swap in "bert-base-multilingual-cased" to try mBERT instead.
model_name = "pvl/labse_bert"
model, tokenizer = load_model(model_name)

# Embed every sentence, report the embedding width, then draw the heatmap.
sentence_embeddings = calc_embeddings(model, tokenizer, sentences)
print(f'Dimension of embeddings is {sentence_embeddings.shape[1]}')
plot_similarity(sentence_embeddings)

LASER Embeddings

In [5]:
!pip install laserembeddings

In [6]:
!python -m laserembeddings download-models

In [7]:
from laserembeddings import Laser

laser = Laser()
# One language code per sentence, in the same order as `sentences`;
# a shallow copy of the list is passed to the encoder.
laser_embeddings = laser.embed_sentences(
    list(sentences),
    lang=['en', 'hi', 'ta', 'mr',
          'en', 'hi', 'ta', 'mr'],
)


In [8]:
# Display the shape of the LASER embedding array.
laser_embeddings.shape

In [9]:
# Draw the pairwise similarity heatmap for the LASER embeddings.
plot_similarity(laser_embeddings)