# <center> Comparing different SBERT models

### SBERT converts sentences into fixed-dimensional vectors. Sevreal tasks like similarity of sentences, clustering, and classification tasks can be done.

In [1]:
from sentence_transformers import SentenceTransformer,util
import torch
import pandas as pd
#import dataframe_image as dfi
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

model1 = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model2 = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model3 = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Corpus = ['He is playing a guitar','A man is playing guitar','The new movie is awesome']
Premise = ['She is playing violin','He is playing violin','Dog is playing in ground']

#Model1: 
Corpus_model1 = model1.encode(Corpus, convert_to_tensor=True)
Premise_model1 = model1.encode(Premise, convert_to_tensor=True)

#Compute cosine-similarities in nmatrix format
cosine_scores_model1 = util.cos_sim(Corpus_model1, Premise_model1)

#Model2
Corpus_model2 = model2.encode(Corpus, convert_to_tensor=True)
Premise_model2 = model2.encode(Premise, convert_to_tensor=True)

#Compute cosine-similarities in nmatrix format
cosine_scores_model2 = util.cos_sim(Corpus_model2, Premise_model2)

#Model3
Corpus_model3 = model3.encode(Corpus, convert_to_tensor=True)
Premise_model3 = model3.encode(Premise, convert_to_tensor=True)

#Compute cosine-similarities in nmatrix format
cosine_scores_model3 = util.cos_sim(Corpus_model3, Premise_model3)

print(cosine_scores_model1, cosine_scores_model2, cosine_scores_model3)

tensor([[0.2586, 0.4137, 0.1028],
        [0.1864, 0.3825, 0.0818],
        [0.1035, 0.0752, 0.0221]]) tensor([[ 0.2319,  0.3928,  0.0673],
        [ 0.1688,  0.3756,  0.0639],
        [-0.0496, -0.0484, -0.0347]]) tensor([[0.5717, 0.6732, 0.3612],
        [0.4373, 0.5479, 0.3622],
        [0.0906, 0.1030, 0.0082]])


In [3]:
type(cosine_scores_model1)

torch.Tensor

In [4]:
# Transpose check by flipping Corpus and Premise
Corpus = ['She is playing violin','He is playing violin','Dog is playing in ground']
Premise = ['He is playing a guitar','A man is playing guitar','The new movie is awesome']

#Model1 
Corpus_model1 = model1.encode(Corpus, convert_to_tensor=True)
Premise_model1 = model1.encode(Premise, convert_to_tensor=True)
#Compute cosine-similarities in nmatrix format
cosine_scores_model1 = util.cos_sim(Corpus_model1, Premise_model1)

#Model2
Corpus_model2 = model2.encode(Corpus, convert_to_tensor=True)
Premise_model2 = model2.encode(Premise, convert_to_tensor=True)
#Compute cosine-similarities in nmatrix format
cosine_scores_model2 = util.cos_sim(Corpus_model2, Premise_model2)

#Model3
Corpus_model3 = model3.encode(Corpus, convert_to_tensor=True)
Premise_model3 = model3.encode(Premise, convert_to_tensor=True)
#Compute cosine-similarities in nmatrix format
cosine_scores_model3 = util.cos_sim(Corpus_model3, Premise_model3)

print(cosine_scores_model1, cosine_scores_model2, cosine_scores_model3)

tensor([[0.2586, 0.1864, 0.1035],
        [0.4137, 0.3825, 0.0752],
        [0.1028, 0.0818, 0.0221]]) tensor([[ 0.2319,  0.1688, -0.0496],
        [ 0.3928,  0.3756, -0.0484],
        [ 0.0673,  0.0639, -0.0347]]) tensor([[0.5717, 0.4373, 0.0906],
        [0.6732, 0.5479, 0.1030],
        [0.3612, 0.3622, 0.0082]])


In [5]:
# Let us check dimensions of the models, that is the fixed lengths of the sentence embeddings. 
print("sentence-transformers/paraphrase-multilingual-mpnet-base-v2:",Corpus_model1.ndim, Corpus_model1.shape[0],Corpus_model1.shape[1])
print("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2:",Corpus_model2.ndim, Corpus_model2.shape[0],Corpus_model2.shape[1])
print("sentence-transformers/distiluse-base-multilingual-cased-v2:",Corpus_model3.ndim, Corpus_model3.shape[0],Corpus_model3.shape[1])


sentence-transformers/paraphrase-multilingual-mpnet-base-v2: 2 3 768
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: 2 3 384
sentence-transformers/distiluse-base-multilingual-cased-v2: 2 3 512


In [6]:
# Model 1 has maximum entities along the second axis or sequence length. 

# <center> Cross-lingual analysis

In [7]:
from sentence_transformers import SentenceTransformer,util
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [8]:
# Create a corpus and premises for other languages.
Corpus = ['He is playing a guitar','A man is playing guitar','The new movie is awesome']

#Premises in different languages
Premise_en = ['She is playing violin','He is playing violin','Dog is playing in ground']
Premise_ar = ["إنها تعزف على الكمان" ,"إنه يعزف على الكمان","كلب يلعب في الأرض"]
Premise_ru = ['Она играет на скрипке','Он играет на скрипке','Собака играет в земле']
Premise_uk = ['Вона грає на скрипці','Він грає на скрипці','Собака грається в землі']
Premise_zh = ['她在拉小提琴','他在拉小提琴','狗在地上玩耍']
Premise_pl = ['Ona gra na skrzypcach','On gra na skrzypcach','Pies bawi się w ziemi']
Premise_lv = ["Viņa spēlē vijoli", "Viņš spēlē vijoli", "Suns spēlē zemē"]
Premise_et = ["Ta mängib viiulit", "Ta mängib viiulit", "Koer mängib maas"]
Premise_lt = ['Ji griežia smuiku', 'Jis groja smuiku', 'Šuo groja žemėje']

In [9]:
# Convert into sentence embeddings or vectors
embeddings_corpus = model.encode(Corpus, convert_to_tensor=True)
embeddings_en = model.encode(Premise_en, convert_to_tensor=True)
embeddings_ar = model.encode(Premise_ar, convert_to_tensor=True)
embeddings_ru = model.encode(Premise_ru, convert_to_tensor=True)
embeddings_uk = model.encode(Premise_uk, convert_to_tensor=True)
embeddings_zh = model.encode(Premise_zh, convert_to_tensor=True)
embeddings_pl = model.encode(Premise_pl, convert_to_tensor=True)
embeddings_lv = model.encode(Premise_lv, convert_to_tensor=True)
embeddings_et = model.encode(Premise_et, convert_to_tensor=True)
embeddings_lt = model.encode(Premise_lt, convert_to_tensor=True)

In [10]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_en)
for i in range(len(Corpus)):
    for j in range(len(Premise_en)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_en[j], cosine_scores[i][j]))

He is playing a guitar 			 She is playing violin 			 Score: 0.2586
He is playing a guitar 			 He is playing violin 			 Score: 0.4137
He is playing a guitar 			 Dog is playing in ground 			 Score: 0.1028
A man is playing guitar 			 She is playing violin 			 Score: 0.1864
A man is playing guitar 			 He is playing violin 			 Score: 0.3825
A man is playing guitar 			 Dog is playing in ground 			 Score: 0.0818
The new movie is awesome 			 She is playing violin 			 Score: 0.1035
The new movie is awesome 			 He is playing violin 			 Score: 0.0752
The new movie is awesome 			 Dog is playing in ground 			 Score: 0.0221


In [11]:
#f-string method:
for i in range(len(Corpus)):
    for j in range(len(Premise)):
        print(f"{Corpus[i]} \t\t\t {Premise[j]} \t\t\t Score: {cosine_scores[i][j]:.4f}")

He is playing a guitar 			 He is playing a guitar 			 Score: 0.2586
He is playing a guitar 			 A man is playing guitar 			 Score: 0.4137
He is playing a guitar 			 The new movie is awesome 			 Score: 0.1028
A man is playing guitar 			 He is playing a guitar 			 Score: 0.1864
A man is playing guitar 			 A man is playing guitar 			 Score: 0.3825
A man is playing guitar 			 The new movie is awesome 			 Score: 0.0818
The new movie is awesome 			 He is playing a guitar 			 Score: 0.1035
The new movie is awesome 			 A man is playing guitar 			 Score: 0.0752
The new movie is awesome 			 The new movie is awesome 			 Score: 0.0221


In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_ar)
for i in range(len(Corpus)):
    for j in range(len(Premise_ar)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_ar[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_ru)
for i in range(len(Corpus)):
    for j in range(len(Premise_ru)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_ru[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_uk)
for i in range(len(Corpus)):
    for j in range(len(Premise_uk)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_uk[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_zh)
for i in range(len(Corpus)):
    for j in range(len(Premise_zh)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_zh[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_pl)
for i in range(len(Corpus)):
    for j in range(len(Premise_pl)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_pl[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_lv)
for i in range(len(Corpus)):
    for j in range(len(Premise_lv)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_lv[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_et)
for i in range(len(Corpus)):
    for j in range(len(Premise_et)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_et[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
cosine_scores = util.cos_sim(embeddings_corpus, embeddings_lt)
for i in range(len(Corpus)):
    for j in range(len(Premise_lt)):
        print("{} \t\t\t {} \t\t\t Score: {:.4f}".format(Corpus[i], Premise_lt[j], cosine_scores[i][j]))

In [None]:
cosine_scores

In [None]:
English = [0.2586,0.4137, 0.1028,0.1864, 0.3825, 0.0818,0.1035, 0.0752, 0.0221]
Arabic = [0.3790, 0.5544, 0.0837,0.2741, 0.4937, 0.0554,0.1688, 0.1383, 0.0323]
Russian = [0.2919, 0.4602, 0.1084,0.2120, 0.4221, 0.0710,0.1094, 0.0777, 0.0494]
Ukrainian = [0.2999, 0.4587, 0.1052,0.2209, 0.4252, 0.0616,0.1212, 0.0849, 0.0770]
Chinese = [0.2955, 0.4648, 0.0697,0.1999, 0.4136, 0.0292,0.1377, 0.1164, 0.0501]
Polish = [0.2817, 0.4404, 0.0751,0.1990, 0.4025, 0.0431,0.1219, 0.0844, 0.0394]
Latvian = [0.2693, 0.4335, 0.1006,0.1868, 0.3904, 0.0717,0.1156, 0.0880, 0.0129]
Estonian = [0.4191, 0.4191, 0.0905,0.3539, 0.3539, 0.0586,0.0848, 0.0848, 0.0467]
Lithuanian = [0.2998, 0.4903, 0.1772,0.2101, 0.4498, 0.1426,0.1284, 0.0855, 0.0667]

In [None]:
df = pd.DataFrame(list(zip(English,Arabic,Russian,Ukrainian,Chinese,Polish,Latvian,Estonian,Lithuanian)),
               columns =['English','Arabic','Russian','Ukrainian','Chinese','Polish','Latvian','Estonian','Lithuanian'])
df

In [None]:
sns.heatmap(df, annot=True)

In [None]:
df.describe()

In [None]:
df.corr()