In [1]:
# Find the similarity between two texts

In [2]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained paraphrase model that turns sentences into embeddings.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Sentence pairs to compare: sentences1[k] is scored against sentences2[k].
sentences1 = [
    'The cat sits outside. Today is a rainy day',
    'A man is playing guitar',
    'The new movie is awesome',
    'Apple and banana are my favorite fruits.',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great. I love it.',
    'Apple and Banana Republic are american brands.',
]

# Encode each list (first list, then second) into a tensor of embeddings.
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)

# Compute cosine similarities between the two sets of embeddings.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print each aligned pair; the score for pair k is read from position [k][k].
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for idx, (first, second) in enumerate(zip(sentences1, sentences2)):
    print(row_template.format(first, second, cosine_scores[idx][idx]))

The cat sits outside. Today is a rainy day 		 The dog plays in the garden 		 Score: 0.3498
A man is playing guitar 		 A woman watches TV 		 Score: 0.1759
The new movie is awesome 		 The new movie is so great. I love it. 		 Score: 0.8492
Apple and banana are my favorite fruits. 		 Apple and Banana Republic are american brands. 		 Score: 0.6408


In [3]:
# Test the model's accuracy on the dataset

In [4]:
import pandas as pd
# Load the tab-separated test file; read_table splits on tabs by default.
data_frame = pd.read_table("headlines.test.tsv")

# Bare expression so the notebook renders the frame as a table.
data_frame

Unnamed: 0,Similarity,Sentence 1,Sentence 2
0,4.20,"In Nigeria, Chevron has been accused by the Al...","In Nigeria, the whole ijaw indigenous showed C..."
1,4.25,I know that in France they have had whole herd...,"I know that in France, the principle of slaugh..."
2,4.80,"Unfortunately, the ultimate objective of a Eur...",Unfortunately the final objective of a Europea...
3,4.80,The right of a government arbitrarily to set a...,The right for a government to draw aside its c...
4,4.80,The right of a government arbitrarily to set a...,The right for a government to dismiss arbitrar...
...,...,...,...
61,2.75,Mr Morse is charged with assault and Mr Darvis...,His partner Bijan Darvish is charged with fili...
62,3.25,"The mock explosion, the first event in the dri...","The mock explosion of a radioactive ""dirty bom..."
63,3.25,"The third appointment was to a new job, execut...","Bruce N. Hawthorne, 53, was named executive vi..."
64,2.50,The commission dropped charges that Patton imp...,Patton also appointed Conner to the Kentucky L...


In [5]:
# Pull the two sentence columns out of the frame as plain Python lists.
sentences1 = data_frame["Sentence 1"].tolist()
sentences2 = data_frame["Sentence 2"].tolist()

# Scale every rating by 0.2 in one vectorised step.
# NOTE(review): presumably the ratings are on a 0-5 scale, so this maps them
# to 0-1 for comparison with cosine scores -- confirm against the dataset.
similarities = (data_frame["Similarity"] * 0.2).tolist()

# Dump the three lists, in the same order as they were built.
for column_values in (sentences1, sentences2, similarities):
    print(column_values)

['In Nigeria, the whole ijaw indigenous showed Chevron to encourage the violence against them and of up to pay Nigerian soldiers to shoot the demonstrators at the naval base from Warri.', 'I know that in France, the principle of slaughter of whole herd has been implemented and that this is not the best way to combat this phenomenon.', 'Unfortunately the final objective of a European Constitution would be exactly the opposite and obviously we cannot approve it.', 'The right for a government to draw aside its constitution arbitrarily is the definition characteristic of a tyranny.', 'The right for a government to dismiss arbitrarily its constitution is the definition of a characteristic tyranny.', 'But other sources close to the sale said Vivendi was keeping the door open for further bids in the next day or two.', "Micron's numbers also marked the first quarterly profit in three years for the DRAM manufacturer.", "Perry said he backs the Senate's efforts, including the fines, to force the

In [6]:
from sentence_transformers import SentenceTransformer, util

# Instantiate the pretrained paraphrase model by name.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Encode both sentence lists (first list, then second) into embedding tensors.
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)

# Show both embedding tensors, one after the other.
print(embeddings1, embeddings2, sep="\n")

tensor([[-0.0303,  0.2912,  0.1155,  ...,  0.6757,  0.0406,  0.0214],
        [-0.0401,  0.0601,  0.2873,  ...,  0.2321,  0.1433,  0.2102],
        [ 0.1091,  0.5481,  0.0626,  ...,  0.6095,  0.1814,  0.0681],
        ...,
        [ 0.0526,  0.3425,  0.0958,  ..., -0.3047, -0.0406,  0.1113],
        [ 0.1083,  0.2988,  0.0239,  ...,  0.0248,  0.1180, -0.1757],
        [-0.0367,  0.4687, -0.1689,  ...,  0.1332,  0.1357, -0.1066]])
tensor([[ 0.0961,  0.1433,  0.1425,  ...,  0.5805, -0.1245, -0.0684],
        [ 0.0997,  0.1774,  0.2231,  ...,  0.4032,  0.2753,  0.3018],
        [ 0.3123,  0.7210,  0.2541,  ...,  0.6080,  0.1859, -0.0712],
        ...,
        [ 0.2473,  0.2396,  0.0228,  ..., -0.3773,  0.0272, -0.0923],
        [ 0.0271,  0.1912,  0.1319,  ..., -0.2053, -0.0304, -0.2544],
        [ 0.3917,  0.2489,  0.2606,  ...,  0.8711,  0.1934, -0.2630]])


In [7]:
# Compute cosine similarities between the two sets of embeddings.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# For every pair, print both sentences, the dataset score, and the computed
# score, which is read from position [k][k] of the similarity matrix.
report_template = "{}\n {}\nScore: {:.4f}\nComputed Score: {:.4f}"
for idx, (first, second, expected) in enumerate(zip(sentences1, sentences2, similarities)):
    print(report_template.format(first, second, expected, cosine_scores[idx][idx]))

In Nigeria, Chevron has been accused by the All-Ijaw indigenous people of instigating violence against them and actually paying Nigerian soldiers to shoot protesters at the Warri naval base.
 In Nigeria, the whole ijaw indigenous showed Chevron to encourage the violence against them and of up to pay Nigerian soldiers to shoot the demonstrators at the naval base from Warri.
Score: 0.8400
Computed Score: 0.7139
I know that in France they have had whole herd slaughter and this does not seem to be the best way forward.
 I know that in France, the principle of slaughter of whole herd has been implemented and that this is not the best way to combat this phenomenon.
Score: 0.8500
Computed Score: 0.8003
Unfortunately, the ultimate objective of a European Constitution would be precisely the opposite, and so, of course, we cannot vote for it.
 Unfortunately the final objective of a European Constitution would be exactly the opposite and obviously we cannot approve it.
Score: 0.9600
Computed Scor

In [8]:
from sklearn.metrics import mean_squared_error

# The computed score for pair k is entry [k][k] of the similarity matrix, so
# take the whole diagonal in one call instead of indexing element by element.
# Move it to the CPU and convert to plain Python floats: scikit-learn converts
# its inputs to NumPy arrays, which fails for tensors that live on a GPU and
# is fragile for a list of 0-dim tensor objects.
computed_similarities = cosine_scores.diagonal().cpu().tolist()

# Mean squared error between the dataset scores and the computed scores;
# left as the last expression so the notebook displays the value.
mean_squared_error(similarities, computed_similarities)

0.022684910386787667

In [9]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the computed scores, with the dataset
# scores as ground truth; displayed as the cell's output.
r2_score(y_true=similarities, y_pred=computed_similarities)

0.2895542611525993

In [10]:
from sklearn.metrics import max_error

# Largest absolute difference between a dataset score and its computed score;
# displayed as the cell's output.
max_error(y_true=similarities, y_pred=computed_similarities)

0.3975889205932617