In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
dataset = load_dataset("sentence-transformers/stsb", split="train")
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")


Loaded 5749 examples. Sample:
{'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.', 'score': 1.0} 



In [6]:
# Generate embeddings for the first few sentence pairs (do not run without need, save results, costs money)
model_name = 'nomic-embed-text-v1'
file_to_save = 'nomic_text-'+model_name+'.pickle'
embeddings = pd.DataFrame()
sentence1 = []
sentence2 = []
embedding1 = []
embedding2 = []
cosine_similarity = []
human_score = []

model = SentenceTransformer("nomic-ai/"+model_name, trust_remote_code=True)

print('Extracting embedding and measuring similarity ...')
time.sleep(1)
for idx in tqdm(range(len(dataset))):
    first_sentence = dataset[idx]["sentence1"]
    emb1 = response = list(model.encode(first_sentence))
    sentence1.append(
        first_sentence
    )
    embedding1.append(
        emb1
    )
    #############################
    second_sentence = dataset[idx]["sentence2"]
    emb2 = list(model.encode(second_sentence))
    sentence2.append(
        second_sentence
    )
    embedding2.append(
        emb2
    )
    
    ############################
    cosine_similarity.append(
        1 - cosine(emb1, emb2)
    )
    human_score.append(
        dataset[idx]["score"]
    )
    
    time.sleep(1)  # respect rate limits!

embeddings['Sentence 1'] = sentence1
embeddings['Sentence 2'] = sentence2
embeddings['Embedding of Sentence 1'] = embedding1
embeddings['Embedding of Sentence 2'] = embedding2
embeddings['Cosine similarity'] = cosine_similarity
embeddings['Human score'] = human_score

with open(file_to_save, 'wb') as f:
    pickle.dump(embeddings, f)

<All keys matched successfully>


Extracting embedding and measuring similarity ...


100%|██████████| 5749/5749 [1:43:02<00:00,  1.08s/it]


In [16]:
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,A plane is taking off.,An air plane is taking off.,"[-0.026247814297676086, 0.01684456691145897, -...","[-0.035567473620176315, -0.001996719976887107,...",0.889733,1.0
1,A man is playing a large flute.,A man is playing a flute.,"[0.01073455810546875, -0.054696112871170044, -...","[0.010396339930593967, -0.023768354207277298, ...",0.732201,0.76
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,"[-0.012448686175048351, 0.029658401384949684, ...","[0.00020476203644648194, 0.004532653372734785,...",0.913542,0.76
3,Three men are playing chess.,Two men are playing chess.,"[0.02218366600573063, 0.014727897942066193, -0...","[0.00860250648111105, -0.0020768065005540848, ...",0.727974,0.52
4,A man is playing the cello.,A man seated is playing the cello.,"[0.003227482782676816, -0.0025188112631440163,...","[-0.003892626380547881, -0.007955207489430904,...",0.788158,0.85


In [5]:
cosine_similarity

[0.9338326096981154,
 0.8674470444752889,
 0.8910569082430774,
 0.8371824026107788,
 0.8662249108023918]