In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
dataset = load_dataset("sentence-transformers/stsb", split="train")
#dataset = load_dataset("tabilab/biosses", split="train")
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")

Loaded 5749 examples. Sample:
{'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.', 'score': 1.0} 



In [3]:
# Generate embeddings for the first few sentence pairs (do not run without need, save results, costs money)
model_name = 'nomic-embed-text-v2-moe'#'nomic-embed-text-v1.5'
dims = [768]#[64, 256, 512, 768]

for dim in dims:
    print('Doing dimension ', dim)
    file_to_save = 'stsb/nomic_text-'+model_name+'_'+str(dim)+'.pickle'
    embeddings = pd.DataFrame()
    sentence1 = []
    sentence2 = []
    embedding1 = []
    embedding2 = []
    cosine_similarity = []
    human_score = []

    model = SentenceTransformer("nomic-ai/"+model_name, trust_remote_code=True, truncate_dim=64)

    print('Extracting embedding and measuring similarity ...')
    time.sleep(1)
    for idx in tqdm(range(len(dataset))):
        first_sentence = dataset[idx]["sentence1"]
        emb1 = response = list(model.encode(first_sentence))
        sentence1.append(
            first_sentence
        )
        embedding1.append(
            emb1
        )
        #############################
        second_sentence = dataset[idx]["sentence2"]
        emb2 = list(model.encode(second_sentence))
        sentence2.append(
            second_sentence
        )
        embedding2.append(
            emb2
        )
        
        ############################
        cosine_similarity.append(
            1 - cosine(emb1, emb2)
        )
        human_score.append(
            dataset[idx]["score"]#/4.0 #comment out this depending on the data
        )
        
        time.sleep(1)  # respect rate limits!

    embeddings['Sentence 1'] = sentence1
    embeddings['Sentence 2'] = sentence2
    embeddings['Embedding of Sentence 1'] = embedding1
    embeddings['Embedding of Sentence 2'] = embedding2
    embeddings['Cosine similarity'] = cosine_similarity
    embeddings['Human score'] = human_score

    with open(file_to_save, 'wb') as f:
        pickle.dump(embeddings, f)

Doing dimension  768




Extracting embedding and measuring similarity ...


100%|██████████| 5749/5749 [2:06:20<00:00,  1.32s/it]  


In [4]:
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,A plane is taking off.,An air plane is taking off.,"[0.015153886, 0.041404974, -0.014125247, 0.016...","[0.010745221, 0.05729635, -0.016948922, 0.0269...",0.966485,1.0
1,A man is playing a large flute.,A man is playing a flute.,"[0.0024366875, 0.032083724, 0.018080892, -0.03...","[0.014941305, 0.031600654, -0.010353177, -0.03...",0.919278,0.76
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,"[0.03733847, 0.00078736147, 0.0005427449, 0.03...","[0.0301026, 0.0054820334, -0.009974424, 0.0288...",0.951363,0.76
3,Three men are playing chess.,Two men are playing chess.,"[0.06387799, -0.010154583, 0.000906593, 0.0075...","[0.082604, -0.025732871, -0.0040285327, -0.009...",0.931828,0.52
4,A man is playing the cello.,A man seated is playing the cello.,"[0.01687711, -0.008224646, 0.0040950323, -0.00...","[0.027671488, 0.012839296, 0.017273515, 0.0058...",0.93824,0.85
