In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import txtai
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
dataset = load_dataset("sentence-transformers/stsb", split="train")
#dataset = load_dataset("tabilab/biosses", split="train")
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")

Loaded 5749 examples. Sample:
{'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.', 'score': 1.0} 



In [3]:
# Generate embeddings for the first few sentence pairs (do not run without need, save results, costs money)
model_name = 'pubmedbert-base-embeddings'#'nomic-embed-text-v1.5'
dims = [768]#[64, 256, 512, 768]

for dim in dims:
    print('Doing dimension ', dim)
    file_to_save = 'stsb/'+model_name+'_'+str(dim)+'.pickle'
    embeddings = pd.DataFrame()
    sentence1 = []
    sentence2 = []
    embedding1 = []
    embedding2 = []
    cosine_similarity = []
    human_score = []

    model = SentenceTransformer("neuml/"+model_name)
    print('Extracting embedding and measuring similarity ...')
    time.sleep(1)
    for idx in tqdm(range(len(dataset))):
        first_sentence = dataset[idx]["sentence1"]
        emb1 = response = list(model.encode(first_sentence))
        sentence1.append(
            first_sentence
        )
        embedding1.append(
            emb1
        )
        #############################
        second_sentence = dataset[idx]["sentence2"]
        emb2 = list(model.encode(second_sentence))
        sentence2.append(
            second_sentence
        )
        embedding2.append(
            emb2
        )
        
        ############################
        cosine_similarity.append(
            1 - cosine(emb1, emb2)
        )
        human_score.append(
            dataset[idx]["score"]/4.0
        )
        
        time.sleep(1)  # respect rate limits!

    embeddings['Sentence 1'] = sentence1
    embeddings['Sentence 2'] = sentence2
    embeddings['Embedding of Sentence 1'] = embedding1
    embeddings['Embedding of Sentence 2'] = embedding2
    embeddings['Cosine similarity'] = cosine_similarity
    embeddings['Human score'] = human_score

    with open(file_to_save, 'wb') as f:
        pickle.dump(embeddings, f)

Doing dimension  768
Extracting embedding and measuring similarity ...


100%|██████████| 5749/5749 [1:39:56<00:00,  1.04s/it]


In [4]:
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,A plane is taking off.,An air plane is taking off.,"[-0.3398199, 0.92987967, -0.40230492, -0.60855...","[-0.5526142, -0.004194508, -0.40110624, -0.585...",0.87365,0.25
1,A man is playing a large flute.,A man is playing a flute.,"[-0.21034254, 0.77234876, -0.3687168, 1.133538...","[-0.08574683, 0.63033944, -0.5184751, 0.835966...",0.962215,0.19
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,"[-0.16809179, -0.14355482, -0.037588455, 0.317...","[-0.3554399, -0.1295483, -0.04689787, 0.079731...",0.919897,0.19
3,Three men are playing chess.,Two men are playing chess.,"[0.22388993, 0.5177767, -0.014132596, -0.43374...","[0.29313192, 0.41708857, -0.0053667277, -0.429...",0.950179,0.13
4,A man is playing the cello.,A man seated is playing the cello.,"[0.68852305, 0.64783514, 0.005147201, 0.838880...","[0.55768585, 0.79276526, -0.15436953, 0.429810...",0.889764,0.2125
