In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
#dataset = load_dataset("sentence-transformers/stsb", split="train")
dataset = load_dataset("tabilab/biosses", split="train")
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")

Loaded 100 examples. Sample:
{'sentence1': 'Here, looking for agents that could specifically kill KRAS mutant cells, they found that knockdown of GATA2 was synthetically lethal with KRAS mutation', 'sentence2': 'Not surprisingly, GATA2 knockdown in KRAS mutant cells resulted in a striking reduction of active GTP-bound RHO proteins, including the downstream ROCK kinase', 'score': 2.200000047683716} 



In [3]:
# Generate embeddings for the first few sentence pairs (do not run without need, save results, costs money)
model_name = 'nomic-embed-text-v1.5'
dims = [64, 256, 512, 768]

for dim in dims:
    print('Doing dimension ', dim)
    file_to_save = 'biosses/nomic_text-'+model_name+'_'+str(dim)+'.pickle'
    embeddings = pd.DataFrame()
    sentence1 = []
    sentence2 = []
    embedding1 = []
    embedding2 = []
    cosine_similarity = []
    human_score = []

    model = SentenceTransformer("nomic-ai/"+model_name, trust_remote_code=True, truncate_dim=64)

    print('Extracting embedding and measuring similarity ...')
    time.sleep(1)
    for idx in tqdm(range(len(dataset))):
        first_sentence = dataset[idx]["sentence1"]
        emb1 = response = list(model.encode(first_sentence))
        sentence1.append(
            first_sentence
        )
        embedding1.append(
            emb1
        )
        #############################
        second_sentence = dataset[idx]["sentence2"]
        emb2 = list(model.encode(second_sentence))
        sentence2.append(
            second_sentence
        )
        embedding2.append(
            emb2
        )
        
        ############################
        cosine_similarity.append(
            1 - cosine(emb1, emb2)
        )
        human_score.append(
            dataset[idx]["score"]/4.0
        )
        
        time.sleep(1)  # respect rate limits!

    embeddings['Sentence 1'] = sentence1
    embeddings['Sentence 2'] = sentence2
    embeddings['Embedding of Sentence 1'] = embedding1
    embeddings['Embedding of Sentence 2'] = embedding2
    embeddings['Cosine similarity'] = cosine_similarity
    embeddings['Human score'] = human_score

    with open(file_to_save, 'wb') as f:
        pickle.dump(embeddings, f)

Doing dimension  64


<All keys matched successfully>


Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [01:54<00:00,  1.14s/it]


Doing dimension  256


<All keys matched successfully>


Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [01:47<00:00,  1.08s/it]


Doing dimension  512


<All keys matched successfully>


Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [01:47<00:00,  1.08s/it]


Doing dimension  768


<All keys matched successfully>


Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [01:47<00:00,  1.08s/it]


In [4]:
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,"Here, looking for agents that could specifical...","Not surprisingly, GATA2 knockdown in KRAS muta...","[0.37600115, -0.16444215, -3.428607, 1.0291133...","[0.41314304, -0.003164128, -2.8305767, 0.55561...",0.858913,0.55
1,MLL-FKBP and MLL-AF9 transformed cells showed ...,Regardless of the mechanism for transcriptiona...,"[0.3319264, 0.1859387, -3.2266126, 0.57614785,...","[-0.12157992, 0.700806, -3.0858023, 0.83130765...",0.914254,0.8
2,The oncogenic activity of mutant Kras appears ...,Oncogenic KRAS mutations are common in cancer.,"[0.1695996, -0.41325438, -3.1154187, 0.7725454...","[0.6242161, -0.060434245, -3.234395, 0.7518816...",0.871847,0.5
3,Consequently miRNAs have been demonstrated to ...,Given the extensive involvement of miRNA in ph...,"[-0.27360806, 0.0042789746, -3.075625, -0.1715...","[0.36149248, 0.41634881, -2.6628473, 0.4822917...",0.904348,0.7
4,We then sought to reassess the regulation of m...,"Importantly, our reassessment revealed that th...","[-0.5231466, 0.36688644, -3.3939989, -0.380803...","[-0.4272349, 0.38933787, -2.6761675, -0.285906...",0.834761,0.6
