In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import txtai
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

In [5]:
# Load dataset
#dataset = load_dataset("sentence-transformers/stsb", split="train")
dataset = load_dataset("tabilab/biosses", split="train")
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")

Loaded 100 examples. Sample:
{'sentence1': 'Here, looking for agents that could specifically kill KRAS mutant cells, they found that knockdown of GATA2 was synthetically lethal with KRAS mutation', 'sentence2': 'Not surprisingly, GATA2 knockdown in KRAS mutant cells resulted in a striking reduction of active GTP-bound RHO proteins, including the downstream ROCK kinase', 'score': 2.200000047683716} 



In [6]:
# Generate embeddings for the first few sentence pairs (do not run without need, save results, costs money)
model_name = 'pubmedbert-base-embeddings'#'nomic-embed-text-v1.5'
dims = [768]#[64, 256, 512, 768]

for dim in dims:
    print('Doing dimension ', dim)
    file_to_save = 'biosses/'+model_name+'_'+str(dim)+'.pickle'
    embeddings = pd.DataFrame()
    sentence1 = []
    sentence2 = []
    embedding1 = []
    embedding2 = []
    cosine_similarity = []
    human_score = []

    model = SentenceTransformer("neuml/"+model_name)
    print('Extracting embedding and measuring similarity ...')
    time.sleep(1)
    for idx in tqdm(range(len(dataset))):
        first_sentence = dataset[idx]["sentence1"]
        emb1 = response = list(model.encode(first_sentence))
        sentence1.append(
            first_sentence
        )
        embedding1.append(
            emb1
        )
        #############################
        second_sentence = dataset[idx]["sentence2"]
        emb2 = list(model.encode(second_sentence))
        sentence2.append(
            second_sentence
        )
        embedding2.append(
            emb2
        )
        
        ############################
        cosine_similarity.append(
            1 - cosine(emb1, emb2)
        )
        human_score.append(
            dataset[idx]["score"]/4.0
        )
        
        time.sleep(1)  # respect rate limits!

    embeddings['Sentence 1'] = sentence1
    embeddings['Sentence 2'] = sentence2
    embeddings['Embedding of Sentence 1'] = embedding1
    embeddings['Embedding of Sentence 2'] = embedding2
    embeddings['Cosine similarity'] = cosine_similarity
    embeddings['Human score'] = human_score

    with open(file_to_save, 'wb') as f:
        pickle.dump(embeddings, f)

Doing dimension  768
Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [01:51<00:00,  1.11s/it]


In [7]:
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,"Here, looking for agents that could specifical...","Not surprisingly, GATA2 knockdown in KRAS muta...","[-0.36339355, 0.02793861, 0.2921029, -0.080254...","[-0.3903387, 0.40554783, 0.14837721, -0.737586...",0.580357,0.55
1,MLL-FKBP and MLL-AF9 transformed cells showed ...,Regardless of the mechanism for transcriptiona...,"[0.12080047, 0.7514093, -0.32931244, 0.3409620...","[0.14736821, 0.77914613, 0.13533372, 0.0014942...",0.83616,0.8
2,The oncogenic activity of mutant Kras appears ...,Oncogenic KRAS mutations are common in cancer.,"[-0.4348381, -0.09014715, 0.19112898, -0.49777...","[-0.044404767, -0.08884077, 0.5306239, -0.6016...",0.61283,0.5
3,Consequently miRNAs have been demonstrated to ...,Given the extensive involvement of miRNA in ph...,"[-0.4525161, -0.233401, -0.25025502, 0.2557852...","[-0.28733408, -0.6672212, 0.18202859, -0.12695...",0.709452,0.7
4,We then sought to reassess the regulation of m...,"Importantly, our reassessment revealed that th...","[-0.12415402, -0.06532924, -0.039601725, -0.13...","[-0.37341017, -0.33027577, -0.3874684, 0.03357...",0.415175,0.6
