In [1]:
from datasets import load_dataset
import openai
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the train split of the BIOSSES sentence-pair dataset from the Hugging Face hub.
# The STS-B dataset below is an alternative source that is currently disabled.
#dataset = load_dataset("sentence-transformers/stsb", split="train")
dataset = load_dataset("tabilab/biosses", split="train")

# Sanity check: report the number of examples and show the first record.
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], end=" \n\n")


Generating train split: 100%|██████████| 100/100 [00:00<00:00, 10798.93 examples/s]

Loaded 100 examples. Sample:
{'sentence1': 'Here, looking for agents that could specifically kill KRAS mutant cells, they found that knockdown of GATA2 was synthetically lethal with KRAS mutation', 'sentence2': 'Not surprisingly, GATA2 knockdown in KRAS mutant cells resulted in a striking reduction of active GTP-bound RHO proteins, including the downstream ROCK kinase', 'score': 2.200000047683716} 






In [6]:
# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret is never written into the notebook; the value is None when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [11]:
# Embedding helper
def get_embedding(text, model="text-embedding-3-large"):
    """Request an embedding for a single piece of text.

    Args:
        text: The text to embed; it is sent as a one-element ``input`` list.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first entry in the response's ``data``.
    """
    request_args = {"model": model, "input": [text]}
    result = openai.embeddings.create(**request_args)
    # Exactly one input was sent, so the vector of interest is the first entry.
    return result.data[0].embedding

In [12]:
# Embed BOTH sentences of EVERY pair in the dataset, compute their cosine
# similarity, and pickle the results next to the normalised human score.
# Do not run without need: every run issues two paid API calls per pair.
file_to_save = 'biosses/openai_text-embedding-3-large.pickle'

# Create the output directory up front, so a missing folder is discovered
# before any paid API call is made rather than when the pickle is written
# at the very end (which would discard all collected embeddings).
os.makedirs(os.path.dirname(file_to_save) or '.', exist_ok=True)

# Per-pair accumulators; they become the columns of `embeddings` below.
sentence1 = []
sentence2 = []
embedding1 = []
embedding2 = []
cosine_similarity = []
human_score = []

print('Extracting embedding and measuring similarity ...')
time.sleep(1)  # presumably lets the message appear before the progress bar starts
for idx in tqdm(range(len(dataset))):
    example = dataset[idx]  # fetch the row once instead of re-indexing per field

    first_sentence = example["sentence1"]
    emb1 = get_embedding(first_sentence)
    sentence1.append(first_sentence)
    embedding1.append(emb1)

    second_sentence = example["sentence2"]
    emb2 = get_embedding(second_sentence)
    sentence2.append(second_sentence)
    embedding2.append(emb2)

    # scipy's `cosine` is a distance, so similarity is its complement.
    cosine_similarity.append(1 - cosine(emb1, emb2))
    # Rescale the human rating by 4.0 -- assumes scores lie on a 0-4 scale; TODO confirm.
    human_score.append(example["score"] / 4.0)

    time.sleep(1)  # respect rate limits!

# Build the frame in a single step; dict insertion order fixes the column order.
embeddings = pd.DataFrame({
    'Sentence 1': sentence1,
    'Sentence 2': sentence2,
    'Embedding of Sentence 1': embedding1,
    'Embedding of Sentence 2': embedding2,
    'Cosine similarity': cosine_similarity,
    'Human score': human_score,
})

with open(file_to_save, 'wb') as f:
    pickle.dump(embeddings, f)

Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [03:28<00:00,  2.08s/it]


In [13]:
# Preview the first five rows of the results table.
embeddings.head(n=5)

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,"Here, looking for agents that could specifical...","Not surprisingly, GATA2 knockdown in KRAS muta...","[-0.013234409503638744, -0.006376287434250116,...","[0.014252624474465847, 0.013045219704508781, -...",0.686539,0.55
1,MLL-FKBP and MLL-AF9 transformed cells showed ...,Regardless of the mechanism for transcriptiona...,"[0.025199266150593758, 0.04252807796001434, -0...","[0.029102345928549767, 0.05190785229206085, -0...",0.749075,0.8
2,The oncogenic activity of mutant Kras appears ...,Oncogenic KRAS mutations are common in cancer.,"[-0.01700298674404621, 0.0015223498921841383, ...","[-0.028718745335936546, 0.00742156570777297, -...",0.595304,0.5
3,Consequently miRNAs have been demonstrated to ...,Given the extensive involvement of miRNA in ph...,"[-0.008369757793843746, 0.031594742089509964, ...","[-0.01659364625811577, 0.02505677565932274, -0...",0.647595,0.7
4,We then sought to reassess the regulation of m...,"Importantly, our reassessment revealed that th...","[-0.000885025248862803, 0.01873880811035633, -...","[0.015441286377608776, 0.033518191426992416, -...",0.6545,0.6
