In [1]:
from datasets import load_dataset
from google import genai
from tenacity import retry, wait_exponential, stop_after_attempt
import os
import time
import pandas as pd
from scipy.spatial.distance import cosine
import pickle
from tqdm import tqdm 
from google.genai import types

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset: the train split of "tabilab/biosses". Each record holds a pair
# of sentences (sentence1, sentence2) together with a numeric score.
# Alternative dataset, currently disabled:
#dataset = load_dataset("sentence-transformers/stsb", split="train")
dataset = load_dataset("tabilab/biosses", split="train")
# Sanity check: report how many examples were loaded and show the first record.
print(f"Loaded {len(dataset)} examples. Sample:")
print(dataset[0], "\n")


Loaded 100 examples. Sample:
{'sentence1': 'Here, looking for agents that could specifically kill KRAS mutant cells, they found that knockdown of GATA2 was synthetically lethal with KRAS mutation', 'sentence2': 'Not surprisingly, GATA2 knockdown in KRAS mutant cells resulted in a striking reduction of active GTP-bound RHO proteins, including the downstream ROCK kinase', 'score': 2.200000047683716} 



In [3]:
# Set your Google API key (best via environment variable).
# The key is read from GOOGLE_GENERATIVE_AI_API_KEY so it never appears in the notebook.
# NOTE(review): os.getenv returns None when the variable is unset -- confirm how
# genai.Client behaves in that case before relying on this cell in a fresh environment.
client = genai.Client(api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY")) 

In [4]:
# Function to get embedding
@retry(wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
def get_embedding(text, model_name="gemini-embedding-001", dim=768):
    """Request an embedding for ``text`` from the Gemini embedding API.

    The call is retried with exponential backoff (up to 5 attempts) by the
    ``tenacity`` decorator whenever the request raises.

    Args:
        text: The string to embed.
        model_name: Name of the embedding model to query.
        dim: Requested output dimensionality of the embedding.

    Returns:
        The ``embeddings`` attribute of the API response.

    Raises:
        tenacity.RetryError: If every attempt fails.
    """
    try:
        response = client.models.embed_content(
            model=model_name,
            contents=text,
            config=types.EmbedContentConfig(output_dimensionality=dim),
        )
    except Exception as e:
        print(f"Error for text: {text[:30]}... => {e}")
        # Re-raise so that the @retry decorator actually sees the failure.
        # Returning None here (as before) made every call look successful,
        # which silently disabled the retry/backoff logic.
        raise

    return response.embeddings

In [10]:
# Generate embeddings for every sentence pair in the dataset and score each pair
# (do not run without need, save results, costs money)
dimension = 768
model_name = 'gemini-embedding-001'
file_to_save = 'biosses/gemini_text-'+model_name+'_dim_'+str(dimension)+'.pickle'

# Create the output directory up front: failing at the final write would throw
# away every (paid) API call made in the loop below.
output_dir = os.path.dirname(file_to_save)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


def _embed_sentence(sentence):
    """Return the embedding vector of one sentence, or raise if none was produced."""
    result = get_embedding(sentence, model_name, dim=dimension)
    if not result or result[0].values is None:
        raise RuntimeError(f"No embedding returned for text: {sentence[:30]}...")
    # Read the vector through its named attribute rather than relying on the
    # field iteration order of the response object.
    return result[0].values


sentence1 = []
sentence2 = []
embedding1 = []
embedding2 = []
cosine_similarity = []
human_score = []

print('Extracting embedding and measuring similarity ...')
time.sleep(1)
for idx in tqdm(range(len(dataset))):
    example = dataset[idx]
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    emb1 = _embed_sentence(first_sentence)
    emb2 = _embed_sentence(second_sentence)

    sentence1.append(first_sentence)
    sentence2.append(second_sentence)
    embedding1.append(emb1)
    embedding2.append(emb2)

    # scipy's `cosine` is a distance; 1 - distance gives the similarity.
    cosine_similarity.append(1 - cosine(emb1, emb2))
    # Divide the annotated score by 4.0 (presumably rescaling a 0-4 scale to
    # 0-1 so it is comparable with the cosine similarity -- TODO confirm).
    human_score.append(example["score"]/4.0)

    time.sleep(1)  # respect rate limits!

# Build the results table in one step; column order matches the saved format.
embeddings = pd.DataFrame({
    'Sentence 1': sentence1,
    'Sentence 2': sentence2,
    'Embedding of Sentence 1': embedding1,
    'Embedding of Sentence 2': embedding2,
    'Cosine similarity': cosine_similarity,
    'Human score': human_score,
})

with open(file_to_save, 'wb') as f:
    pickle.dump(embeddings, f)

Extracting embedding and measuring similarity ...


100%|██████████| 100/100 [02:20<00:00,  1.40s/it]


In [8]:
# Preview the first rows of the `embeddings` results table.
embeddings.head()

Unnamed: 0,Sentence 1,Sentence 2,Embedding of Sentence 1,Embedding of Sentence 2,Cosine similarity,Human score
0,"Here, looking for agents that could specifical...","Not surprisingly, GATA2 knockdown in KRAS muta...","[0.041973848, 0.028084457, -0.00015977329, -0....","[0.05175928, 0.034314334, -0.008227446, -0.068...",0.835795,0.55
1,MLL-FKBP and MLL-AF9 transformed cells showed ...,Regardless of the mechanism for transcriptiona...,"[-0.004544786, 0.019371057, 0.00059932604, -0....","[-0.008072136, 0.017004538, 0.0044828756, -0.0...",0.873793,0.8
2,The oncogenic activity of mutant Kras appears ...,Oncogenic KRAS mutations are common in cancer.,"[0.011584494, 0.015489457, 0.004773209, -0.067...","[0.03410616, 0.009374501, 0.00034309138, -0.05...",0.747346,0.5
3,Consequently miRNAs have been demonstrated to ...,Given the extensive involvement of miRNA in ph...,"[0.015258154, 0.010334861, -0.0064169425, -0.0...","[0.014204325, -0.0097979475, -0.011153166, -0....",0.765303,0.7
4,We then sought to reassess the regulation of m...,"Importantly, our reassessment revealed that th...","[0.019466532, 0.0012507971, -0.018862067, -0.0...","[0.007889474, -0.019908652, -0.0050982917, -0....",0.817216,0.6
