Gemini Embedding Ref: https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/gemini-api/tutorials/text_classifier_embeddings.ipynb#scrollTo=Tce3stUlHN0L

In [1]:
# Install the Gemini SDK. `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -U -q google-generativeai

In [2]:
import os
# Choose the project directories: relative paths for a local run, Google Drive when the
# working directory is Colab's default '/content'.
if os.getcwd() != '/content':
    root_path = '../'
    data_path = "../Dataset/"

else:
    from google.colab import drive
    # Mount Drive first so the project folder below becomes reachable.
    drive.mount('/content/drive')
    root_path = '/content/drive/MyDrive/Project/AI and Cardiology/Cardiotoxicity'
    data_path = os.path.join(root_path, 'Dataset')


Mounted at /content/drive


In [3]:
import re
import tqdm
import numpy as np
import pandas as pd

import google.generativeai as genai

# Used to securely store your API key
from google.colab import userdata

### Grab an API Key

Before you can use the Gemini API, you must first obtain an API key. If you don't already have one, create a key with one click in Google AI Studio.

<a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Get an API key</a>

In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `API_KEY`.

Once you have the API key, pass it to the SDK. You can do this in two ways:

* Put the key in the `GOOGLE_API_KEY` environment variable (the SDK will automatically pick it up from there).
* Pass the key to `genai.configure(api_key=...)`

In [4]:
# Fetch the key from the Colab secrets manager (secret name `API_KEY`) instead of
# hardcoding it: a key embedded in a notebook is exposed to anyone who can read the file.
# Any key that was previously committed in this cell should be revoked and regenerated.
# Or use `os.getenv('API_KEY')` to fetch an environment variable.
genai.configure(api_key=userdata.get('API_KEY'))

Key Point: Next, you will choose a model. Any embedding model will work for this tutorial, but for real applications it's important to choose a specific model and stick with it. The outputs of different models are not compatible with each other.

In [5]:
# Print the name of every available model that supports the `embedContent` method.
for m in genai.list_models():
  if 'embedContent' not in m.supported_generation_methods:
    continue
  print(m.name)

models/embedding-001
models/text-embedding-004


In [6]:
# NOTE: this rebinds the name `tqdm` (imported as a module in an earlier cell) to the
# progress-bar object exported by `tqdm.auto`.
from tqdm.auto import tqdm
# Enable tqdm's pandas integration; `progress_apply` is used below to show progress while embedding.
tqdm.pandas()

from google.api_core import retry

def make_embed_text_fn(model):
  """Build a one-argument embedding function bound to ``model``.

  The returned callable passes a single text to ``genai.embed_content`` with
  ``task_type="classification"`` and returns the ``'embedding'`` entry of the
  response. The callable is wrapped with ``retry.Retry(timeout=300.0)``.
  """
  retry_policy = retry.Retry(timeout=300.0)

  def embed_fn(text: str) -> list[float]:
    # Ask for an embedding with the CLASSIFICATION task type.
    response = genai.embed_content(
        model=model,
        content=text,
        task_type="classification",
    )
    return response['embedding']

  # Applying the Retry object directly is equivalent to using it as a decorator.
  return retry_policy(embed_fn)

def create_embeddings(model, df, text_col='std_smiles', out_col='Embeddings'):
  """Embed one text column of ``df`` and store the result in a new column.

  Args:
    model: Embedding model name passed on to ``make_embed_text_fn``.
    df: DataFrame holding the texts; it is modified in place.
    text_col: Column whose values are embedded. Defaults to ``'std_smiles'``.
    out_col: Column that receives the embeddings. Defaults to ``'Embeddings'``.

  Returns:
    The same ``df`` object, now carrying ``out_col``.
  """
  # Build the embedding function once, then apply it row by row with a progress bar.
  embed_fn = make_embed_text_fn(model)
  df[out_col] = df[text_col].progress_apply(embed_fn)
  return df

In [7]:
# Load the hERG split table from the UniChemDB data folder and display it.
herg_csv_path = os.path.join(data_path, "UniChemDB-Data", "final-herg-split.csv")
df = pd.read_csv(herg_csv_path)
df

Unnamed: 0,id,std_smiles,classes,train_test_split,cv_fold
0,CHEMBL240,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,1,0,9
1,CHEMBL240,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,1,0,8
2,CHEMBL240,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,1,0,0
3,CHEMBL240,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,1,0,5
4,CHEMBL240,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,1,0,0
...,...,...,...,...,...
20404,C(CCO)CC(C(=O)O)N,N[C@H](CCCCO)C(=O)O,0,1,8
20405,RISPERIDONE,CC1=NC2=[N+](CCCC2)C(=O)C1CC[NH+]1CCC(c2noc3cc...,1,0,0
20406,,NC(=O)NOC[C@H]1NC(=O)NC1=O,0,0,5
20407,lumefantrine,CCCC[NH+](CCCC)CC(O)c1cc(Cl)cc2c1-c1ccc(Cl)cc1...,1,1,7


In [10]:
# Embedding model used for every dataset in this notebook (one of the names listed above).
model = 'models/embedding-001'
# Adds the 'Embeddings' column to `df` by embedding the 'std_smiles' value of each row.
# `create_embeddings` modifies `df` in place and returns the same object.
df = create_embeddings(model, df)

  0%|          | 0/20409 [00:00<?, ?it/s]

In [11]:
# Display the frame again to check the newly added 'Embeddings' column.
df

Unnamed: 0,id,std_smiles,classes,train_test_split,cv_fold,Embeddings
0,CHEMBL240,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,1,0,9,"[-0.009118106, -0.01473874, -0.02095197, -0.02..."
1,CHEMBL240,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,1,0,8,"[-0.016289616, -0.0039640917, -0.025137946, -0..."
2,CHEMBL240,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,1,0,0,"[-0.025030453, -0.005247756, -0.041259497, -0...."
3,CHEMBL240,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,1,0,5,"[-0.004453162, -0.02507347, -0.040514078, -0.0..."
4,CHEMBL240,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,1,0,0,"[-0.010023608, -0.004040814, -0.03360485, -0.0..."
...,...,...,...,...,...,...
20404,C(CCO)CC(C(=O)O)N,N[C@H](CCCCO)C(=O)O,0,1,8,"[0.013100136, -0.008221838, -0.06676493, -0.03..."
20405,RISPERIDONE,CC1=NC2=[N+](CCCC2)C(=O)C1CC[NH+]1CCC(c2noc3cc...,1,0,0,"[-0.014204543, -0.011475447, -0.03163016, -0.0..."
20406,,NC(=O)NOC[C@H]1NC(=O)NC1=O,0,0,5,"[-0.019909335, -0.016860656, -0.061723907, -0...."
20407,lumefantrine,CCCC[NH+](CCCC)CC(O)c1cc(Cl)cc2c1-c1ccc(Cl)cc1...,1,1,7,"[-0.02103665, 0.011718793, -0.028218094, -0.02..."


In [12]:
# Persist the embedded hERG table as parquet, without the index.
herg_parquet_path = os.path.join(data_path, 'UniChemDB-Data', 'herg-gemini-embedding.parquet')
df.to_parquet(herg_parquet_path, index=False)

In [13]:
#External Test-1: https://github.com/Abdulk084/CardioTox/blob/master/data/external_test_set_pos.csv
# Build the path with os.path.join, as the other cells do, so a trailing slash in
# `data_path` does not produce a doubled separator.
ext_pos_df = pd.read_csv(os.path.join(data_path, "External-Data", "external_test_set_pos.csv"))

In [14]:
# External Test h70, h60 dataset: https://github.com/issararab/CToxPred/tree/main/data/raw/hERG
# Build the paths with os.path.join, as the other cells do, so a trailing slash in
# `data_path` does not produce a doubled separator.
ext_h60_df = pd.read_csv(os.path.join(data_path, "External-Data", "eval_set_herg_60.csv"))
ext_h70_df = pd.read_csv(os.path.join(data_path, "External-Data", "eval_set_herg_70.csv"))


In [15]:
# Preview the first rows of the h60 set; its 'SMILES' column is what gets embedded below.
ext_h60_df.head()

Unnamed: 0,InChl Key,SMILES,Source,pIC50
0,LIHJHFVXLZSRNK-UHFFFAOYSA-N,Cn1ccc(C[N+]2=CC(c3cccc(C(F)(F)F)c3)C=N2)n1,US Patent,5.647817
1,RXGDDWPITVSKDR-UHFFFAOYSA-N,CC(C)(C)OC(=O)N1CCN(c2nc3c([N+](=O)[O-])c(Br)c...,US Patent,5.60206
2,YRSBMPKJFDYYFO-UHFFFAOYSA-N,Fc1cccc(Oc2cc(C(F)(F)F)nc(N3CCc4nc[nH]c4C3)n2)c1,US Patent,5.59998
3,OMQQLDITRYIEHZ-UHFFFAOYSA-N,Cn1nccc1Cc1cn(-c2ccc(F)c(Cl)c2)nn1,US Patent,5.364516
4,BXBUTKPGTGJGTQ-YOEHRIQHSA-N,CNC[C@@H](c1ccc(Cl)c(Cl)c1)[C@@H](OC)c1cccc(NS...,US Patent,5.327902


In [16]:
# Embed each external set into an 'emb' column and write it to its own parquet file.
# Each entry is (frame, name of its SMILES column, output file name).
for ext_frame, smiles_col, parquet_name in (
    (ext_h70_df, 'SMILES', 'h70-uniherg_db-gemini-embedding.parquet'),
    (ext_pos_df, 'smiles', 'pos-uniherg_db-gemini-embedding.parquet'),
):
    ext_frame['emb'] = ext_frame[smiles_col].progress_apply(make_embed_text_fn(model))
    ext_frame.to_parquet(os.path.join(data_path, 'External-Data', parquet_name), index = False)

  0%|          | 0/473 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

In [17]:
# Embed the h60 evaluation set into an 'emb' column.
ext_h60_df['emb'] = ext_h60_df['SMILES'].progress_apply(make_embed_text_fn(model))
# Save ext_h60_df itself: writing ext_pos_df here would put the wrong dataset in the
# h60 file and leave the h60 embeddings unsaved.
ext_h60_df.to_parquet(os.path.join(data_path,'External-Data','h60-uniherg_db-gemini-embedding.parquet'),index = False)

  0%|          | 0/250 [00:00<?, ?it/s]