In [22]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel,pipeline
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
from huggingface_hub import login

hf_token = "<your_hf_secret_code>"
login(token=hf_token, add_to_git_credential=True)

In [13]:
# Load your molecular data into a DataFrame
df = pd.read_csv('data/final-herg-split.csv')

In [14]:
df.head()

Unnamed: 0,id,std_smiles,classes,train_test_split,cv_fold
0,CHEMBL240,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,1,0,9
1,CHEMBL240,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,1,0,8
2,CHEMBL240,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,1,0,0
3,CHEMBL240,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,1,0,5
4,CHEMBL240,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,1,0,0


In [6]:

# Load LLaMA-3 tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')  # Replace with the correct model ID
model = AutoModel.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')

In [7]:
## HF Pipeline for feature extraction
pipe =pipeline('feature-extraction',model= model,tokenizer = tokenizer,device_map= 'auto',device = 'cuda')

Both `device` and `device_map` are specified. `device` will override `device_map`. You will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`.


In [17]:
# Function to compute embeddings for a single SMILES string
def compute_embedding(smiles):
    embeddings = pipe(smiles,return_tensors = 'pt').mean(dim=1).detach().cpu().numpy()[0]
    return embeddings

# Compute embeddings for all SMILES strings in the DataFrame
df['Embeddings'] = df['std_smiles'].progress_apply(lambda x: compute_embedding(x))


  0%|          | 0/20409 [00:00<?, ?it/s]

In [18]:
df.head()

Unnamed: 0,id,std_smiles,classes,train_test_split,cv_fold,Embeddings
0,CHEMBL240,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,1,0,9,"[-0.29996324, -0.7903553, 0.17217371, -0.41181..."
1,CHEMBL240,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,1,0,8,"[-0.5740264, -0.87309414, -0.08635505, -0.5592..."
2,CHEMBL240,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,1,0,0,"[0.4522462, -0.8792749, -0.060670193, -0.98133..."
3,CHEMBL240,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,1,0,5,"[-0.99224347, -0.3035103, 0.37104177, -1.04528..."
4,CHEMBL240,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,1,0,0,"[-0.87651235, -0.72876805, -0.82546175, -1.452..."


In [19]:
df.to_parquet('data/uniherg_db-deepseek-qwen1_5b-embedding.parquet',index = False)

In [20]:
#External Test-1: https://github.com/Abdulk084/CardioTox/blob/master/data/external_test_set_pos.csv
ext_pos_df = pd.read_csv(f"data//external_test_set_pos.csv")
# External Test h70, h60 dataset: https://github.com/issararab/CToxPred/tree/main/data/raw/hERG
ext_h60_df = pd.read_csv(f"data/eval_set_herg_60.csv")
ext_h70_df = pd.read_csv(f"data/eval_set_herg_70.csv")


In [26]:
# ext_h70_df['emb'] = ext_h70_df['SMILES'].progress_apply(compute_embedding)
ext_h70_df.to_parquet(os.path.join('data','h70-uniherg_db-deepseek-qwen1_5b-embedding.parquet'),index = False)
ext_pos_df['emb'] = ext_pos_df['smiles'].progress_apply(compute_embedding)
ext_pos_df.to_parquet(os.path.join('data','pos-uniherg_db-deepseek-qwen1_5b-embedding.parquet'),index = False)

  0%|          | 0/44 [00:00<?, ?it/s]

In [27]:
ext_h60_df['emb'] = ext_h60_df['SMILES'].progress_apply(compute_embedding)
ext_pos_df.to_parquet(os.path.join('data','h60-uniherg_db-deepseek-qwen1_5b-embedding.parquet'),index = False)

  0%|          | 0/250 [00:00<?, ?it/s]