In [3]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification 

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
file_paths = {
    "clintox_test": "/home/raghvendra2/Molformer_Finetuning/clintox_test.csv",
    "clintox_eval": "/home/raghvendra2/Molformer_Finetuning/clintox_valid.csv",
    "clintox_train": "/home/raghvendra2/Molformer_Finetuning/clintox_train.csv"
}

In [4]:
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}


In [5]:
# Model configurations
models_info = {
    "ChemBERTa_Base": {
        "tokenizer": "seyonec/SMILES_tokenized_PubChem_shard00_160k",
        "model": "seyonec/SMILES_tokenized_PubChem_shard00_160k"
    },
    "MolFormer_Base": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "ibm-research/MoLFormer-XL-both-10pct"
    },
    "MolFormer_Finetuned": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_Molformer_model",
        "num_labels": 2
    },
    "ChemBERTa_77M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_77m_mtr_model",
        "num_labels": 2
    },
    "ChemBERTa_10M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_10m_mtr_model",
        "num_labels": 2
    },
    "ChemBERTa_77M_MLM": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MLM",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_77m_mlm_model",
        "num_labels": 2
    },
    "ChemBERTa_10M_MLM": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MLM",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_10m_mlm_model",
        "num_labels": 2
    },
    "ChemBERTa_5M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-5M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_5m_mtr_model",
        "num_labels": 2
    }
}


In [6]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [7]:
def load_model(model_info):
    """Load model and tokenizer. Choose appropriate class based on whether it's a classification model."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_info["tokenizer"], trust_remote_code=True
        )

        if "num_labels" in model_info:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_info["model"],
                num_labels=model_info["num_labels"],
                trust_remote_code=True
            )
        else:
            model = AutoModel.from_pretrained(
                model_info["model"],
                trust_remote_code=True
            )

        model.to(device)
        model.eval()
        print(f"Loaded model: {model_info['model']}")
        return tokenizer, model

    except Exception as e:
        print(f"Error loading {model_info['model']}: {e}")
        return None, None


In [8]:
def get_embeddings(smiles_list, tokenizer, model):
    embeddings = []
    
    for smiles in smiles_list:
        tokens = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        tokens = {k: v.to(device) for k, v in tokens.items()}

        with torch.no_grad():
            # Extract embeddings from base model
            if hasattr(model, "roberta"):
                output = model.roberta(**tokens).last_hidden_state.mean(dim=1)
            elif hasattr(model, "bert"):
                output = model.bert(**tokens).last_hidden_state.mean(dim=1)
            elif hasattr(model, "base_model"):
                output = model.base_model(**tokens).last_hidden_state.mean(dim=1)
            else:
                raise ValueError("Unknown model architecture!")
                
        embeddings.append(output.squeeze().cpu().numpy())

    return np.array(embeddings)

In [9]:
output_dir = "/home/raghvendra2/Molformer_Finetuning/clintox_output_embeddings/all_models"
os.makedirs(output_dir, exist_ok=True)

In [None]:
for dataset_name, df in datasets.items():
    print(f"Processing {dataset_name}...")

    for model_name, model_info in models_info.items():
        print(f"Extracting embeddings using {model_name}...")

        tokenizer, model = load_model(model_info)
        if not tokenizer or not model:
            continue 

        embeddings = get_embeddings(df["smiles"].tolist(), tokenizer, model)

        df[model_name + "_embeddings"] = [",".join(map(str, emb)) for emb in embeddings]

    output_path = os.path.join(output_dir, f"{dataset_name}_embed.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing clintox_test...
Extracting embeddings using ChemBERTa_Base...


2025-04-11 08:26:13.953787: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-11 08:26:13.973306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744352773.998143 2002980 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744352774.005778 2002980 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744352774.024258 2002980 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...
Loaded model: ibm-research/MoLFormer-XL-both-10pct
Extracting embeddings using MolFormer_Finetuned...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_Molformer_model
Extracting embeddings using ChemBERTa_77M_MTR...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_77m_mtr_model
Extracting embeddings using ChemBERTa_10M_MTR...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_10m_mtr_model
Extracting embeddings using ChemBERTa_77M_MLM...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_77m_mlm_model
Extracting embeddings using ChemBERTa_10M_MLM...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_10m_mlm_model
Extracting embeddings using ChemBERTa_5M_MTR...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Clintox_Final_chemberta_5m_mtr_model
Saved: /hom

In [1]:
import pandas as pd

file_path = "/home/raghvendra2/Molformer_Finetuning/clintox_output_embeddings/all_models/clintox_train_embed.csv"
df = pd.read_csv(file_path, nrows=5)

print("Column names:")
print(df.columns.tolist())


Column names:
['smiles', 'FDA_APPROVED', 'CT_TOX', 'ChemBERTa_Base_embeddings', 'MolFormer_Base_embeddings', 'MolFormer_Finetuned_embeddings', 'ChemBERTa_77M_MTR_embeddings', 'ChemBERTa_10M_MTR_embeddings', 'ChemBERTa_77M_MLM_embeddings', 'ChemBERTa_10M_MLM_embeddings', 'ChemBERTa_5M_MTR_embeddings']
