# ALL MODELS

In [1]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
file_paths = {
    "fart_test": ".../flavor_datasets/fart_test.csv",
    "fart_eval": ".../flavor_datasets/fart_val.csv",
    "fart_train": ".../flavor_datasets/fart_train.csv"
}

In [12]:
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}


In [None]:
# Model configurations of additional ChemBERTa final models 
models_info = {
    
    "MolFormer_Base": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "ibm-research/MoLFormer-XL-both-10pct",
        "num_labels": 5
    },
    "Molformer_Finetuned_WL": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": ".../weighted_loss/Molformer_finetuned_WL",
        "num_labels": 5
    },
    "MolFormer_Finetuned_FL": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": ".../Focal_loss_models/Molformer_finetuned_FL",
        "num_labels": 5
    },
    "ChemBERTa_10M_MTR_WL": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MTR",
        "model": ".../weighted_loss/Chemberta_10M_MTR_WL",
        "num_labels": 5
    },
    "ChemBERTa_10M_MTR_FL": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MTR",
        "model": ".../Focal_loss_models/chemberta_10M_MTR_FL",
        "num_labels": 5
    },
    "ChemBERTa_77M_MLM_WL": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MLM",
        "model": ".../weighted_loss/Chemberta_77M_MLM_WL",
        "num_labels": 5
    },
    "ChemBERTa_77M_MLM_FL": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MLM",
        "model": ".../Focal_loss_models/chemberta_77M_MLM_FL",
        "num_labels": 5
    },
    "ChemBERTa_10M_MLM_WL": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MLM",
        "model": ".../weighted_loss/Chemberta_10M_MLM_WL",
        "num_labels": 5
    },
    "ChemBERTa_10M_MLM_FL": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MLM",
        "model": ".../Focal_loss_models/chemberta_10M_MLM_FL",
        "num_labels": 5
    },
    "ChemBERTa_5M_MTR_FL": {
        "tokenizer": "DeepChem/ChemBERTa-5M-MTR",
        "model": ".../Focal_loss_models/chemberta_5M_MTR_FL",
        "num_labels": 5
    },
    "ChemBERTa_5M_MTR_WL": {
        "tokenizer": "DeepChem/ChemBERTa-5M-MTR",
        "model": ".../weighted_loss/Chemberta_5M_MTR_WL",
        "num_labels": 5
    },
    "ChemBERTa_77M_MTR_WL": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": ".../weighted_loss/Chemberta_77M_MTR_WL",
        "num_labels": 5
    },
    "ChemBERTa_77M_MTR_WL": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": ".../weighted_loss/Chemberta_77M_MTR_WL",
        "num_labels": 5
    },
     "ChemBERTa_77M_MTR_FL": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": ".../Focal_loss_models/chemberta_77M_MTR_FL",
        "num_labels": 5
    },
    "ChemBERTa_77M_MTR_Base": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": "DeepChem/ChemBERTa-77M-MTR",
        
    },
  
    "ChemBERTa_10M_MTR_Base": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MTR",
        "model": "DeepChem/ChemBERTa-10M-MTR",
        
    },
   
    "ChemBERTa_10M_MLM_Base": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MLM",
        "model": "DeepChem/ChemBERTa-10M-MLM",
        
    },
   
    "ChemBERTa_77M_MLM_Base": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MLM",
        "model": "DeepChem/ChemBERTa-77M-MLM",
        
    },
    "ChemBERTa_5M_MTR_Base": {
        "tokenizer": "DeepChem/ChemBERTa-5M-MTR",
        "model": "DeepChem/ChemBERTa-5M-MTR",
        
    }

}


In [14]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [15]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

Using device: cpu


In [16]:
def load_model(model_info):
    """Load model and tokenizer. Choose appropriate class based on whether it's a classification model."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_info["tokenizer"], trust_remote_code=True
        )

        if "num_labels" in model_info:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_info["model"],
                num_labels=model_info["num_labels"],
                trust_remote_code=True
            )
        else:
            model = AutoModel.from_pretrained(
                model_info["model"],
                trust_remote_code=True
            )

        model.to(device)
        model.eval()
        print(f"Loaded model: {model_info['model']}")
        return tokenizer, model

    except Exception as e:
        print(f"Error loading {model_info['model']}: {e}")
        return None, None


In [None]:
def get_embeddings(smiles_list, tokenizer, model, model_name):
    """
    Get embeddings from MolFormer or ChemBERTa or base modeldepending on the model name.

    Args:
        smiles_list (list of str): SMILES inputs.
        tokenizer: Huggingface tokenizer.
        model: Huggingface model (MolFormer or ChemBERTa).
        model_name (str): Name of the model to determine architecture.

    Returns:
        np.array: Embeddings extracted from dense layers.
    """
    model.eval()
    embeddings = []

    # Define which models use MolFormer structure
    molformer_models = {"MolFormer_WL", "MolFormer_FL"}
    base_models={"MolFormer_Base","ChemBERTa_77M_MLM_Base","ChemBERTa_10M_MLM_Base","ChemBERTa_5M_MTR_Base","ChemBERTa_77M_MTR_Base","ChemBERTa_10M_MTR_Base"}

    # Decide model type
    is_molformer = model_name in molformer_models
    is_base_model = model_name in base_models

    for smiles in smiles_list:
        tokens = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        tokens = {k: v.to(model.device) for k, v in tokens.items()}

        with torch.no_grad():
            output = model(**tokens, output_hidden_states=True)

            if not hasattr(output, "hidden_states"):
                raise ValueError("Model output does not contain hidden states!")

            if is_base_model:
                embedding = output.last_hidden_state[:, 0, :]

            elif is_molformer:
                cls_embedding = output.hidden_states[-1][:, 0, :]
                x = model.classifier.dense(cls_embedding)
                x = model.classifier.dense2(x)
                x = model.classifier.classifier_act_fn(x)  # e.g., GELU
                embedding = x
            else:  # ChemBERTa
                cls_embedding = output.hidden_states[-1][:, 0, :]
                x = model.classifier.dense(cls_embedding)
                x = torch.tanh(x)
                embedding = x

        embeddings.append(embedding.squeeze().cpu().numpy())

    return np.array(embeddings)


In [None]:
output_dir = ".../transformer_CH_finetuned"
os.makedirs(output_dir, exist_ok=True)

In [19]:
for dataset_name, df in datasets.items():
    print(f"Processing {dataset_name}...")

    for model_name, model_info in models_info.items():
        print(f"Extracting embeddings using {model_name}...")

        tokenizer, model = load_model(model_info)
        if not tokenizer or not model:
            continue 

        embeddings = get_embeddings(df["Canonicalized SMILES"].tolist(), tokenizer, model)

        df[model_name + "_embeddings"] = [",".join(map(str, emb)) for emb in embeddings]

    output_path = os.path.join(output_dir, f"{dataset_name}_embed.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing fart_test...
Extracting embeddings using ChemBERTa_Base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm-research/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: ibm-research/MoLFormer-XL-both-10pct
Extracting embeddings using Molformer_Finetuned_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Molformer_finetuned_WL
Extracting embeddings using MolFormer_Finetuned_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/Molformer_finetuned_FL
Extracting embeddings using ChemBERTa_10M_MTR_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Chemberta_10M_MTR_WL
Extracting embeddings using ChemBERTa_10M_MTR_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/chemberta_10M_MTR_FL
Extracting embeddings using ChemBERTa_77M_MLM_WL...
Loaded mo

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm-research/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: ibm-research/MoLFormer-XL-both-10pct
Extracting embeddings using Molformer_Finetuned_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Molformer_finetuned_WL
Extracting embeddings using MolFormer_Finetuned_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/Molformer_finetuned_FL
Extracting embeddings using ChemBERTa_10M_MTR_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Chemberta_10M_MTR_WL
Extracting embeddings using ChemBERTa_10M_MTR_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/chemberta_10M_MTR_FL
Extracting embeddings using ChemBERTa_77M_MLM_WL...
Loaded mo

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm-research/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: ibm-research/MoLFormer-XL-both-10pct
Extracting embeddings using Molformer_Finetuned_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Molformer_finetuned_WL
Extracting embeddings using MolFormer_Finetuned_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/Molformer_finetuned_FL
Extracting embeddings using ChemBERTa_10M_MTR_WL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/weighted_loss/Chemberta_10M_MTR_WL
Extracting embeddings using ChemBERTa_10M_MTR_FL...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Flavor_prediction_task/Flavor_Final_models_with_classification_head_finetuned/Focal_loss_models/chemberta_10M_MTR_FL
Extracting embeddings using ChemBERTa_77M_MLM_WL...
Loaded mo