# BASE MODELS

**DATASET ISPECT**

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

file_paths = {
    "fart_test": "./fart_test.csv",
    "fart_eval": "./fart_val.csv",
    "fart_train": "./fart_train.csv"
}

datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}

def summarize_dataset(name, df):
    print(f"\n--- {name} ---")
    print(f"Shape: {df.shape}")
    print("Columns:", df.columns.tolist())
    print("\nMissing Values:\n", df.isnull().sum())
    print("\nData Types:\n", df.dtypes)
    print("\nSummary Statistics:\n", df.describe())
    print("\nDuplicate Rows:", df.duplicated().sum())

for name, df in datasets.items():
    summarize_dataset(name, df)




--- fart_test ---
Shape: (2254, 7)
Columns: ['Unnamed: 0', 'Canonicalized SMILES', 'Standardized SMILES', 'Canonicalized Taste', 'Original Labels', 'Source', 'is_multiclass']

Missing Values:
 Unnamed: 0              0
Canonicalized SMILES    0
Standardized SMILES     0
Canonicalized Taste     0
Original Labels         0
Source                  0
is_multiclass           0
dtype: int64

Data Types:
 Unnamed: 0               int64
Canonicalized SMILES    object
Standardized SMILES     object
Canonicalized Taste     object
Original Labels         object
Source                  object
is_multiclass            int64
dtype: object

Summary Statistics:
          Unnamed: 0  is_multiclass
count   2254.000000         2254.0
mean    7048.832298            0.0
std     4101.425559            0.0
min        3.000000            0.0
25%     3463.000000            0.0
50%     7115.000000            0.0
75%    10643.750000            0.0
max    14152.000000            0.0

Duplicate Rows: 0

--- fart_

**EMBEDDINGS**

In [21]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [5]:
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}


In [29]:
# Model configurations (Only Base Models for Now)
models_info = {
    "ChemBERTa_Base": {
        "tokenizer": "seyonec/SMILES_tokenized_PubChem_shard00_160k",
        "model": "seyonec/SMILES_tokenized_PubChem_shard00_160k"
    },
    "MolFormer_Base": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "ibm-research/MoLFormer-XL-both-10pct"
    }
}

In [30]:
def load_model(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        model.eval()  # Set to evaluation mode
        print(f"Loaded model: {model_name}")
        return tokenizer, model
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        return None, None

In [31]:
# Extract embeddings
def get_embeddings(smiles_list, tokenizer, model):
    embeddings = []
    
    for smiles in smiles_list:
        tokens = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            output = model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(output)
    
    return np.array(embeddings)

In [32]:
output_dir = "flavor_output_embeddings"
os.makedirs(output_dir, exist_ok=True)

In [33]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [34]:
for dataset_name, df in datasets.items():
    print(f"Processing {dataset_name}...")

    for model_name, model_info in models_info.items():
        print(f"Extracting embeddings using {model_name}...")

        tokenizer, model = load_model(model_info["model"])
        if not tokenizer or not model:
            continue 

        embeddings = get_embeddings(df["Canonicalized SMILES"].tolist(), tokenizer, model)

        df[model_name + "_embeddings"] = [",".join(map(str, emb)) for emb in embeddings]

    output_path = os.path.join(output_dir, f"{dataset_name}_embed.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing fart_test...
Extracting embeddings using ChemBERTa_Base...


Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...
Loaded model: ibm-research/MoLFormer-XL-both-10pct
Saved: flavor_output_embeddings/fart_test_embed.csv
Processing fart_eval...
Extracting embeddings using ChemBERTa_Base...
Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...
Loaded model: ibm-research/MoLFormer-XL-both-10pct
Saved: flavor_output_embeddings/fart_eval_embed.csv
Processing fart_train...
Extracting embeddings using ChemBERTa_Base...
Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...
Loaded model: ibm-research/MoLFormer-XL-both-10pct
Saved: flavor_output_embeddings/fart_train_embed.csv


In [36]:
df = pd.read_csv("/home/raghvendra2/Molformer_Finetuning/flavor_output_embeddings/fart_train_embed.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,Canonicalized SMILES,Standardized SMILES,Canonicalized Taste,Original Labels,Source,is_multiclass,Embeddings,ChemBERTa_Base_embeddings,MolFormer_Base_embeddings
0,0,COC(=O)c1cccc(CCc2ccc(OC)c(O)c2)c1,COC(=O)c1cccc(CCc2ccc(OC)c(O)c2)c1,undefined,"Tasteless, Tastelessness",chemtastes_db,0,[ 4.23220634e-01 -3.61322016e-01 1.36605442e-...,"0.42322063,-0.36132202,0.13660544,-0.5751529,0...","0.57139325,0.5010865,0.95544285,-0.108787954,-..."
1,1,CCCCCC(=Cc1ccccc1)COC(=O)CC(C)C,CCCCCC(=Cc1ccccc1)COC(=O)CC(C)C,undefined,"oily, tobacco, fruity",flavor_db,0,[ 9.82137978e-01 -3.90844733e-01 4.28043664e-...,"0.982138,-0.39084473,0.42804366,0.51905835,-0....","0.8856047,-0.027730905,0.56260943,0.34591734,-..."
2,2,CC=C(C)C(CCC(C)CCCC(C)C(C)CCC(C)C)OC1C(CO)OC(O...,CC=C(C)C(CCC(C)CCCC(C)C(C)CCC(C)C)OC1C(CO)OC(O...,sweet,sweet-like,flavor_db,0,[ 4.26222205e-01 -6.78874791e-01 5.00504911e-...,"0.4262222,-0.6788748,0.5005049,0.28885567,-0.3...","0.67635477,0.008468097,0.83730954,0.34545955,-..."
3,3,CCC(C)C(N)C(=O)NC(Cc1ccccc1)C(=O)NC(CC(=O)O)C(...,CCC(C)C(N)C(=O)NC(Cc1ccccc1)C(=O)NC(CC(=O)O)C(...,sweet,sweet,flavor_db,0,[ 5.21623552e-01 -5.82899749e-01 3.38670701e-...,"0.52162355,-0.58289975,0.3386707,1.0003636,-0....","-0.47565556,0.79714537,0.53887177,0.50379896,0..."
4,4,CCOC1OC(CO)C(O)C(OC2OC(CO)C(O)C(O)C2O)C1O,CCOC1OC(CO)C(O)C(OC2OC(CO)C(O)C(O)C2O)C1O,sweet,sweet-like,flavor_db,0,[-8.61615717e-01 -5.76748490e-01 3.75833988e-...,"-0.8616157,-0.5767485,0.375834,0.09298423,-0.9...","1.0611067,-0.0151342545,0.060202017,0.7335563,..."


In [2]:

import pandas as pd
file = pd.read_csv("/home/raghvendra2/Molformer_Finetuning/classification_model_flavor/test_with_features.csv")

file.head()

Unnamed: 0.1,Unnamed: 0,Canonicalized SMILES,Standardized SMILES,Canonicalized Taste,Original Labels,Source,is_multiclass,Embeddings,ChemBERTa_Base_embeddings,MolFormer_Base_embeddings,RDKit_Features,PubChem_Features
0,683,C=C1CC23CCC4C(C)(C(=O)O)CCCC4(C)C2CCC1(O)C3,C=C1CC23CCC4C(C)(C(=O)O)CCCC4(C)C2CCC1(O)C3,sweet,"Sweet, Sweetness",chemtastes_db,0,[ 7.68807650e-01 -5.76021433e-01 5.22678196e-...,[ 7.68807650e-01 -5.76021430e-01 5.22678200e-...,[ 5.54604600e-01 -2.52589300e-01 1.67957400e-...,"[12.03983746430671, 12.03983746430671, 0.10673...",
1,11595,COc1ccc(C2Oc3cc(O)cc(O)c3C(=O)C2OC2OC(COC3OC(C...,COc1ccc(C2Oc3cc(O)cc(O)c3C(=O)C2OC2OC(COC3OC(C...,sweet,sweet-like,flavor_db,0,[-1.37687132e-01 -5.83282530e-01 4.27158654e-...,[-1.37687130e-01 -5.83282530e-01 4.27158650e-...,[ 9.43758960e-01 -1.73485620e-01 7.99534100e-...,"[13.708723535782834, 13.708723535782834, 0.109...",
2,3891,CCOc1ccc(N)cc1[N+](=O)[O-],CCOc1ccc(N)cc1[N+](=O)[O-],sweet,sweet-like,flavor_db,0,[ 0.88360196 -0.43877032 -0.33330277 0.079518...,[ 0.88360196 -0.43877032 -0.33330277 0.079518...,[ 3.78544600e-01 2.11703670e-01 4.95371040e-...,"[10.51880007558579, 10.51880007558579, 0.09550...",
3,8594,CCCOP(=O)(O)OP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N...,CCCOP(=O)(O)OP(=O)(O)OP(=O)(O)OCC1OC(n2cnc3c(N...,sweet,sweet-like,flavor_db,0,[-4.16371375e-01 -4.88543302e-01 -1.55383557e-...,[-4.16371380e-01 -4.88543300e-01 -1.55383560e-...,[ 6.93409440e-01 3.03417440e-01 2.26749000e-...,"[12.037809034783692, 12.037809034783692, 0.053...",
4,6577,CCC(C)C(NC(=O)C(Cc1ccccc1)NC(=O)C(C)NC(=O)C(N)...,CCC(C)C(NC(=O)C(Cc1ccccc1)NC(=O)C(C)NC(=O)C(N)...,sweet,sweet,flavor_db,0,[ 6.56488776e-01 -8.65771413e-01 4.55688894e-...,[ 6.56488800e-01 -8.65771400e-01 4.55688900e-...,[-5.84631700e-01 7.86514000e-01 4.75675640e-...,"[12.930864686426554, 12.930864686426554, 0.087...",


# ALL MODELS

In [1]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_paths = {
    "fart_test": "/home/raghvendra2/Molformer_Finetuning/fart_test.csv",
    "fart_eval": "/home/raghvendra2/Molformer_Finetuning/fart_val.csv",
    "fart_train": "/home/raghvendra2/Molformer_Finetuning/fart_train.csv"
}

In [3]:
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}


In [4]:
# Model configurations of additional ChemBERTa final models 
models_info = {
    "ChemBERTa_Base": {
        "tokenizer": "seyonec/SMILES_tokenized_PubChem_shard00_160k",
        "model": "seyonec/SMILES_tokenized_PubChem_shard00_160k"
    },
    "MolFormer_Base": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "ibm-research/MoLFormer-XL-both-10pct"
    },
    "ChemBERTa_Finetuned": {
        "tokenizer": "seyonec/SMILES_tokenized_PubChem_shard00_160k",
        "model": "/home/raghvendra2/Molformer_Finetuning/Chemberta_Lora_finetuned_flavor_model",
        "num_labels": 5
    },
    "MolFormer_Finetuned": {
        "tokenizer": "ibm-research/MoLFormer-XL-both-10pct",
        "model": "/home/raghvendra2/Molformer_Finetuning/Molformer_Lora_Finetuned_flavor_model2",
        "num_labels": 5
    },
    "ChemBERTa_77M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_100M_MTR",
        "num_labels": 5
    },
    "ChemBERTa_10M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_10M_MTR",
        "num_labels": 5
    },
    "ChemBERTa_77M_MLM": {
        "tokenizer": "DeepChem/ChemBERTa-77M-MLM",
        "model": "/home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_77M_MLM",
        "num_labels": 5
    },
    "ChemBERTa_10M_MLM": {
        "tokenizer": "DeepChem/ChemBERTa-10M-MLM",
        "model": "/home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_10M_MLM",
        "num_labels": 5
    },
    "ChemBERTa_5M_MTR": {
        "tokenizer": "DeepChem/ChemBERTa-5M-MTR",
        "model": "/home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_5M_MTR",
        "num_labels": 5
    }
}


In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [7]:
def load_model(model_info):
    """Load model and tokenizer. Choose appropriate class based on whether it's a classification model."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_info["tokenizer"], trust_remote_code=True
        )

        if "num_labels" in model_info:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_info["model"],
                num_labels=model_info["num_labels"],
                trust_remote_code=True
            )
        else:
            model = AutoModel.from_pretrained(
                model_info["model"],
                trust_remote_code=True
            )

        model.to(device)
        model.eval()
        print(f"Loaded model: {model_info['model']}")
        return tokenizer, model

    except Exception as e:
        print(f"Error loading {model_info['model']}: {e}")
        return None, None


In [8]:
def get_embeddings(smiles_list, tokenizer, model):
    embeddings = []
    
    for smiles in smiles_list:
        tokens = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        tokens = {k: v.to(device) for k, v in tokens.items()}

        with torch.no_grad():
            # Extract embeddings from base model (ignoring classification head)
            if hasattr(model, "roberta"):
                output = model.roberta(**tokens).last_hidden_state.mean(dim=1)
            elif hasattr(model, "bert"):
                output = model.bert(**tokens).last_hidden_state.mean(dim=1)
            elif hasattr(model, "base_model"):
                output = model.base_model(**tokens).last_hidden_state.mean(dim=1)
            else:
                raise ValueError("Unknown model architecture!")
                
        embeddings.append(output.squeeze().cpu().numpy())

    return np.array(embeddings)

In [9]:
output_dir = "/home/raghvendra2/Molformer_Finetuning/flavor_output_embeddings/all_models"
os.makedirs(output_dir, exist_ok=True)

In [10]:
for dataset_name, df in datasets.items():
    print(f"Processing {dataset_name}...")

    for model_name, model_info in models_info.items():
        print(f"Extracting embeddings using {model_name}...")

        tokenizer, model = load_model(model_info)
        if not tokenizer or not model:
            continue 

        embeddings = get_embeddings(df["Canonicalized SMILES"].tolist(), tokenizer, model)

        df[model_name + "_embeddings"] = [",".join(map(str, emb)) for emb in embeddings]

    output_path = os.path.join(output_dir, f"{dataset_name}_embed.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing fart_test...
Extracting embeddings using ChemBERTa_Base...


2025-04-11 06:09:51.607939: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-11 06:09:51.627573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744344591.652746 1982895 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744344591.660434 1982895 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744344591.679367 1982895 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loaded model: seyonec/SMILES_tokenized_PubChem_shard00_160k
Extracting embeddings using MolFormer_Base...
Loaded model: ibm-research/MoLFormer-XL-both-10pct
Extracting embeddings using ChemBERTa_Finetuned...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Chemberta_Lora_finetuned_flavor_model
Extracting embeddings using MolFormer_Finetuned...
Loaded model: /home/raghvendra2/Molformer_Finetuning/Molformer_Lora_Finetuned_flavor_model2
Extracting embeddings using ChemBERTa_77M_MTR...
Loaded model: /home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_100M_MTR
Extracting embeddings using ChemBERTa_10M_MTR...
Loaded model: /home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_10M_MTR
Extracting embeddings using ChemBERTa_77M_MLM...
Loaded model: /home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_77M_MLM
Extracting embeddings using ChemBERTa_10M_MLM...
Loaded model: /home/raghvendra2/Molformer_Finetuning/chemberta_final_model_lora_10M_MLM
Extractin

In [11]:
import pandas as pd

file_path = "/home/raghvendra2/Molformer_Finetuning/flavor_output_embeddings/all_models/fart_train_embed.csv"
df = pd.read_csv(file_path, nrows=5)

print("Column names:")
print(df.columns.tolist())


Column names:
['Unnamed: 0', 'Canonicalized SMILES', 'Standardized SMILES', 'Canonicalized Taste', 'Original Labels', 'Source', 'is_multiclass', 'ChemBERTa_Base_embeddings', 'MolFormer_Base_embeddings', 'ChemBERTa_Finetuned_embeddings', 'MolFormer_Finetuned_embeddings', 'ChemBERTa_77M_MTR_embeddings', 'ChemBERTa_10M_MTR_embeddings', 'ChemBERTa_77M_MLM_embeddings', 'ChemBERTa_10M_MLM_embeddings', 'ChemBERTa_5M_MTR_embeddings']
