In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def normalize_embeddings(embeddings_array):
    """
    Rescale an embeddings matrix with scikit-learn's MinMaxScaler.

    A fresh MinMaxScaler (default settings) is fitted on the given array and
    applied to that same array in one step. Per the scikit-learn
    documentation the scaling is done per feature, i.e. independently for
    each column, with a default target range of [0, 1]. The overall minimum
    and maximum of the result are printed as a sanity check.

    Parameters:
    embeddings_array (numpy.ndarray): 2-D array of embeddings to rescale

    Returns:
    numpy.ndarray: Rescaled embeddings matrix
    """
    scaled = MinMaxScaler().fit_transform(embeddings_array)

    # Report the global extremes so the caller can eyeball the result
    print("\nVerifying normalization:")
    lowest = np.min(scaled)
    print(f"Minimum value: {lowest}")
    highest = np.max(scaled)
    print(f"Maximum value: {highest}")

    return scaled

def get_molecular_embeddings(smiles_list):
    """
    Calculate molecular embeddings for a list of SMILES strings using ChemBERTa

    Each SMILES string is tokenized and passed through the model on its own;
    the hidden state of the first token is taken as the molecule's embedding.
    A string that raises during tokenization or inference is reported and
    replaced by an all-zero vector so that the output keeps one row per input.
    The stacked matrix is then passed to normalize_embeddings().

    Parameters:
    smiles_list (list): List of SMILES strings

    Returns:
    numpy.ndarray: Normalized embeddings matrix with one row per input
        string and one column per model hidden dimension. For an empty
        input an array of shape (0, hidden_size) is returned without
        normalization.
    """
    # Load ChemBERTa model and tokenizer
    print("Loading ChemBERTa model and tokenizer...")
    model_name = "seyonec/ChemBERTa-zinc-base-v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Set model to evaluation mode
    model.eval()

    # Width of the embedding vectors, read from the loaded model so the
    # zero-vector fallback always matches the real embeddings.
    hidden_size = model.config.hidden_size

    # Initialize list to store embeddings
    embeddings_list = []
    failed_count = 0
    total = len(smiles_list)

    # Process each SMILES string
    print("\nGenerating embeddings...")
    with torch.no_grad():
        for i, smiles in enumerate(smiles_list):
            try:
                # Tokenize SMILES
                inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)

                # Get model outputs
                outputs = model(**inputs)

                # Use CLS token embedding (first token)
                embedding = outputs.last_hidden_state[:, 0, :].numpy()
                embeddings_list.append(embedding.flatten())

            except Exception as e:
                print(f"Error processing SMILES: {smiles}")
                print(f"Error message: {str(e)}")
                # Add a zero vector as embedding for failed cases
                embeddings_list.append(np.zeros(hidden_size))
                failed_count += 1

            # Print progress every 100 molecules (kept outside the try so a
            # failure on the 100th item does not skip the progress message)
            if (i + 1) % 100 == 0:
                print(f"Processed {i + 1}/{total} molecules...")

    if failed_count:
        # Zero rows take part in the normalization below, so make them visible
        print(f"\n{failed_count}/{total} SMILES strings failed and were replaced by zero vectors.")

    # Nothing to stack or normalize: min/max and the scaler both reject
    # an empty array, so return a correctly shaped empty matrix instead.
    if not embeddings_list:
        print("\nNo SMILES strings provided; returning an empty embeddings matrix.")
        return np.empty((0, hidden_size))

    # Convert list to numpy array
    print("\nConverting to numpy array...")
    embeddings_array = np.array(embeddings_list)

    # Print shape and stats before normalization
    print("\nEmbeddings before normalization:")
    print(f"Shape: {embeddings_array.shape}")
    print(f"Min value: {np.min(embeddings_array)}")
    print(f"Max value: {np.max(embeddings_array)}")
    print(f"Mean value: {np.mean(embeddings_array)}")

    # Normalize embeddings
    print("\nNormalizing embeddings...")
    normalized_embeddings = normalize_embeddings(embeddings_array)

    return normalized_embeddings

def process_drug_file(input_file, output_file):
    """
    Read a CSV of drugs, embed its SMILES column and write the combined table

    The input rows are kept as they are; one 'embedding_<i>' column per
    embedding dimension is appended to the right, and the result is written
    to output_file without the index.

    Parameters:
    input_file (str): Path to input CSV file with SMILES column
    output_file (str): Path to save output embeddings

    Returns:
    pandas.DataFrame: Original columns followed by the embedding columns

    Raises:
    KeyError: If the CSV has no 'SMILES' column
    """
    print("Reading CSV file...")
    source_df = pd.read_csv(input_file)

    print(f"\nFound {len(source_df)} SMILES strings to process...")

    # Guard clause: the rest of the pipeline needs the SMILES column
    if 'SMILES' not in source_df.columns:
        raise KeyError("CSV file must contain a 'SMILES' column")

    smiles_values = source_df['SMILES'].tolist()
    embeddings = get_molecular_embeddings(smiles_values)

    print("\nCreating embedding DataFrame...")
    # One named column per embedding dimension
    n_dims = embeddings.shape[1]
    column_names = [f'embedding_{i}' for i in range(n_dims)]
    embedding_df = pd.DataFrame(embeddings, columns=column_names)

    # Place the embedding columns to the right of the original ones
    combined_df = pd.concat([source_df, embedding_df], axis=1)

    print(f"\nSaving results to {output_file}...")
    combined_df.to_csv(output_file, index=False)

    # Summary of the shapes involved
    print("\nFinal statistics:")
    print(f"Original dataframe shape: {source_df.shape}")
    print(f"Embedding dataframe shape: {embedding_df.shape}")
    print(f"Final dataframe shape: {combined_df.shape}")

    return combined_df

if __name__ == "__main__":
    # Script entry point: embed the SMILES in input_file and write the
    # combined table to output_file. Both paths are hard-coded for the
    # author's machine and must be edited before running elsewhere.
    input_file = r"C:\Users\abir1\OneDrive\Desktop\Natural Product\Natural_test_unique.csv"
    output_file = r"C:\Users\abir1\OneDrive\Desktop\Natural Product\Natural_EMB.csv"

    try:
        result = process_drug_file(input_file, output_file)
        print("\nProcessing completed successfully!")
    except FileNotFoundError:
        print("Error: Input file not found. Please check the file path.")
    except KeyError as e:
        # str() of a KeyError is the repr of its argument, which would wrap
        # the message in an extra pair of quotes; print the argument itself.
        message = e.args[0] if e.args else e
        print(f"Error: {message}")
    except Exception as e:
        # Last-resort boundary for the script: report instead of a traceback
        print(f"An error occurred: {str(e)}")



Reading CSV file...

Found 4076 SMILES strings to process...
Loading ChemBERTa model and tokenizer...


  torch.utils._pytree._register_pytree_node(



Generating embeddings...
Processed 100/4076 molecules...
Processed 200/4076 molecules...
Processed 300/4076 molecules...
Processed 400/4076 molecules...
Processed 500/4076 molecules...
Processed 600/4076 molecules...
Processed 700/4076 molecules...
Processed 800/4076 molecules...
Processed 900/4076 molecules...
Processed 1000/4076 molecules...
Processed 1100/4076 molecules...
Processed 1200/4076 molecules...
Processed 1300/4076 molecules...
Processed 1400/4076 molecules...
Processed 1500/4076 molecules...
Processed 1600/4076 molecules...
Processed 1700/4076 molecules...
Processed 1800/4076 molecules...
Processed 1900/4076 molecules...
Processed 2000/4076 molecules...
Processed 2100/4076 molecules...
Processed 2200/4076 molecules...
Processed 2300/4076 molecules...
Processed 2400/4076 molecules...
Processed 2500/4076 molecules...
Processed 2600/4076 molecules...
Processed 2700/4076 molecules...
Processed 2800/4076 molecules...
Processed 2900/4076 molecules...
Processed 3000/4076 molecu