In [1]:
import os, sys
# Locate the project root so the local `agentD` package can be imported.
# The root is taken to be two directory levels above the working directory
# (e.g. <root>/examples/BCL-2 -> <root>).
cwd = os.getcwd()
print("Current working directory:", cwd)
home_dir = os.path.dirname(os.path.dirname(cwd))
print("Home directory:", home_dir)
# Guard the append so that re-running this cell does not pile duplicate
# entries onto sys.path.
if home_dir not in sys.path:
    sys.path.append(home_dir)

Current working directory: /home/hoon/dd-agent/llm_dd/examples/BCL-2
Home directory: /home/hoon/dd-agent/llm_dd


In [2]:
import json
import warnings
warnings.filterwarnings('ignore')
from agentD.agents import agentD
from agentD.analysis_utils import get_tool_decorated_functions, custom_serializer
from agentD.prompts import FORMAT_INSTRUCTIONS

In [3]:
# Collect the tool objects from each agentD tool module, calling the loader in
# the order: retrieval, literature analysis, prediction, generation.
ret_tools, lit_tools, pred_tools, gen_tools = (
    get_tool_decorated_functions(os.path.join(home_dir, module_path))
    for module_path in (
        "agentD/tools/retrieval.py",
        "agentD/tools/lit_analysis.py",
        "agentD/tools/prediction.py",
        "agentD/tools/generation.py",
    )
)

# Only the generation, retrieval and prediction tools are handed to the agent;
# lit_tools is loaded but not part of the active set.
tools = gen_tools + ret_tools + pred_tools
tool_names = [entry.name for entry in tools]
tool_desc = [entry.description for entry in tools]

# Preliminary

In [4]:
import pandas as pd

def save_existing_drug_smiles(extraction_file, pool_directory):
    """
    Copy the known-drug SMILES from an extraction JSON file into the pool.

    Reads the "SMILES" entry of ``extraction_file`` (a single string or a
    list) and writes it as a one-column CSV named
    ``existing_drug_molecules.csv`` inside ``pool_directory``. When the entry
    is missing or empty, nothing is written and a message is printed.

    Args:
        extraction_file (str): Path to the JSON file to read.
        pool_directory (str): Directory that receives the CSV. It is created
            if it does not exist yet.
    """
    with open(extraction_file, "r", encoding="utf-8") as f:
        extraction_data = json.load(f)

    smiles = extraction_data.get("SMILES", None)
    if not smiles:
        print("No existing drug SMILES found in extraction.json.")
        return

    # A lone SMILES string is wrapped so that it becomes a single row.
    if isinstance(smiles, str):
        smiles = [smiles]
    smiles_df = pd.DataFrame(smiles, columns=["SMILES"])

    # Create the pool directory on demand instead of failing at write time.
    # An empty string means "current directory" and needs no creation.
    if pool_directory:
        os.makedirs(pool_directory, exist_ok=True)
    save_path = os.path.join(pool_directory, "existing_drug_molecules.csv")
    smiles_df.to_csv(save_path, index=False)


def combine_smiles_files(pool_directory):
    """
    Merge the SMILES columns of every CSV in a directory into a single CSV.

    Every ``*.csv`` file in ``pool_directory`` whose name does not start with
    ``combined_`` is read, its column names are upper-cased, and its "SMILES"
    column is appended to the result. Files are visited in sorted name order
    so the row order of the output is reproducible. The result is written to
    ``combined_smiles.csv`` in the same directory. Files that lack a "SMILES"
    column, or that fail to load, are reported and skipped.

    Args:
        pool_directory (str): Path to the directory containing the CSV files.

    """
    combined_data = []

    # os.listdir makes no ordering promise; sort for a stable output.
    for file_name in sorted(os.listdir(pool_directory)):
        # Skip non-CSV files and earlier outputs of this function.
        if not file_name.endswith(".csv") or file_name.startswith("combined_"):
            continue

        file_path = os.path.join(pool_directory, file_name)
        try:
            df = pd.read_csv(file_path)

            # Normalize column names to uppercase so 'smiles' / 'Smiles' match.
            df.columns = df.columns.str.upper()

            if "SMILES" not in df.columns:
                print(f"Skipping {file_name}: No 'SMILES' column found.")
                continue

            combined_data.append(df[["SMILES"]])
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Error processing {file_name}: {e}")

    if not combined_data:
        print("No valid CSV files found in the directory.")
        return

    combined_df = pd.concat(combined_data, ignore_index=True)
    output_file = os.path.join(pool_directory, "combined_smiles.csv")
    combined_df.to_csv(output_file, index=False)
    print(f"Combined CSV saved to {output_file}")


def extract_random_samples(file_path, save_path, num_samples=10):
    """
    Draw a reproducible random subset of SMILES from a CSV file.

    The draw uses a fixed ``random_state`` of 42, so the same input file
    always yields the same subset.

    Args:
        file_path (str): CSV file to sample from; needs a "SMILES" column.
        save_path (str): Destination of the sampled one-column CSV. Passing
            the same path as ``file_path`` overwrites the source file.
        num_samples (int): Number of rows to draw. When the file holds fewer
            rows, all of them are returned (in shuffled order) instead of
            raising.

    Returns:
        pandas.DataFrame | None: The sampled "SMILES" column with a fresh
        0..n-1 index, or None when the file is missing, has no rows, or has
        no "SMILES" column.
    """
    try:
        df = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing or zero-byte file is reported through the same message as
        # a file with no rows, rather than surfacing as an exception.
        df = pd.DataFrame()

    if df.empty:
        print(f"The file {file_path} is empty or does not exist.")
        return None
    if "SMILES" not in df.columns:
        print(f"Skipping {file_path}: No 'SMILES' column found.")
        return None

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so cap the request at the population size.
    n_draw = min(num_samples, len(df))
    sampled_df = df.sample(n=n_draw, random_state=42)
    # Only keep the 'SMILES' column
    sampled_df = sampled_df[["SMILES"]].reset_index(drop=True)

    sampled_df.to_csv(save_path, index=False)
    print(f"Random samples saved to {save_path}")
    return sampled_df

In [None]:
# Seed the pool directory with the SMILES recorded in extraction.json.

save_existing_drug_smiles("extraction.json", "pool")
# combine_smiles_files("pool")
# NOTE(review): each call below reads from and writes to the same path, so the
# sampling CSV is replaced by its 50-row, SMILES-only subset. Re-running this
# cell then samples from the already-reduced file — keep a copy of the full
# file if it is still needed.
extract_random_samples("pool/Mol2Mol_sampling.csv", "pool/Mol2Mol_sampling.csv", num_samples=50)
extract_random_samples("pool/Reinvent_sampling.csv", "pool/Reinvent_sampling.csv", num_samples=50)

Random samples saved to pool/Mol2Mol_sampling.csv
Random samples saved to pool/Reinvent_sampling.csv


Unnamed: 0,SMILES
0,O=C(Cc1cccs1)Nc1ccc2oc(N3Cc4ccccc4C3)nc2c1
1,Oc1ccc(C=Cc2cc(O)c(O)c(O)c2)cc1
2,Cc1ccc(-c2cc(C(=O)NC3CCCCCC3)nn2-c2ccc(-n3cncn...
3,c1ccc(CCn2c(-c3ccccc3)nc3ccccc32)cc1
4,Oc1ccc(CNC23CC4CC(CC(C4)C2)C3)cc1O
5,CC(=NN=C(N)N)c1cc(C)ccc1C
6,CCN(CC)Cc1c(O)ccc2c(C=O)cccc12
7,COC(=O)c1ccc(NS(=O)(=O)c2ccc3c(c2)C(=O)c2ccccc...
8,COc1ccc(OCCNC(=O)c2cc3ccccc3n2Cc2ccccc2)cc1
9,NS(=O)(=O)C1CC(C(=O)NC(Cc2ccc(-c3cccc(O)c3)cc2...


# Property Prediction

In [4]:
# Opening section of the agent prompt: sets the AgentD persona and states the
# prediction task (binding affinity and ADMET). Passed as `prefix=` when the
# agent is constructed.
PREFIX = """You are AgentD, a highly knowledgeable and methodical AI chemist.
Your objective is to make property predictions over the provided SMILES and the target protein.
Specifically, the prediction task involves the binding affinity of a ligand to a protein and ADMET properties of the ligand.
Always utilize the designated tools, and if uncertain, refrain from generating speculative information.
"""

# Closing section of the agent prompt: the step-by-step protocol the agent is
# told to follow, plus the `{input}` and `{agent_scratchpad}` placeholders.
# Passed as `suffix=` when the agent is constructed.
# NOTE(review): the placeholders are presumably filled in by the agent's prompt
# template — confirm in agentD.agents.
SUFFIX = """You MUST adhere strictly to the following protocol to complete the task:

1. Retrieve the UniProt IDs for the given protein. This will be used in the binding affinity prediction. 
2. Decide the most relevant protein for our target. 
3. Retrieve FASTA sequence of the given protein from relevant database.
4. Predict the properties using the appropriate tools.
    - Binding Affinity: Use the FASTA sequence obtained in Step 3 and the SMILES file path as inputs.
    - ADMET Properties: Apply the SMILES file directly.
 
---
### Notes:
- Begin by briefly stating the problem and how you will approach it.
- Don't skip any steps.
- If any step fails or the required data is not available, provide a clear explanation and do not make assumptions.

Now begin your task.

Question: {input}
Thought: {agent_scratchpad}
"""

In [5]:
# Task configuration: target protein, indication, and the candidate pool file.
# The variable name `diesease` (sic) is kept as-is because other cells may
# refer to it.
protein, diesease = "BCL-2", "Lymphocytic leukemia"
file_name = "Mol2Mol_sampling_update_update"  # or "Reinvent"
candidate_smiles_file = f"pool/{file_name}.csv"

# Natural-language request handed to the agent as the "input" field.
Human_prompt = f"Make predictions for the potential drug molecules for {diesease} targeting the protein {protein}. The candidate SMILES file is '{candidate_smiles_file}'."

# Payload for agent.invoke: the request plus the tool objects, names and
# descriptions collected earlier in the notebook.
input_data = dict(
    input=Human_prompt,
    tools=tools,
    tool_names=tool_names,
    tool_desc=tool_desc,
)

In [6]:
# Build the agentD wrapper around the selected tools and prompt pieces, then
# keep only its `.agent` attribute, which is what gets invoked later.
agent = agentD(
    tools,
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX,
    format_instructions=FORMAT_INSTRUCTIONS,
).agent

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [7]:
# Create the output folder before invoking the agent, so a missing directory
# cannot make the write fail after the run has already completed.
os.makedirs("response", exist_ok=True)

response = agent.invoke(input_data)

# Persist the full response; `custom_serializer` is used by json as the
# fallback for objects it cannot encode natively.
with open(f"response/prediction_{file_name}.json", "w", encoding="utf-8") as f:
    json.dump(response, f, indent=2, default=custom_serializer)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To make predictions for the potential drug molecules targeting the BCL-2 protein, I will follow the outlined protocol. First, I will retrieve the UniProt IDs for the BCL-2 protein. Then, I will fetch the FASTA sequence for the most relevant UniProt ID. After obtaining the sequence, I will predict the binding affinity using the provided SMILES file and the FASTA sequence. Finally, I will predict the ADMET properties using the SMILES file.

Action: get_uniprot_ids
Action Input: BCL-2[0m
Observation: [38;5;200m[1;3m[('P10415', 'Apoptosis regulator Bcl-2')][0m
Thought:[32;1m[1;3mThe UniProt ID for the BCL-2 protein is P10415, which corresponds to the "Apoptosis regulator Bcl-2". I will now proceed to fetch the FASTA sequence for this UniProt ID.

Action: fetch_uniprot_fasta
Action Input: P10415[0m
Observation: [36;1m[1;3m>sp|P10415|BCL2_HUMAN Apoptosis regulator Bcl-2 OS=Homo sapiens OX=9606 GN=BCL2 PE=1 SV=2
MA

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
100%|██████████| 47/47 [00:07<00:00,  5.88it/s]



Observation: [36;1m[1;3m('Affinity predictions saved to ', 'property/affinity_Mol2Mol_sampling_update_update.csv')[0m
Thought:[32;1m[1;3mThe binding affinity predictions have been successfully saved to 'property/affinity_Mol2Mol_sampling_update_update.csv'. Now, I will proceed to predict the ADMET properties using the same SMILES file.

Action: get_admet_predictions
Action Input: pool/Mol2Mol_sampling_update_update.csv[0m
Observation: [33;1m[1;3mADMET predictions saved to property/admet_Mol2Mol_sampling_update_update.csv[0m
Thought:[32;1m[1;3mHere's your final answer:
Final Answer: The predictions for the potential drug molecules targeting the BCL-2 protein have been completed. The binding affinity predictions are saved in 'property/affinity_Mol2Mol_sampling_update_update.csv', and the ADMET properties are saved in 'property/admet_Mol2Mol_sampling_update_update.csv'.[0m

[1m> Finished chain.[0m


# Post-processing
