In [1]:
import os, sys

# The project root is taken to be two directory levels above the notebook's
# working directory; later cells load tool modules from beneath it.
cwd = os.getcwd()
print("Current working directory:", cwd)
home_dir = os.path.dirname(os.path.dirname(cwd))
print("Home directory:", home_dir)

# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if home_dir not in sys.path:
    sys.path.append(home_dir)

Current working directory: /home/hoon/dd-agent/llm_dd/examples/BCL-2
Home directory: /home/hoon/dd-agent/llm_dd


In [8]:
import json
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from agentD.agents import agentD
from agentD.analysis_utils import get_tool_decorated_functions, custom_serializer
from agentD.prompts import FORMAT_INSTRUCTIONS

In [3]:
# Collect what get_tool_decorated_functions returns for each tool module, in
# the order: retrieval, literature analysis, prediction, generation.
# (Loading of "agentD/tools/common.py" stays disabled.)
ret_tools, lit_tools, pred_tools, gen_tools = (
    get_tool_decorated_functions(os.path.join(home_dir, module_path))
    for module_path in (
        "agentD/tools/retrieval.py",
        "agentD/tools/lit_analysis.py",
        "agentD/tools/prediction.py",
        "agentD/tools/generation.py",
    )
)

# Tool set handed to the agent; lit_tools is loaded above but not included.
tools = gen_tools + ret_tools + pred_tools

# Parallel lists holding the name and description of every selected tool.
tool_names = [selected.name for selected in tools]
tool_desc = [selected.description for selected in tools]

# Preliminary

In [14]:
import pandas as pd

# Combine admet.csv and affinity.csv into a single DataFrame and save it as combined_property.csv

def combine_csv_files(adment_file, affinity_file, output_file, on='SMILES', how='inner'):
    """Join two CSV tables on a shared key column and write the result as CSV.

    Args:
        adment_file: Path of the first CSV file (the ADMET table in this
            notebook). The parameter name is kept for backward compatibility.
        affinity_file: Path of the second CSV file (the affinity table).
        output_file: Path the merged table is written to, without the index.
        on: Key column name, or list of names, shared by both files.
            Defaults to 'SMILES', the previously hard-coded key.
        how: Join type forwarded to ``pd.merge``. Defaults to 'inner', so only
            keys present in both files are kept.

    Raises:
        KeyError: If a key column is missing from either input file.
    """
    adment_df = pd.read_csv(adment_file)
    affinity_df = pd.read_csv(affinity_file)

    # Fail with a message that names the offending file rather than relying on
    # the bare KeyError raised from inside the merge.
    key_columns = [on] if isinstance(on, str) else list(on)
    for path, frame in ((adment_file, adment_df), (affinity_file, affinity_df)):
        missing = [key for key in key_columns if key not in frame.columns]
        if missing:
            raise KeyError(f"column(s) {missing} not found in {path}")

    combined_df = pd.merge(adment_df, affinity_df, on=on, how=how)
    combined_df.to_csv(output_file, index=False)
# Example usage


In [15]:
# Merge the dummy ADMET and affinity tables into one combined property file.
combine_csv_files(
    adment_file='property/admet_dummy.csv',
    affinity_file='property/affinity_dummy.csv',
    output_file='property/combined_dummy.csv',
)

# Ensemble Downselection

In [None]:
# Opening part of the agent prompt: states the AgentD persona and the lenient
# initial-screening task. Passed to agentD as `prefix=`.
PREFIX = """You are AgentD, an AI chemist with expertise in molecular property analysis and scoring.

Your task is to determine the given molecule as a drug candidate for the specified target protein.
Consider that this is an initial screening, so be lenient to keep a wide range of drug-like molecules. 
"""

# Closing part of the agent prompt, passed to agentD as `suffix=`. It lists the
# protocol the model is told to follow and ends with the `{input}` and
# `{agent_scratchpad}` placeholders. The instruction to begin the final answer
# with "YES" or "NO" matters downstream: process_dataset_with_agent decides
# per molecule by checking whether each output starts with "NO".
SUFFIX = """You MUST adhere strictly to the following protocol to complete the task:
 
1. Identify critical properties relevant to the drug molecule for the target protein, focusing primarily on property interpretations.
    - Think about the critical risk factors as drug candidates.
2. Leniently determine the given molecule's drug-likeness based on the selected properties. (YES or NO)
3. Provide a rationale for your assessment.

---
### Notes:
- Start by clearly stating the problem and the planned approach.
- Follow the outlined steps systematically without skipping any.
- Make sure to start the final answer with "YES" or "NO" and then provide the rationale. 
- Be lenient on the assessment, but if there's any critical issue, you should reject the molecule.

Now begin your task.

Question: {input}
Thought: {agent_scratchpad}
"""

In [5]:
# Build the agentD wrapper around the selected tools and prompt pieces, then
# keep only its `.agent` attribute for later `invoke` calls.
agent = agentD(
    tools,
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX,
    format_instructions=FORMAT_INSTRUCTIONS,
).agent

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [6]:


def _starts_with_no(assessment: str) -> bool:
    """Return True when the text opens with the standalone word "NO" (any case)."""
    head = assessment.strip().upper()
    # Require a non-alphanumeric character (or end of text) right after "NO" so
    # that words such as "NOT", "NOTE" or "NONE" are not counted as a rejection.
    return head.startswith("NO") and not head[2:3].isalnum()


def process_dataset_with_agent(file_path: str, protein: str, objective: str, agent, n_runs: int = 3) -> pd.DataFrame:
    """
    Run the agent several times on every dataset row and record an ensemble decision.

    Each row is turned into a prompt containing the target protein, the
    objective and the row's values as a dict. The agent is invoked ``n_runs``
    times with that prompt. A molecule is marked "Filtered" only when every
    run's output starts with the word "NO"; otherwise it is "Accepted".

    Args:
        file_path (str): Path to the CSV dataset; it must have a "SMILES" column.
        protein (str): Target protein named in the prompt.
        objective (str): Text inserted after "The screening objective is to",
            so it should read as a verb phrase.
        agent: Object whose ``invoke(dict)`` returns a mapping with an
            'output' string.
        n_runs (int): Number of independent agent runs per row. Defaults to 3,
            the previously hard-coded count.

    Returns:
        pd.DataFrame: One row per dataset entry with the columns 'SMILES',
        'Assessment1' ... 'Assessment<n_runs>' and 'Decision'. Nothing is
        written to disk; saving is left to the caller.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    # With zero runs, all() over an empty list would mark every row "Filtered".
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    df = pd.read_csv(file_path)
    assessment_columns = [f"Assessment{index}" for index in range(1, n_runs + 1)]

    records = []
    for _, row in df.iterrows():
        smiles = row["SMILES"]

        human_prompt = (
            f"Determine whether the given molecule is applicable as a potential drug molecule targeting the protein {protein}. "
            f"The screening objective is to {objective}. "
            f"The properties are stored in the dataset. The current entry is as follows: {row.to_dict()}"
        )
        input_data = {"input": human_prompt}

        # Repeat the same query; each call contributes one vote.
        outputs = [agent.invoke(input_data)['output'] for _run in range(n_runs)]

        # Lenient ensemble rule: reject only on a unanimous "NO".
        all_no = all(_starts_with_no(output) for output in outputs)
        decision = "Filtered" if all_no else "Accepted"

        records.append([smiles, *outputs, decision])

    return pd.DataFrame(records, columns=['SMILES', *assessment_columns, 'Decision'])

In [9]:
# Example usage
protein = "BCL-2"
disease = "Lymphocytic leukemia"
property_file = "property/combined_dummy.csv"
output_file = "downselection/ensemble_dummy.csv"

# Create the output folder before the agent runs, so the results are not lost
# to a missing directory when they are finally saved.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Instantiate the agent.
# NOTE(review): unlike the earlier agent cell, format_instructions is not
# passed here - confirm that agentD's default is what is intended.
agent = agentD(
    tools,
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX
).agent

# Process the dataset.
# NOTE(review): `disease` is passed as the `objective` argument, so the prompt
# reads "The screening objective is to Lymphocytic leukemia." - confirm
# whether a verb phrase (e.g. "treat ...") was intended.
processed_data = process_dataset_with_agent(property_file, protein, disease, agent)
processed_data.to_csv(output_file, index=False)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe task is to determine whether the given molecule is a potential drug candidate for targeting the BCL-2 protein, with a focus on lymphocytic leukemia. The assessment will be based on the provided ADMET properties and general molecular properties.

1. **Absorption**: 
   - The molecule is predicted to be bioavailable with medium to high confidence, which is a positive indicator for oral administration.
   - It is an inhibitor and substrate of P-Glycoprotein, which could affect drug-drug interactions and bioavailability.
   - Low skin permeability is noted, but this is less relevant for oral drugs.

2. **Distribution**:
   - The molecule is non-penetrable to the blood-brain barrier, which is acceptable for non-CNS targets.
   - Plasma protein binding is within an acceptable range, indicating a moderate therapeutic index.

3. **Metabolism**:
   - The molecule is a non-inhibitor for most CYP enzymes, except CYP 2C9, where it is