In [1]:
import os, sys
# Locate the project root as the directory two levels above the notebook's
# current working directory, and report both paths.
cwd = os.getcwd()
print("Current working directory:", cwd)
home_dir = os.path.dirname(os.path.dirname(cwd))
print("Home directory:", home_dir)
# Make the project root importable. The membership check keeps sys.path free
# of duplicate entries when this cell is executed more than once.
if home_dir not in sys.path:
    sys.path.append(home_dir)

Current working directory: /home/hoon/dd-agent/llm_dd/examples/BCL-2
Home directory: /home/hoon/dd-agent/llm_dd


In [2]:
import json
import pandas as pd
import warnings
# Install a global 'ignore' filter so Python warnings are not shown from this point on.
warnings.filterwarnings('ignore')
from agentD.agents import agentD
from agentD.analysis_utils import get_tool_decorated_functions, custom_serializer, rule_based_evaluation
from agentD.prompts import FORMAT_INSTRUCTIONS

In [3]:
def _tools_from(relative_path):
    """Return the tool-decorated functions found in a module file located under home_dir."""
    return get_tool_decorated_functions(os.path.join(home_dir, relative_path))


# Load each tool family from its own module (agentD/tools/common.py is not loaded here).
ret_tools = _tools_from("agentD/tools/retrieval.py")
lit_tools = _tools_from("agentD/tools/lit_analysis.py")
pred_tools = _tools_from("agentD/tools/prediction.py")
gen_tools = _tools_from("agentD/tools/generation.py")

# Only the generation, retrieval and prediction tools are handed to the agent;
# lit_tools is loaded above but left out of this selection.
tools = gen_tools + ret_tools + pred_tools

# Names and descriptions of the selected tools, in the same order as `tools`.
tool_names = [selected.name for selected in tools]
tool_desc = [selected.description for selected in tools]

# Objective Downselection

In [4]:
# Opening section of the agent prompt, supplied to agentD as `prefix`:
# sets the AgentD persona and frames the task as a lenient initial screening
# that must also return a concise rationale.
PREFIX = """You are AgentD, an AI chemist with expertise in molecular property analysis and scoring.

Your task is to determine the given molecule as a drug candidate for the specified target protein.
Consider that this is an initial screening, so be lenient to keep a wide range of drug-like molecules. 
Also, you will provide a concise rationale for your assessment.
"""

# Closing section of the agent prompt, supplied to agentD as `suffix`:
# a four-step protocol, the requirement that the final answer starts with
# "YES" or "NO", and the `{input}` / `{agent_scratchpad}` placeholders.
# NOTE(review): the placeholders are presumably filled in by the agent's
# prompt template at invocation time - confirm in agentD.
SUFFIX = """You MUST adhere strictly to the following protocol to complete the task:
 
1. Identify critical properties relevant to the given screening objective.
   - Focus primarily on property interpretations.
2. Determine the given molecule as a drug candidate based on the selected properties. (Yes or No)
3. Review the validity of the answer.
4. Provide a final assessment answer with a rationale.

---
### Notes:
- Start by clearly stating the problem and the planned approach.
- Follow the outlined steps systematically without skipping any.
- Make sure to start the final answer with "YES" or "NO" and then provide the rationale. 
- Be lenient on the assessment, but if there's any critical issue, you should reject the molecule.

Now begin your task.

Question: {input}
Thought: {agent_scratchpad}
"""

In [5]:
# Collect the agent's keyword configuration first, then build the agentD
# wrapper and keep only its `.agent` attribute.
agent_options = dict(
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX,
    format_instructions=FORMAT_INSTRUCTIONS,
)
agent = agentD(tools, **agent_options).agent

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [6]:
def process_dataset_with_agent(file_path: str, protein: str, objective: str, agent) -> pd.DataFrame:
    """
    Run the rule-based evaluation and the LLM agent over every row of a CSV file.

    Each row is converted to a dict, passed to `rule_based_evaluation`, and
    embedded in a prompt asking the agent whether the molecule is applicable
    as a potential drug targeting `protein`. If the agent call raises for a
    row, the assessment for that row becomes an "Error: ..." string and the
    run continues with the next row.

    Args:
        file_path (str): CSV file to read; it must contain a 'SMILES' column.
        protein (str): Target protein name inserted into the prompt.
        objective (str): Screening objective inserted into the prompt.
        agent: Object exposing `invoke({"input": ...})`; the returned value is
            queried with `.get('output', 'No response')`.

    Returns:
        pd.DataFrame: One row per input row holding 'SMILES', 'LLM_Assessment'
        and the entries of the rule-based report (without its 'smiles' key).
    """

    def _ask_agent(question: str) -> str:
        # Best effort: a failing agent call is recorded, not propagated.
        try:
            return agent.invoke({"input": question}).get('output', 'No response')
        except Exception as exc:
            return f"Error: {str(exc)}"

    records = []
    for _, molecule in pd.read_csv(file_path).iterrows():
        properties = molecule.to_dict()

        # Rule-based report for this molecule; its 'smiles' entry is dropped
        # because the SMILES string is stored under its own column below.
        evaluation = rule_based_evaluation(properties)
        evaluation.pop('smiles', None)

        record = {'SMILES': molecule['SMILES']}

        # NOTE(review): the prompt embeds the raw row dict, not the rule-based
        # report computed above - confirm this is the intended input.
        question = (
            f"Determine whether the given molecule is applicable as a potential drug targeting {protein}. "
            f"The screening objective is good {objective}-related properties. "
            f"The molecule's properties are: {properties}."
        )
        record['LLM_Assessment'] = _ask_agent(question)

        # Report entries follow the two fixed columns (and win on key clashes).
        record.update(evaluation)
        records.append(record)

    return pd.DataFrame(records)


In [7]:
# Run configuration: target protein, screening objective, input and output files.
protein = "BCL-2"
objective = "absorption"
property_file = "property/combined_dummy.csv"
output_file = "downselection/objective_dummy.csv"

# Build the agent used for this run.
# NOTE(review): no `format_instructions` is passed here, unlike the agent built
# earlier in this notebook with FORMAT_INSTRUCTIONS - confirm that relying on
# agentD's default is intended.
agent = agentD(
    tools, 
    model="gpt-4o",
    prefix=PREFIX, 
    suffix=SUFFIX
).agent

# Create the output folder before the run: DataFrame.to_csv does not create
# missing directories, and failing at the very end would discard every agent
# response collected so far.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Process the dataset and persist the assessments.
processed_data = process_dataset_with_agent(property_file, protein, objective, agent)
processed_data.to_csv(output_file, index=False)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe task is to determine if the given molecule is a potential drug candidate targeting BCL-2, focusing on absorption-related properties. I will evaluate the absorption properties and other relevant ADMET characteristics to make an assessment.

1. **Absorption Properties:**
   - **Caco-2 Permeability:** The prediction is -5.66, which is low, indicating poor intestinal absorption.
   - **Human Oral Bioavailability:** Predicted as bioavailable with medium to high confidence.
   - **Human Intestinal Absorption:** Predicted as absorbed with high confidence.
   - **Madin-Darby Canine Kidney (MDCK) Permeability:** The prediction is -5.29, which is low.
   - **P-Glycoprotein (P-gp) Interaction:** The molecule is predicted to be both an inhibitor and a substrate, which could affect absorption and distribution.

2. **Distribution Properties:**
   - **Blood-Brain Barrier (BBB) Penetration:** Predicted as non-penetrable with high confide