In [1]:
import os, sys
# Locate the project root, taken to be two directory levels above the
# notebook's current working directory, and make it importable.
cwd = os.getcwd()
print("Current working directory:", cwd)
home_dir = os.path.dirname(os.path.dirname(cwd))
print("Home directory:", home_dir)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if home_dir not in sys.path:
    sys.path.append(home_dir)

Current working directory: /home/hoon/dd-agent/llm_dd/examples/BCL-2
Home directory: /home/hoon/dd-agent/llm_dd


In [2]:
import json
import warnings
warnings.filterwarnings('ignore')
from agentD.agents import agentD
from agentD.analysis_utils import get_tool_decorated_functions, custom_serializer
from agentD.prompts import FORMAT_INSTRUCTIONS

In [3]:
# Collect the tool-decorated functions from each tool module under
# <home_dir>/agentD/tools, one collection per module.
ret_tools, lit_tools, gen_tools = (
    get_tool_decorated_functions(os.path.join(home_dir, module_path))
    for module_path in (
        "agentD/tools/retrieval.py",
        "agentD/tools/lit_analysis.py",
        "agentD/tools/generation.py",
    )
)

# Combined toolset plus parallel lists of each tool's name and description.
tools = ret_tools + lit_tools + gen_tools
tool_names = [t.name for t in tools]
tool_desc = [t.description for t in tools]

# Retrieval

In [4]:
# System-style preamble for the retrieval agent; passed to agentD as `prefix`
# in the cell that builds the agent.
# NOTE(review): the prompt text contains the misspellings "knowlegable" and
# "taretting". They are left untouched here because editing prompt wording can
# change model behaviour — confirm before correcting.
PREFIX = """You are an AgentD, helpful and knowlegable AI chemist.
Your task is to retrieve FASTA sequences for the target protein and SMILES for the drug taretting the protein.
Also, download the relevant papers for the selected existing drug and the target protein.
Lastly, generate the ligand-protein complex structure using the retrieved data.
Always use the available tools, and if unsure, do not fabricate answers.
"""

# Step-by-step protocol appended after the preamble; passed to agentD as
# `suffix`. It carries two template placeholders, {input} and
# {agent_scratchpad}, so any other literal braces added to this text would
# presumably need escaping — TODO confirm how agentD formats the template.
SUFFIX = """You MUST follow the protocol below to complete the task.

1. Retrieve the UniProt IDs for the given protein. 
2. Decide the most relevant protein for our target. 
3. Retrieve FASTA sequence of the given protein from relevant database.
4. Search an existing drug molecule for the target protein.
5. Retrieve the SMILES of an existing drug from relevant database.
6. For any molecular or sequence-related data (e.g., SMILES, FASTA, drug names), format the `Observation` output as a JSON dictionary with the following keys:
   - "uniprot_id": protein Uniprot ID (if applicable)
   - "fasta": FASTA string of the target protein (strictly the sequence)
   - "drug_name": name of the selected existing drug (list if multiple)
   - "SMILES": SMILES string of the selected existing drug (list if multiple)
7. Save the results from the previous step.
   - If it fails at writing the JSON file, txt file also works.
8. Download the relevant papers for the selected existing drug and the target protein.
9. Generate the ligand-protein complex structure using the retrieved data.
   
---
### Notes:
- Begin by briefly stating the problem and how you will approach it.
- Don't skip any steps.

Now begin your task.

Question: {input}
Thought: {agent_scratchpad}
"""

In [5]:
# Target protein and indication that parameterize the retrieval question.
protein, diesease = "BCL-2", "Lymphocytic leukemia"
Human_prompt = f"Suggest the potential drug molecules for {diesease} targeting the protein {protein}."

# Payload handed to agent.invoke(): the question plus the tool objects and
# their names/descriptions.
input_data = dict(
    input=Human_prompt,
    tools=tools,
    tool_names=tool_names,
    tool_desc=tool_desc,
)

In [6]:
# Build the retrieval agent from the full toolset and the retrieval prompts,
# then keep only the wrapper's `.agent` attribute for invoking.
retrieval_agent_kwargs = dict(
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX,
    format_instructions=FORMAT_INSTRUCTIONS,
)
agent = agentD(tools, **retrieval_agent_kwargs).agent

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [None]:
# Run the retrieval agent and persist its full response as JSON.
extraction_path = "response/extraction.json"
# open(..., "w") does not create parent directories. Make sure the output
# directory exists *before* invoking the agent, so a missing folder cannot
# raise FileNotFoundError after the run has already completed.
# NOTE(review): the Q&A cell writes to "responses/" (plural) — confirm which
# directory name is intended.
os.makedirs(os.path.dirname(extraction_path), exist_ok=True)

response = agent.invoke(input_data)
with open(extraction_path, "w", encoding="utf-8") as f:
    # custom_serializer handles objects json cannot encode natively.
    json.dump(response, f, indent=2, default=custom_serializer)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe task is to suggest potential drug molecules for Lymphocytic leukemia targeting the protein BCL-2. To achieve this, I will follow the outlined protocol:

1. Retrieve the UniProt IDs for the protein BCL-2.
2. Decide the most relevant protein for our target.
3. Retrieve the FASTA sequence of the BCL-2 protein.
4. Search for an existing drug molecule targeting BCL-2.
5. Retrieve the SMILES of the existing drug.
6. Save the results in a structured format.
7. Download relevant papers for the selected drug and BCL-2.
8. Generate the ligand-protein complex structure using the retrieved data.

Let's start with the first step.

Action: get_uniprot_ids
Action Input: "BCL-2"[0m
Observation: [33;1m[1;3m[('P10415', 'Apoptosis regulator Bcl-2')][0m
Thought:[32;1m[1;3mThe UniProt ID for the protein BCL-2 is P10415, which corresponds to the "Apoptosis regulator Bcl-2". This is the relevant protein for our target.

Next, I will retri

# Q&A

In [None]:

# Q&A variants of the prompts. These assignments rebind the PREFIX and SUFFIX
# names used earlier for the retrieval agent, so cells run after this one see
# the Q&A text.
# NOTE(review): "knowlegable" is misspelled in the prompt text; left untouched
# because editing prompt wording can change model behaviour — confirm before
# correcting.
PREFIX = """You are an AgentD, helpful and knowlegable AI chemist.
Your task is to answer the given question based on the downloaded papers. 
Always use the available tools, and if unsure, do not fabricate answers.
"""

# Single-step protocol for answering from papers that are already downloaded.
# Carries the {input} and {agent_scratchpad} template placeholders.
SUFFIX = """You MUST follow the protocol below to complete the task.

1. Answer the questions based on the downloaded papers.  
---
### Notes:
- Begin by briefly stating the problem and how you will approach it.
- Don't download the papers. It's already done.

Now begin your task.

Question: {input}
Thought: {agent_scratchpad}
"""

# Output-format instructions for the Q&A agent. This rebinds the
# FORMAT_INSTRUCTIONS name imported from agentD.prompts at the top of the
# notebook. The {tool_names} placeholder is left for the agent to fill in.
#
# The backslash in the last instruction line is escaped ("\\n") so the prompt
# shows the two characters \n literally. Unescaped, Python turns it into a real
# line break and the sentence is split in the middle of the quoted token.
FORMAT_INSTRUCTIONS = """Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
'''
Thought: Here's your final answer:
Final Answer: [your response here]
'''

Use the exact sequence without any "\\n" or any extra "()".
"""

In [None]:
# Question posed to the Q&A agent about the downloaded papers.
Human_prompt = "What are the implications of the findings regarding the combinatorial use of ABT-199 and proteasome inhibitors for treating solid tumors compared to hematologic malignancies?"

# Payload handed to agent.invoke(): the question plus the tool objects and
# their names/descriptions.
# (A "prompt": RETRIEVAL_QA entry was previously present here, commented out.)
input_data = dict(
    input=Human_prompt,
    tools=tools,
    tool_names=tool_names,
    tool_desc=tool_desc,
)

In [22]:
# Rebuild the agent with the Q&A prompts and format instructions defined
# above, then keep only the wrapper's `.agent` attribute for invoking.
qa_agent_kwargs = dict(
    model="gpt-4o",
    prefix=PREFIX,
    suffix=SUFFIX,
    format_instructions=FORMAT_INSTRUCTIONS,
)
agent = agentD(tools, **qa_agent_kwargs).agent

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [None]:
# Run the Q&A agent and persist its full response as JSON.
qa_output_path = "responses/q_n_a.json"
# open(..., "w") does not create parent directories. Make sure the output
# directory exists *before* invoking the agent, so a missing folder cannot
# raise FileNotFoundError after the run has already completed.
os.makedirs(os.path.dirname(qa_output_path), exist_ok=True)

response = agent.invoke(input_data)
with open(qa_output_path, "w", encoding="utf-8") as f:
    # custom_serializer handles objects json cannot encode natively.
    json.dump(response, f, indent=2, default=custom_serializer)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: question_answering
Action Input: What are the implications of the findings regarding the combinatorial use of ABT-199 and proteasome inhibitors for treating solid tumors compared to hematologic malignancies?[0m
Observation: [33;1m[1;3mThe findings suggest that the combinatorial use of ABT-199 (venetoclax) and proteasome inhibitors (PIs) could have significant implications for treating both solid tumors and hematologic malignancies. In the context of solid tumors, such as soft tissue sarcoma, the combination has been shown to induce apoptosis synergistically. This is achieved through the transactivation of the pro-apoptotic protein NOXA, which inhibits the anti-apoptotic protein MCL-1, thereby enhancing the efficacy of ABT-199. This dual mechanism of action—direct inhibition of BCL-2 and indirect inhibition of MCL-1 via NOXA—suggests that the combination could be effective in solid tumors that overexpress these prot

# Playground

In [1]:
def find_common_chunks(seq1: str, seq2: str, min_length: int = 5) -> list:
    """
    Find every contiguous substring of `seq1` that also occurs in `seq2`.

    All qualifying substrings are reported, including ones nested inside a
    longer common substring, so the result can be large for similar inputs.

    Args:
        seq1 (str): First protein sequence; candidate chunks are taken from it.
        seq2 (str): Second protein sequence; candidates are looked up in it.
        min_length (int): Minimum length of chunk to report. Values below 0
            behave like 0.

    Returns:
        List of unique common substrings, longest first; ties are ordered
        alphabetically.
    """
    # A negative minimum is equivalent to "no minimum"; clamping keeps the
    # candidate windows for a given start strictly growing, which the early
    # exit below relies on.
    shortest = max(min_length, 0)
    common_chunks = set()
    len1 = len(seq1)

    for start in range(len1):
        for end in range(start + shortest, len1 + 1):
            candidate = seq1[start:end]
            if candidate not in seq2:
                # Every longer window from this start contains `candidate`,
                # so none of them can occur in seq2 either.
                break
            common_chunks.add(candidate)

    return sorted(common_chunks, key=lambda chunk: (-len(chunk), chunk))  # longest first

In [None]:
# Two example sequences (including '-' characters) to compare.
seq1, seq2 = (
    "YDNREIVMKYIHYKLSQRGYEWDAGD-DSE-VVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG",
    "MAHAGRTG-YDNREIVMKYIHYKLSQRGYEWDAGD-VGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPP-VVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG",
)

# Report every shared substring of at least 10 characters, longest first.
common_chunks = find_common_chunks(seq1, seq2, 10)
for chunk in common_chunks:
    print(f"Length {len(chunk)}: {chunk}")

Length 112: VVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG
Length 111: VHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG
Length 111: VVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELY
Length 110: HLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG
Length 110: VHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELY
Length 110: VVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVEL
Length 109: HLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELY
Length 109: LTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYG
Length 109: VH