<a href="https://colab.research.google.com/github/liangli217/LLM_learning/blob/main/Drug_discovery_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from langchain.agents import initialize_agent, Tool

In [6]:
from langchain_openai import ChatOpenAI

In [7]:
import requests

In [30]:
def smiles_to_cid(smiles: str) -> str:
    """Convert SMILES to PubChem CID."""
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/cids/JSON"
    r = requests.get(url).json()
    # Add error handling for when CID is not found
    if 'IdentifierList' not in r or not r['IdentifierList']['CID']:
        raise ValueError(f"Could not find CID for SMILES: {smiles}")
    return str(r['IdentifierList']['CID'][0])


def cid_to_info(cid: str) -> dict:
    """Fetch compound information from PubChem (robust version)."""
    # Try description endpoint first
    url_desc = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/description/JSON"
    r = requests.get(url_desc).json()

    if "InformationList" in r and "Information" in r["InformationList"]:
        info = r["InformationList"]["Information"][0]
        return {
            "cid": cid,
            "name": info.get("Title", "Unknown"),
            "description": info.get("Description", ""),
            "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
        }

    # Fallback: use property endpoint if no description
    url_prop = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/IUPACName,MolecularFormula/JSON"
    r2 = requests.get(url_prop).json()
    if "PropertyTable" in r2 and "Properties" in r2["PropertyTable"]:
        props = r2["PropertyTable"]["Properties"][0]
        return {
            "cid": cid,
            "name": props.get("IUPACName", "Unknown"),
            "description": f"Formula: {props.get('MolecularFormula', 'N/A')}",
            "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
        }

    # If all fails
    return {
        "cid": cid,
        "name": "Unknown",
        "description": "No description or properties available.",
        "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
    }

def check_anticancer(cid: str) -> bool:
    """Check if PubChem classification mentions anticancer activity."""
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/classification/JSON"
    r = requests.get(url).json()
    text = str(r)
    return ("antineoplastic" in text.lower()) or ("cancer" in text.lower())

def anticancer_agent_check(smiles: str) -> str:
    """Given SMILES, return anti-cancer status and known targets."""
    try:
        cid = smiles_to_cid(smiles)
        info = cid_to_info(cid)
        is_anticancer = check_anticancer(cid)
        targets = fetch_targets(cid)

        result = f"Compound: {info['name']} (CID {cid})\n"
        result += f"Description: {info['description']}\n"
        result += f"PubChem: {info['pubchem_link']}\n"
        result += f"Anti-cancer activity: {'YES' if is_anticancer else 'NO evidence found'}\n"

        if targets:
            result += "\nKnown Targets:\n"
            for t in targets[:5]:  # just show first 5 to avoid overload
                result += f"- {t.get('target_name','?')} (GeneID: {t.get('gene_id','?')}) → {t.get('outcome','')}\n"
        else:
            result += "\nNo known PubChem targets found."

        return result
    except ValueError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

In [31]:
def fetch_targets(cid: str) -> list:
    """Fetch known targets for a compound from PubChem BioAssay summary."""
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON"
    r = requests.get(url).json()
    targets = []

    try:
        # Add checks for nested keys
        assays = r.get("Table", {}).get("Row", [])
        for assay in assays:
            target_info = {}
            # Add checks for nested keys
            for cell in assay.get("Cell", []):
                if cell.get("ColumnName") == "Target Name":
                    target_info["target_name"] = cell.get("CellValue")
                elif cell.get("ColumnName") == "Target GeneID":
                    target_info["gene_id"] = cell.get("CellValue")
                elif cell.get("ColumnName") == "Activity Outcome":
                    target_info["outcome"] = cell.get("CellValue")
            if target_info:
                targets.append(target_info)
    except Exception:
        # Catch any other potential errors during processing
        pass

    return targets

In [32]:
from google.colab import userdata
import os

# Set the OpenAI API key from Colab secrets
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

llm = ChatOpenAI(model="gpt-4o-mini") # Changed model to a valid one

In [33]:
tools = [
    Tool(
        name="SMILES_to_AntiCancer_Checker",
        func=anticancer_agent_check,
        description="Check if a SMILES string corresponds to a known anti-cancer agent and return PubChem info including known molecular targets"
    )
]

agent = initialize_agent(
    tools,
    llm,
    agent="zero-shot-react-description",
    verbose=True
)

In [34]:
query = "Is CC(=O)OC1=CC=CC=C1C(=O)O an anti-cancer agent?"
print(agent.run(query))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine if the provided SMILES string (CC(=O)OC1=CC=CC=C1C(=O)O) corresponds to a known anti-cancer agent, I will use the SMILES_to_AntiCancer_Checker function. 

Action: SMILES_to_AntiCancer_Checker  
Action Input: CC(=O)OC1=CC=CC=C1C(=O)O  [0m
Observation: [36;1m[1;3mCompound: Aspirin (CID 2244)
Description: 
PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/2244
Anti-cancer activity: YES

No known PubChem targets found.[0m
Thought:[32;1m[1;3mI have confirmed that the SMILES string CC(=O)OC1=CC=CC=C1C(=O)O corresponds to Aspirin, which is recognized as an anti-cancer agent. However, there were no known PubChem targets found for this compound.

Final Answer: Yes, CC(=O)OC1=CC=CC=C1C(=O)O is an anti-cancer agent, corresponding to Aspirin.[0m

[1m> Finished chain.[0m
Yes, CC(=O)OC1=CC=CC=C1C(=O)O is an anti-cancer agent, corresponding to Aspirin.


In [24]:
query = "Check if CC(=O)OC1=CC=CC=C1C(=O)O is an anti-cancer agent and list known targets."
print(agent.run(query))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI will check if the SMILES string CC(=O)OC1=CC=CC=C1C(=O)O corresponds to a known anti-cancer agent and retrieve the relevant information. 
Action: SMILES_to_AntiCancer_Checker 
Action Input: CC(=O)OC1=CC=CC=C1C(=O)O [0m
Observation: [36;1m[1;3mCompound: Aspirin (CID 2244)
Description: 
PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/2244
Anti-cancer activity: YES

No known PubChem targets found.[0m
Thought:[32;1m[1;3mI have confirmed that the compound represented by the SMILES string CC(=O)OC1=CC=CC=C1C(=O)O is indeed an anti-cancer agent, specifically Aspirin. However, there are no known molecular targets listed in PubChem for this compound. 

Final Answer: CC(=O)OC1=CC=CC=C1C(=O)O is an anti-cancer agent (Aspirin, CID 2244), but there are no known targets associated with it.[0m

[1m> Finished chain.[0m
CC(=O)OC1=CC=CC=C1C(=O)O is an anti-cancer agent (Aspirin, CID 2244), but there are no known targets associate

In [36]:
query = "Check if C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=CC(=C4)C(F)(F)F is an anti-cancer agent and list known targets."
print(agent.run(query))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check the given SMILES string to see if it corresponds to a known anti-cancer agent and retrieve any associated information about molecular targets. 
Action: SMILES_to_AntiCancer_Checker 
Action Input: C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=CC(=C4)C(F)(F)F [0m
Observation: [36;1m[1;3mCompound: EGFR inhibitor (CID 9549299)
Description: 
PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/9549299
Anti-cancer activity: YES

No known PubChem targets found.[0m
Thought:[32;1m[1;3mI have confirmed that the SMILES string corresponds to a known anti-cancer agent, specifically an EGFR inhibitor. However, there are no known molecular targets listed for this compound in the provided data. 

Final Answer: The compound represented by the SMILES string C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=CC(=C4)C(F)(F)F is an anti-cancer agent (EGFR inhibitor), but there are no known targets. For more information, you can refer

In [45]:
from langchain.agents import initialize_agent, Tool
from langchain_openai import ChatOpenAI
import requests

# -----------------------------
# PubChem Helpers
# -----------------------------
def smiles_to_cid(smiles: str) -> str:
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/cids/JSON"
    r = requests.get(url).json()
    return str(r['IdentifierList']['CID'][0])

def cid_to_info(cid: str) -> dict:
    """Fetch compound info (robust fallback)."""
    url_desc = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/description/JSON"
    r = requests.get(url_desc).json()

    if "InformationList" in r and "Information" in r["InformationList"]:
        info = r["InformationList"]["Information"][0]
        return {
            "cid": cid,
            "name": info.get("Title", "Unknown"),
            "description": info.get("Description", ""),
            "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
        }

    # fallback: basic properties
    url_prop = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/IUPACName,MolecularFormula/JSON"
    r2 = requests.get(url_prop).json()
    if "PropertyTable" in r2:
        props = r2["PropertyTable"]["Properties"][0]
        return {
            "cid": cid,
            "name": props.get("IUPACName", "Unknown"),
            "description": f"Formula: {props.get('MolecularFormula', 'N/A')}",
            "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
        }

    return {"cid": cid, "name": "Unknown", "description": "", "pubchem_link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"}

def check_anticancer(cid: str) -> bool:
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/classification/JSON"
    r = requests.get(url).json()
    return "cancer" in str(r).lower() or "antineoplastic" in str(r).lower()

def fetch_targets(cid: str) -> list:
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON"
    r = requests.get(url).json()
    targets = []
    try:
        for assay in r["Table"]["Row"]:
            target_info = {}
            for cell in assay["Cell"]:
                if cell["ColumnName"] == "Target Name":
                    target_info["target_name"] = cell["CellValue"]
                elif cell["ColumnName"] == "Target GeneID":
                    target_info["gene_id"] = cell["CellValue"]
                elif cell["ColumnName"] == "Activity Outcome":
                    target_info["outcome"] = cell["CellValue"]
            if target_info:
                targets.append(target_info)
    except Exception:
        pass
    return targets

# -----------------------------
# PubMed Helper
# -----------------------------
def pubmed_search(query: str, max_results: int = 5):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": query, "retmode": "json", "retmax": max_results}
    r = requests.get(url, params=params).json()
    ids = r.get("esearchresult", {}).get("idlist", [])
    summaries = []

    if ids:
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        fetch_params = {"db": "pubmed", "id": ",".join(ids), "retmode": "json"}
        r2 = requests.get(fetch_url, params=fetch_params).json()
        for pid in ids:
            info = r2.get("result", {}).get(pid, {})
            if info:
                summaries.append({
                    "pmid": pid,
                    "title": info.get("title", "N/A"),
                    "journal": info.get("fulljournalname", "N/A"),
                    "pubdate": info.get("pubdate", "N/A")
                })
    return summaries


def fetch_mechanism_of_action(cid: str) -> str:
    """Get mechanism of action / targets from PubChem PUG View."""
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"
    r = requests.get(url).json()

    moa = []

    def parse_section(section):
        # If section has text, capture it
        if "Information" in section:
            for info in section["Information"]:
                if "StringValue" in info:
                    text = info["StringValue"]
                    # Heuristic: only keep mechanism/target-related strings
                    if any(kw in text.lower() for kw in ["inhibitor", "target", "mechanism", "kinase"]):
                        moa.append(text)
        # Recursively parse subsections
        if "Section" in section:
            for subsection in section["Section"]:
                parse_section(subsection)

    try:
        for section in r["Record"]["Section"]:
            parse_section(section)
    except Exception:
        pass

    return " | ".join(set(moa)) if moa else "No mechanism of action found"


# -----------------------------
# Main Agent Tool
# -----------------------------
def anticancer_agent_check(smiles: str) -> str:
    cid = smiles_to_cid(smiles)
    info = cid_to_info(cid)
    is_anticancer = check_anticancer(cid)
    targets = fetch_targets(cid)
    moa = fetch_mechanism_of_action(cid)   # <---- NEW
    pubmed_hits = pubmed_search(f"{info['name']} cancer target", max_results=5)

    result = f"🔬 Compound: {info['name']} (CID {cid})\n"
    result += f"📖 Description: {info['description']}\n"
    result += f"🔗 PubChem: {info['pubchem_link']}\n"
    result += f"🎯 Anti-cancer activity (PubChem): {'YES' if is_anticancer else 'No evidence found'}\n"
    result += f"⚙️ Mechanism of Action: {moa}\n"

    if targets:
        result += "\n📌 Known PubChem BioAssay Targets:\n"
        for t in targets[:5]:
            result += f"- {t.get('target_name','?')} (GeneID {t.get('gene_id','?')}) → {t.get('outcome','')}\n"
    else:
        result += "\n📌 No known PubChem BioAssay targets found.\n"

    if pubmed_hits:
        result += "\n📚 PubMed Evidence (top hits):\n"
        for h in pubmed_hits:
            result += f"- {h['title']} ({h['journal']}, {h['pubdate']}) PMID:{h['pmid']}\n"
    else:
        result += "\n📚 No PubMed cancer-target articles found."

    return result


# -----------------------------
# LangChain Agent
# -----------------------------
llm = ChatOpenAI(model="gpt-4o-mini")  # swap with gpt-5 if you want

tools = [
    Tool(
        name="SMILES_to_AntiCancer_Checker",
        func=anticancer_agent_check,
        description="Check if a SMILES corresponds to a known anti-cancer agent; includes PubChem info, targets, and PubMed cancer-target articles."
    )
]

agent = initialize_agent(
    tools,
    llm,
    agent="zero-shot-react-description",
    verbose=True
)

# -----------------------------
# Example run
# -----------------------------
query = "Check if  C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=CC(=C4)C(F)(F)F is an anti-cancer agent and list known targets."
print(agent.run(query))




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if the given SMILES corresponds to a known anti-cancer agent, and if so, identify its targets. 
Action: SMILES_to_AntiCancer_Checker
Action Input: C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=CC(=C4)C(F)(F)F[0m
Observation: [36;1m[1;3m🔬 Compound: EGFR inhibitor (CID 9549299)
📖 Description: 
🔗 PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/9549299
🎯 Anti-cancer activity (PubChem): YES
⚙️ Mechanism of Action: No mechanism of action found

📌 No known PubChem BioAssay targets found.

📚 PubMed Evidence (top hits):
- Current indications for surgery in patients with lung cancer after neoadjuvant targeted therapy: a systematic review. (Updates in surgery, 2025 Sep 25) PMID:40996635
- Exploring the potential toxic mechanisms of bisphenol F exposure in acute myeloid leukemia: Insights from network toxicology, molecular docking and experimental validation. (International immunopharmacology, 2025 Sep 23) PMID:40991

In [46]:
query = "Check if  CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C=C(C=N3)C4=CC=C(C=C4)Cl)F is an anti-cancer agent and list known targets."
print(agent.run(query))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to check if the provided SMILES corresponds to a known anti-cancer agent and gather associated targets. I'll use the SMILES_to_AntiCancer_Checker function for this purpose. 

Action: SMILES_to_AntiCancer_Checker  
Action Input: CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C=C(C=N3)C4=CC=C(C=C4)Cl)F  [0m
Observation: [36;1m[1;3m🔬 Compound: Vemurafenib (CID 42611257)
📖 Description: 
🔗 PubChem: https://pubchem.ncbi.nlm.nih.gov/compound/42611257
🎯 Anti-cancer activity (PubChem): YES
⚙️ Mechanism of Action: No mechanism of action found

📌 No known PubChem BioAssay targets found.

📚 PubMed Evidence (top hits):
- Thyroid cancer: From molecular insights to therapy (Review). (Oncology letters, 2025 Nov) PMID:40980146
- Combining Molecular Dynamics and Machine Learning to Predict Drug Resistance Causing Variants of BRAF in Colorectal Cancer. (Molecules (Basel, Switzerland), 2025 Aug 30) PMID:40942081
- Targeting melanoma resi