In [1]:
from smolagents import ToolCollection, ToolCallingAgent, OpenAIServerModel
from mcp import StdioServerParameters
import os
import json

In [2]:
# Read the Nebius API key from the local `secret.txt` file and expose it
# through the environment. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('secret.txt', 'r') as secret_file:
    os.environ['NEBIUS_API_KEY'] = secret_file.read().strip()

In [3]:
# Location of the KEGG MCP server checkout. It can be overridden with the
# KEGG_MCP_DIR environment variable; the default is the original local path,
# so behaviour is unchanged when the variable is not set.
KEGG_MCP_DIR = os.environ.get(
    "KEGG_MCP_DIR", "/mnt/c/Users/Uniholder/Git/KEGG-MCP-Server"
)

# Launch parameters for the server: run the built entry point with Node in
# "stdio" mode, using the checkout as the working directory.
server = StdioServerParameters(
    command="node",
    args=[f"{KEGG_MCP_DIR}/build/index.js", "stdio"],
    cwd=KEGG_MCP_DIR,
)

In [4]:
# Identifier of the hosted model used by the agent.
MODEL = "Qwen/Qwen3-235B-A22B-Instruct-2507"

# Model client pointed at the Nebius endpoint; the key comes from the
# NEBIUS_API_KEY environment variable set earlier in the notebook.
model = OpenAIServerModel(
    api_base="https://api.studio.nebius.com/v1/",
    api_key=os.environ["NEBIUS_API_KEY"],
    model_id=MODEL,
    temperature=0,
)

In [5]:
with ToolCollection.from_mcp(
    server_parameters=server,
    trust_remote_code=True,
    structured_output=True,
) as tools:
    # Sanity check: confirm that the server's tools were actually picked up.
    print([loaded_tool.name for loaded_tool in tools.tools])

['get_database_info', 'list_organisms', 'search_pathways', 'get_pathway_info', 'get_pathway_genes', 'search_genes', 'get_gene_info', 'search_compounds', 'get_compound_info', 'search_reactions', 'get_reaction_info', 'search_enzymes', 'get_enzyme_info', 'search_diseases', 'get_disease_info', 'search_drugs', 'get_drug_info', 'get_drug_interactions', 'search_modules', 'get_module_info', 'search_ko_entries', 'get_ko_info', 'search_glycans', 'get_glycan_info', 'search_brite', 'get_brite_info', 'get_pathway_compounds', 'get_pathway_reactions', 'get_compound_reactions', 'get_gene_orthologs', 'batch_entry_lookup', 'convert_identifiers', 'find_related_entries']


In [6]:
def check_tool(tool, **kwargs):
    """Call one MCP tool directly (no agent involved) and print its output.

    A ``ToolCollection`` context for the configured ``server`` is opened for
    the duration of the call.

    Args:
        tool: Name of the tool to invoke, e.g. ``'search_genes'``.
        **kwargs: Arguments for the tool; they are handed to it as one dict.

    Raises:
        IndexError: If the collection has no tool named ``tool``. The message
            lists the tool names that are available.
    """
    with ToolCollection.from_mcp(server, trust_remote_code=True, structured_output=False) as tc:
        ex = next((t for t in tc.tools if t.name == tool), None)
        if ex is None:
            # Same exception type as the former `[...][0]` lookup, but with a
            # message that says what went wrong and what can be used instead.
            available = sorted(t.name for t in tc.tools)
            raise IndexError(f"Unknown tool {tool!r}; available tools: {available}")
        # `kwargs` is already an empty dict when no arguments were supplied.
        out = ex(kwargs)
        print(out)

In [7]:
# Smoke test: call the `search_genes` tool directly with the query "APOE".
check_tool('search_genes', query='APOE')

{
  "query": "APOE",
  "database": "genes",
  "total_found": 1723,
  "returned_count": 50,
  "genes": {
    "hsa:348": "APOE, AD2, APO-E, ApoE4, LDLCQ5, LPG; apolipoprotein E",
    "hsa:7804": "LRP8, APOER2, HSZ75190, LRP-8, MCI1; LDL receptor related protein 8",
    "hsa:4035": "LRP1, A2MR, APOER, APR, CD91, DDH3, IGFBP-3R, IGFBP3R, IGFBP3R1, KPA, LRP, LRP1A, TGFBR5; LDL receptor related protein 1",
    "ptr:449586": "APOE; apolipoprotein E precursor",
    "pps:100967669": "APOE; LOW QUALITY PROTEIN: apolipoprotein E",
    "ggo:101135057": "APOE; LOW QUALITY PROTEIN: apolipoprotein E",
    "pon:100448039": "APOE; LOW QUALITY PROTEIN: apolipoprotein E",
    "ppyg:129019963": "K04524 apolipoprotein E | APOE; (RefSeq) LOW QUALITY PROTEIN: apolipoprotein E",
    "nle:100598188": "APOE; LOW QUALITY PROTEIN: apolipoprotein E",
    "hmh:116482923": "APOE; apolipoprotein E",
    "ssyn:129466996": "K04524 apolipoprotein E | APOE; (RefSeq) LOW QUALITY PROTEIN: apolipoprotein E",
    "mcc:714623

In [30]:
# Smoke test: ask `find_related_entries` for disease entries linked to the
# gene entry hsa:348.
check_tool('find_related_entries', source_db='gene', target_db='disease', source_entries=['hsa:348'])

{
  "source_db": "gene",
  "target_db": "disease",
  "link_count": 1,
  "links": {
    "hsa:348": "ds:H01168"
  }
}


In [8]:
# System prompt installed on the agent. It contains no placeholders, so it is
# a plain string literal rather than an f-string.
SYSTEM_PROMPT = '''
You are a bioinformatics knowledge extraction agent connected exclusively to the KEGG MCP server.
Your task is to retrieve and summarize pathway- and function-level information for a given human gene or protein.

Focus on mapping between sequence-level identifiers (UniProt ID or HGNC symbol) and their KEGG pathways, modules, and processes.

For each gene/protein:
- Identify its KEGG entry (e.g. hsa:348 for APOE).
- Retrieve all associated KEGG pathways.
- For each pathway, extract:
  • pathway ID and name
  • short description
  • functional category (Metabolism, Genetic Information Processing, Environmental Information Processing, Cellular Processes, Organismal Systems, Human Diseases)
  • molecular function of the protein within this pathway
- Identify any pathways related to:
  “aging”, “longevity”, “oxidative stress”, “neurodegeneration”, “inflammation”, “FOXO”, “mTOR”, “AMPK”, “SIRT”, “autophagy”.
- If available, retrieve KEGG orthologs (KO identifiers) and enzyme commission (EC) numbers.
- Provide links to KEGG pages for the main entry and each pathway.

Return the result in a structured format: JSON or Markdown table with clear sections.

'''

In [31]:
# Gene to query; interpolated into the user prompt below.
gene_symbol = 'NRF2'

# User prompt for the agent. Literal JSON braces are doubled ({{ }}) because
# this is an f-string; single-brace {gene_symbol} is replaced with the symbol.
# NOTE(review): step 4 passes only "gene_id" to `get_gene_orthologs`; the
# earlier wording both required and forbade "target_organisms" and showed an
# example with a "species" key. Confirm the argument names against the
# server's tool schema.
user_prompt_kegg = f"""
You are a bioinformatics extraction agent connected exclusively to the KEGG MCP server.
Your task is to retrieve all relevant KEGG data for a given human gene or protein.

Use only these tools:
- search_genes
- get_gene_info
- find_related_entries
- get_pathway_info
- get_disease_info
- get_drug_info
- get_module_info
- get_ko_info
- get_gene_orthologs

All information must come directly from KEGG endpoints.

---

### INPUT
Gene symbol: {gene_symbol}
Species: Homo sapiens

---

### TASKS

1. **Locate KEGG gene entry**
   - Use `search_genes` with the symbol and organism.
   - Extract: KEGG ID (e.g. "hsa:348"), gene name, KO (K-number), genomic position, strand, start, end.

2. **Retrieve detailed gene info**
   - Use `get_gene_info` and, if available, `get_ko_info` for functional class.
   - From `get_gene_info`, extract genomic location and parse to:
     position_text (e.g., "chr19:44905754..44909395"), strand, start, end.
   - Add a concise summary of the protein’s main function from KEGG descriptions.

3. **Find all directly related KEGG entities**
   - Use `find_related_entries` to discover:
     - Pathways (`path:...`)
     - Diseases (`ds:...` or `Hxxxxx`)
     - Drugs (`dr:Dxxxxx`)
     - Modules (`md:Mxxxxx`)
   - For each entity, call the corresponding `get_*_info` tool to extract summary data.
   - Include all relevant entities (not just the first).
   - For each section:
     • **Pathways:** add ID, title, description, KEGG map URL, image URL, and a note summarizing the gene’s functional role.  
     • **Diseases:** include name, short summary, BRITE class (if any), KEGG link, and a note connecting {gene_symbol} to the disease mechanism.  
     • **Drugs:** include pharmacological class, structure image, target flag, and related pathways.  
     • **Modules:** include definition, functional class, and KEGG link.
   - In “notes”, highlight any relationships to longevity, aging, oxidative stress, inflammation, FOXO, mTOR, AMPK, SIRT, or autophagy.

4. **Retrieve orthology information via `get_gene_orthologs`.**
    - Call exactly: {{"gene_id":"<hsa_id>"}}
    - STRICT: Do NOT include "target_organisms" (invalid).
    - Return ≤10 orthologs (highest identity/SW score). If available, also include human paralogs from SSDB result.
    - Extract top ≤3 paralogs (within Homo sapiens) if available.
    - Include identity %, overlap, SW score, and KEGG entry URLs.
    - If available, include dendrogram URL from SSDB.
    - example: get_gene_orthologs {{"gene_id":"hsa:348"}}

5. **Return results as structured JSON**  
   Include only KEGG-derived data in this structure.
OUTPUT SCHEMA:
{{
  "query": "{gene_symbol}",
  "kegg": {{
    "entry": {{
      "hsa_id": "hsa:NNNN",
      "symbol": "<symbol>",
      "name": "<full gene name>",
      "ko": "<Kxxxxx>",
      "organism": "Homo sapiens",
      "position_text": "chr:start..end",
      "strand": "+|-",
      "start": <int>,
      "end": <int>,
      "notes": <text>
    }},
    "pathways": [
      {{
        "map_id": "hsaXXXXX",
        "title": "<pathway title>",
        "map_url": "https://www.kegg.jp/pathway/hsaXXXXX",
        "image_url": "https://www.kegg.jp/kegg/pathway/mapXXXXX.png",
        "notes": "<brief description of the gene's role>"
      }}
    ],
    "diseases": [
      {{
        "entry_id": "HNNNNN",
        "name": "<disease name>",
        "description": "<summary>",
        "brite": ["<classification>"],
        "urls": ["https://www.kegg.jp/entry/HNNNNN"],
        "notes": <text>
      }}
    ],
    "drugs": [
      {{
        "entry_id": "Dxxxxx",
        "name": "<drug name>",
        "class": ["<pharmacological class>"],
        "efficacy": "<short text>",
        "targets": [{{"gene": "hsa:NNNN", "symbol": "<...>", "ko": "<Kxxxxx>"}}],
        "pathways": ["hsaXXXXX"],
        "structure_image_url": "https://www.kegg.jp/ligand/Dxxxxx",
        "is_target_of_gene": <true|false>,
        "notes": <text>
      }}
    ],
    "modules": [
      {{
        "entry_id": "Mxxxxx",
        "name": "<module name>",
        "definition": "<summary>",
        "class": "<functional class>",
        "urls": ["https://www.kegg.jp/module/Mxxxxx"]
      }}
    ],
    "ssdb": {{
      "orthologs_top10": [
        {{
          "species_entry": "<org:gene_id>",
          "ko": "<Kxxxxx>",
          "identity": <float>,
          "overlap": <int>,
          "entry_url": "https://..."
        }}
      ],
      "paralogs": [
        {{
          "hsa_entry": "hsa:NNNN",
          "ko": "<Kxxxxx>",
          "identity": <float>,
          "overlap": <int>,
          "entry_url": "https://..."
        }}
      ]
    }},
    "sources": ["https://www.kegg.jp/entry/<hsa_id>"]
  }}
}}

---

### VALIDATION RULES BEFORE RETURNING
Before returning JSON, VERIFY:
- entry.position_text/strand/start/end are NOT null.
- If `find_related_entries` returns any items, ensure corresponding sections (pathways, diseases, drugs, modules) are populated.
- Each section should include at least one valid KEGG URL.
- Sources must include the main entry and all included pathway/disease/module URLs.
- Notes fields must summarize the biological or functional relevance.
- If any section expected from `find_related_entries` is empty, retry that tool once and rebuild JSON.

---

### OUTPUT FORMAT
Return only the final JSON object as described in the OUTPUT SCHEMA section (no prose, no Markdown formatting).
"""

In [32]:
# Keep the MCP tool collection open only for the duration of the agent run.
with ToolCollection.from_mcp(
    server_parameters=server, trust_remote_code=True, structured_output=False
) as tools:
    agent = ToolCallingAgent(
        tools=list(tools.tools),
        model=model,
        max_steps=10,
        add_base_tools=False,
    )
    # Replace the agent's system prompt template with the KEGG-specific one
    # before starting the run.
    agent.prompt_templates["system_prompt"] = SYSTEM_PROMPT
    result = agent.run(user_prompt_kegg)

In [33]:
# Show the value returned by `agent.run` (the prompt asks for a single JSON object).
print(result)

{"query": "NRF2", "kegg": {"entry": {"hsa_id": "hsa:4780", "symbol": "NFE2L2", "name": "NFE2 like bZIP transcription factor 2", "ko": "K05638", "organism": "Homo sapiens", "position_text": null, "strand": null, "start": null, "end": null, "notes": "NRF2 (NFE2L2) is a master transcription factor that regulates the expression of antioxidant proteins and phase II detoxifying enzymes in response to oxidative stress. It plays a central role in cellular defense against reactive oxygen species (ROS), inflammation, and xenobiotics. Activation of NRF2 is linked to longevity, neuroprotection, and resistance to age-related diseases."}, "pathways": [{"map_id": "hsa05418", "title": "Fluid shear stress and atherosclerosis", "map_url": "https://www.kegg.jp/pathway/hsa05418", "image_url": "https://www.kegg.jp/kegg/pathway/map05418.png", "notes": "NRF2 is involved in the endothelial response to fluid shear stress, where it activates antioxidant and anti-inflammatory genes that protect against atheroscl

### Test

In [35]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [38]:
from kegg import run_query

In [39]:
# Run the `run_query` helper imported from the local `kegg` module for SOX2.
# NOTE(review): `run_query` is not shown in this notebook — presumably it wraps
# the agent workflow above; confirm its return type before relying on `output`.
output = run_query('SOX2')