In [1]:
import requests
import json
import asyncio
import re

from typing import Optional
from agents import Agent, function_tool
from agents import Runner
from toyaikit.chat import IPythonChatInterface
from toyaikit.chat.runners import OpenAIAgentsSDKRunner
from minsearch import Index
from bs4 import BeautifulSoup
from markdown import markdown as md_to_html
from minsearch import AppendableIndex

In [2]:
def get_page_content(url: str) -> str:
    """
    Fetch a page through the Jina Reader proxy and return its body as text.

    The target `url` is appended to the `https://r.jina.ai/` prefix, the
    resulting address is requested with GET, and the response body is decoded
    as UTF-8.

    NOTE(review): a later cell defines `get_page_content` again; in a
    top-to-bottom run that later definition replaces this one.

    Args:
        url (str): Address of the page to fetch.

    Returns:
        str: The UTF-8 decoded response body.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            a timeout when the service does not respond within 10 seconds.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    reader_url_prefix = 'https://r.jina.ai/'
    request_url = reader_url_prefix + url
    # requests has no default timeout; without one a stalled server would
    # block this call (and the notebook cell / agent tool call) forever.
    response = requests.get(request_url, timeout=10)
    return response.content.decode('utf8')

In [3]:
reader_url_prefix = "https://r.jina.ai/"

def get_page_content(url: str) -> Optional[str]:
    """
    Fetch the Markdown content of a web page using the Jina Reader service.

    This function prepends the Jina Reader proxy URL to the provided `url`,
    sends a GET request with a 10-second timeout, and decodes the response
    body as UTF-8 text.

    Args:
        url (str): The URL of the page to fetch. Surrounding whitespace is
            ignored.

    Returns:
        Optional[str]: The page content as text (Markdown produced by the
        reader service) if the request succeeds; None if `url` is blank, the
        request fails, the server answers with a 4xx/5xx status, or the body
        cannot be decoded as UTF-8.

    Note:
        Network and decoding errors are not propagated; they are printed and
        reported to the caller as a None return value.
    """
    # A blank URL would request the bare reader prefix instead of any page,
    # returning content unrelated to what the caller asked for.
    if not url or not url.strip():
        print(f"Error fetching content from {url!r}: empty URL")
        return None

    reader_url = reader_url_prefix + url.strip()

    try:
        response = requests.get(reader_url, timeout=10)
        response.raise_for_status()  # raises for 4xx/5xx HTTP errors
        return response.content.decode("utf-8")
    except (requests.exceptions.RequestException, UnicodeDecodeError) as e:
        # Report the failure but keep the notebook / agent tool call alive.
        print(f"Error fetching content from {url}: {e}")
        return None

In [4]:
def sliding_window(seq, size, step):
    """
    Split `seq` into overlapping slices.

    Slices start at offsets 0, step, 2*step, ... and each one spans up to
    `size` items. Iteration ends with the first slice that reaches the end of
    `seq`, so no redundant shorter tail slices are produced.

    Args:
        seq: Any sliceable sequence with a length (list, str, ...).
        size: Maximum number of items per slice.
        step: Distance between the starts of consecutive slices.

    Returns:
        list: The slices, in order; empty when `seq` is empty.
    """
    total = len(seq)
    windows = []
    for start in range(0, total, step):
        end = start + size
        windows.append(seq[start:end])
        # This slice already covers the tail of the sequence: stop here.
        if end >= total:
            break
    return windows

In [5]:
def clean(text):
    """
    Remove bracketed numeric markers such as "[12]" and normalise spacing.

    Each marker, together with the whitespace around it, is replaced by a
    single space. The result is then stripped, and any run of two or more
    whitespace characters is collapsed into one space.
    """
    without_markers = re.sub(r"\s*\[\d+\]\s*", " ", text)
    trimmed = without_markers.strip()
    return re.sub(r"\s{2,}", " ", trimmed)

In [6]:
def to_sentences(text):
    """
    Split `text` into sentences.

    A split happens at whitespace that directly follows ".", "!" or "?".
    Every piece is stripped, and pieces that end up empty are dropped.

    Returns:
        list[str]: The non-empty sentences in their original order.
    """
    sentences = []
    for piece in re.split(r"(?<=[.!?])\s+", text):
        stripped = piece.strip()
        if stripped:
            sentences.append(stripped)
    return sentences

In [7]:
def html_to_paragraph_text(html):
    """
    Reduce an HTML document to one cleaned string of paragraph text.

    The text of every <p> element is collected (inner fragments joined with
    a space, each fragment stripped). When the document contains no <p>
    elements, the text of the whole document is used instead. The collected
    pieces are joined with spaces and passed through `clean`.
    """
    soup = BeautifulSoup(html, "lxml")

    paragraphs = []
    for tag in soup.find_all("p"):
        paragraphs.append(tag.get_text(" ", strip=True))

    # Fallback: no <p> tags at all, take the text of the entire document.
    if len(paragraphs) == 0:
        paragraphs.append(soup.get_text(" ", strip=True))

    joined = " ".join(paragraphs)
    return clean(joined)

In [8]:
# Fetch the Capybara article through the reader proxy. get_page_content
# returns None when the request fails, so stop here with a clear message
# instead of letting the Markdown converter fail on a None input.
page_markdown = get_page_content("https://en.wikipedia.org/wiki/Capybara")
if page_markdown is None:
    raise RuntimeError("Could not fetch https://en.wikipedia.org/wiki/Capybara")

# Markdown -> HTML -> cleaned paragraph text.
html = md_to_html(page_markdown)
text = html_to_paragraph_text(html)

# Preview the first 500 characters; the ellipsis marks a truncated preview.
print(text[:500] + ("..." if len(text) > 500 else ""))

Title: Capybara URL Source: https://en.wikipedia.org/wiki/Capybara Published Time: 2001-10-13T20:04:37Z Markdown Content: From Wikipedia, the free encyclopedia | Capybara |
| --- |
| |
| In Petrópolis , Brazil |
| Conservation status |
| Least Concern ( IUCN 3.1 ) |
| Scientific classification |
| Kingdom: | Animalia |
| Phylum: | Chordata |
| Class: | Mammalia |
| Order: | Rodentia |
| Family: | Caviidae |
| Genus: | Hydrochoerus |
| Species: | H.hydrochaeris |
| Binomial name |
| Hydrochoerus ...


In [9]:
# Chunking parameters: each chunk holds up to 8 sentences and a new chunk
# starts every 4 sentences, so consecutive chunks overlap.
WINDOW_SENTENCES = 8
STEP_SENTENCES = 4

sentences = to_sentences(text)
windows = sliding_window(
    sentences,
    size=WINDOW_SENTENCES,
    step=STEP_SENTENCES,
)

# One document per non-empty window; ids are the 1-based window positions.
docs = [
    {"id": position, "text": " ".join(chunk), "source": "markdown"}
    for position, chunk in enumerate(windows, start=1)
    if chunk
]

print(f"The system has generated  {len(docs)} chunks.")
print(docs[0]["text"][:400] + "...")

The system has generated  30 chunks.
Title: Capybara URL Source: https://en.wikipedia.org/wiki/Capybara Published Time: 2001-10-13T20:04:37Z Markdown Content: From Wikipedia, the free encyclopedia | Capybara |
| --- |
| |
| In Petrópolis , Brazil |
| Conservation status |
| Least Concern ( IUCN 3.1 ) |
| Scientific classification |
| Kingdom: | Animalia |
| Phylum: | Chordata |
| Class: | Mammalia |
| Order: | Rodentia |
| Family: | ...


In [10]:
# Build the search index over the "text" field of the chunk documents.
index = AppendableIndex(text_fields=["text"])
index.fit(docs)

# Test search: fetch only the single best-matching chunk and print it.
results = index.search(
    query='What are threats to capybara populations?',
    num_results=1,
)
print(results)

[{'id': 9, 'text': 'They are superb swimmers and can hold their breath underwater for up to five minutes at a time. Capybara have flourished in cattle ranches. They roam in home ranges averaging 10 hectares (25 acres) in high-density populations. Many escapees from captivity can also be found in similar watery habitats around the world. Sightings are fairly common in Florida , although a breeding population has not yet been confirmed. In 2011, one specimen was spotted on the Central Coast of California . These escaped populations occur in areas where prehistoric capybaras inhabited; late Pleistocene capybaras inhabited Florida and Hydrochoerus hesperotiganites in California and Hydrochoerus gaylordi in Grenada , and feral capybaras in North America may actually fill the ecological niche of the Pleistocene species. A capybara eating hay at Franklin Park Zoo , Boston, Massachusetts Capybaras are herbivores , grazing mainly on grasses and aquatic plants , as well as fruit and tree bark.',

In [11]:
def search(query: str):
    """
    Look up the indexed text chunks that best match `query`.

    Delegates to the module-level `index` and requests the five best hits.

    Args:
        query (str): The search text.

    Returns:
        The value returned by `index.search` for the five best matches.
    """
    top_k = 5
    return index.search(query=query, num_results=top_k)

In [12]:
# System instructions handed to the agent.
assistant_instructions = """
You're a helpful assistant that helps answer user questions.
"""

# The agent gets two tools: fetching a page by URL and searching the
# chunk index built earlier in the notebook.
assistant = Agent(
    name='assistant',
    instructions=assistant_instructions,
    model='gpt-4o-mini',
    tools=[
        function_tool(get_page_content),
        function_tool(search),
    ],
)

In [13]:
# Runner comes from the `agents` package (see the imports cell).
# NOTE(review): the name `runner` is rebound further down to an
# OpenAIAgentsSDKRunner, so its type depends on which cell ran last.
runner = Runner()

In [14]:
user_prompt = "Summarize the content of https://en.wikipedia.org/wiki/Capybara "
result = await runner.run(assistant, input=user_prompt)

In [15]:
# Chat front-end from toyaikit for use inside IPython / Jupyter.
chat_interface = IPythonChatInterface()

# Wrap the assistant and the chat front-end in toyaikit's runner.
# Note: this rebinds the name `runner` used by the earlier cells.
runner = OpenAIAgentsSDKRunner(
    agent=assistant,
    chat_interface=chat_interface,
)

In [None]:
# `runner` is the OpenAIAgentsSDKRunner created in the previous cell;
# presumably this starts the interactive chat session (confirm in toyaikit).
# The trailing semicolon keeps the notebook from echoing the return value.
await runner.run();