In [1]:
!pip install dspy pyaxis



In [1]:
import requests
import json
from pyaxis import pyaxis
from bs4 import BeautifulSoup
import pandas as pd
import dspy
import asyncio
from google.colab import userdata

MAKSTAT_URL = "https://makstat.stat.gov.mk/PXWeb/pxweb/mk/MakStat/search"

lm = dspy.LM('openai/gpt-4o-mini', api_key=userdata.get('OpenAI_API'))
dspy.configure(lm=lm)

In [17]:
from IPython.display import display, HTML
import sys
import textwrap

def keyword_search(keyword: str) -> dict[str, list[str]]:
    """A function that performs keyword search (in Macedonian) to retrieve a list of relevant tables from the National Statistics Office.
    """
    session = requests.Session()

    # Step 1: initial GET
    r = session.get(MAKSTAT_URL, params={"searchquery": keyword})
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Step 2: collect hidden inputs
    data = {}
    for inp in soup.select("input[type=hidden]"):
        if inp.get("name"):
            data[inp["name"]] = inp.get("value", "")

    # Step 3: add your desired options
    data["ctl00$ContentPlaceHolderMain$pxSearch$txtSearch"] = keyword
    data["ctl00$ContentPlaceHolderMain$pxSearch$cmdSearch"] = "Пребарај"
    data["ctl00$ContentPlaceHolderMain$searchOptions"] = "select"   # "Пребарај само"
    data["ctl00$ContentPlaceHolderMain$chkTitle"] = "on"            # "Наслов на табелата"

    # Step 4: POST back
    r2 = session.post(MAKSTAT_URL, data=data)
    r2.raise_for_status()

    # wrap in soup
    soup2 = BeautifulSoup(r2.text, "html.parser")

    rows = soup2.select("tr")
    results = {"titles": [], "links": [], "published": []}

    for row in rows:
        a = row.select_one("td.searchCellTable a")
        pub = row.select_one("td.searchCellPublished")
        if a and pub:
            results["titles"].append(a.get_text(strip=True))
            results["links"].append("https://makstat.stat.gov.mk" + a["href"])
            results["published"].append(pub.get_text(strip=True))

    return results

def get_table(url: str) -> pd.DataFrame:
    """A function that retrieves the full contents of the table in markdown. The table might be long, so use with caution only when necessary.

    Make sure the URL you provide as an argument is exactly what you see in the keyword search results!
    """
    API_BASE_URL = "https://makstat.stat.gov.mk:443/PXWeb/api/v1/"
    API_suffix = url.replace("MakStat__", "").replace("__", "/").split("pxweb/")[1]
    TABLE_URL = API_BASE_URL + API_suffix
    query = {
      "query": [],
      "response": {
        "format": "px"
      }
    }
    response = requests.post(TABLE_URL, json=query)
    if response.status_code == 200:
        print("Request successful!")
    else:
        print(f"Request failed with status code: {response.status_code}")

    with open("result.px", 'wb') as f:
        f.write(response.content)

    #try:
    result = pyaxis.parse('result.px', encoding='utf-8')
    #except UnicodeDecodeError:
    #    print("UTF-8 decoding failed, trying ISO-8859-5")
    #    result = pyaxis.parse('result.px', encoding='ISO-8859-5')


    table_df = pd.DataFrame(result['DATA'])

    return table_df.to_markdown()


react_agent = dspy.ReAct(
    signature="question -> answer",
    tools=[keyword_search, get_table],
    max_iters=5
)

#result = react_agent(question="Колку табели можеш да најдеш поврзани со трговија?")

stream_listeners = [
    # dspy.ReAct has a built-in output field called "next_thought".
    dspy.streaming.StreamListener(signature_field_name="next_thought", allow_reuse=True),
]
stream_react = dspy.streamify(react_agent, stream_listeners=stream_listeners)

async def read_output_stream(width=80):
    output = stream_react(question="Колку табели можеш да најдеш поврзани со трговија?")
    return_value = None
    async for chunk in output:
        if isinstance(chunk, dspy.streaming.StreamResponse):
            #print(chunk.chunk)
            raw = getattr(chunk, "chunk", chunk)
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8", "replace")
            # remove leading newlines (common when server sends "\n" at chunk start)
            raw = raw.lstrip("\n")
            # if you also want to strip internal newlines, use replace:
            # raw = raw.replace("\n", " ")
            sys.stdout.write(raw)
            sys.stdout.flush()
        elif isinstance(chunk, dspy.Prediction):
            return_value = chunk
    return return_value

print(await read_output_stream())

Multilingual PX file


ERROR:pyaxis.pyaxis:Generic exception: Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 142, in parse
    pc_axis = read(uri, encoding, timeout)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 112, in read
    raw_pcaxis = file_object.read()
                 ^^^^^^^^^^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 220: invalid start byte



Имам проблем со добивањето на деталите за втората табела. Можеби ќе треба да направам ново пребарување за табели поврзани со трговија за да видиме дали можам да најдам друга табела или да добијам информации од друга извор.Не можам да добијам деталите за втората табела поради проблем со кодирањето. Можеби ќе треба да завршам со задачата, бидејќи веќе имам информации за првата табела и не можам да добијам дополнителни податоци.

In [18]:
result = react_agent(question="Пребарај табели поврзани со трговија и дај ми го линкот за табелата за моторни возила?")
print(result.answer)
print("Tool calls made:", result.trajectory)

Multilingual PX file


In [19]:
result = react_agent(question="Прво најди ги табелите поврзани со образование, и потоа извлечи ја најинтересната и покажи ми ја.")
print(result.answer)
print("Tool calls made:", result.trajectory)

ERROR:pyaxis.pyaxis:Generic exception: Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 142, in parse
    pc_axis = read(uri, encoding, timeout)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 112, in read
    raw_pcaxis = file_object.read()
                 ^^^^^^^^^^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 222: invalid continuation byte



ERROR:pyaxis.pyaxis:Generic exception: Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 142, in parse
    pc_axis = read(uri, encoding, timeout)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pyaxis/pyaxis.py", line 112, in read
    raw_pcaxis = file_object.read()
                 ^^^^^^^^^^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 239: invalid continuation byte



Multilingual PX file


# Build eval dataset