# SearXNG

In [None]:
from pyserxng.models import SafeSearchLevel, SearchConfig, TimeRange, SearchCategory
from pyserxng import SearXNGClient
from pyserxng.models import InstanceInfo 

# --- Search parameters ---
url_sear = "http://localhost:8888"  # SearXNG instance that receives the query
target_site = "kajima.co.jp"
query = f"site:{target_site} filetype:pdf ir report"

# --- Client, target instance and search options ---
client = SearXNGClient()
local_instance = InstanceInfo(url=url_sear)
config = SearchConfig(engines=["duckduckgo"])
config.time_range = TimeRange.YEAR

# --- Run the search against the local instance ---
results = client.search(query, instance=local_instance, config=config)

# --- Collect every result URL for the download step ---
report_urls = []
if not results.results:
    print("  No results\n")
else:
    for result in results.results:
        print(f"  URL: {result.url}")
        report_urls.append(result.url)

2026-01-27 22:07:15,219 - pyserxng.client - INFO - Search completed: 1 results in 1.17s from http://localhost:8888/


  URL: http://localhost:8888/info/en/about


In [None]:
import os
import requests
from urllib.parse import urlparse

def _report_filename(url):
    """Derive a local ``.pdf`` file name from the path component of *url*."""
    parsed = urlparse(url)
    # Strip trailing slashes so ".../docs/" yields "docs" instead of an empty name.
    filename = os.path.basename(parsed.path.rstrip("/"))
    if not filename:
        # Bare-host URL: fall back to the host so the file is still identifiable.
        filename = parsed.netloc.replace(":", "_") or "download"
    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    return filename


def download_reports(report_urls, save_dir="reports", timeout=30):
    """
    Download a list of report URLs into ``save_dir`` and return the saved paths.

    Each download is best-effort: a failing URL is reported on stdout and
    skipped, and the remaining URLs are still processed.

    :param report_urls: list[str] or list[HttpUrl]
    :param save_dir: folder to save pdf files (created if missing)
    :param timeout: request timeout (seconds)
    :return: list[str] paths of the files that were fully downloaded
    """
    os.makedirs(save_dir, exist_ok=True)
    saved_files = []

    for url in report_urls:
        url = str(url)  # HttpUrl -> str
        part_path = None
        try:
            filename = _report_filename(url)
            save_path = os.path.join(save_dir, filename)
            # Stream into a temporary ".part" file so an interrupted transfer
            # never leaves a truncated PDF (or clobbers an earlier good copy).
            part_path = save_path + ".part"

            print(f"⬇️ Downloading: {filename}")
            # The context manager releases the connection even if streaming fails.
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()

                with open(part_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(part_path, save_path)
            saved_files.append(save_path)
            print(f"✅ Saved: {save_path}")

        except Exception as e:
            print(f"❌ Failed: {url}")
            print(f"   Reason: {e}")
            # Remove the partial file, if any; cleanup itself must not abort the loop.
            if part_path is not None and os.path.exists(part_path):
                try:
                    os.remove(part_path)
                except OSError:
                    pass

    return saved_files

In [None]:
# Keep each site's reports in its own sub-folder under data/.
save_dir = f"data/{target_site}"
saved_report = download_reports(report_urls, save_dir=save_dir)

⬇️ Downloading: ir_e_all_2.pdf
✅ Saved: data/kajima.co.jp/ir_e_all_2.pdf
⬇️ Downloading: ir_e_p03-04.pdf
✅ Saved: data/kajima.co.jp/ir_e_p03-04.pdf
⬇️ Downloading: ir_e_p105-106.pdf
✅ Saved: data/kajima.co.jp/ir_e_p105-106.pdf
⬇️ Downloading: ir_e_p63-104.pdf
✅ Saved: data/kajima.co.jp/ir_e_p63-104.pdf
⬇️ Downloading: ir_e_p107-108.pdf
✅ Saved: data/kajima.co.jp/ir_e_p107-108.pdf
⬇️ Downloading: ir_e_p23-32.pdf
✅ Saved: data/kajima.co.jp/ir_e_p23-32.pdf
⬇️ Downloading: ir_e_p13-22.pdf
✅ Saved: data/kajima.co.jp/ir_e_p13-22.pdf
⬇️ Downloading: ir_e_p05-10.pdf
✅ Saved: data/kajima.co.jp/ir_e_p05-10.pdf
⬇️ Downloading: 20250514-fs.pdf
✅ Saved: data/kajima.co.jp/20250514-fs.pdf


# PageIndex

In [9]:
from pageindex import PageIndexClient
import pageindex.utils as utils
import os
from dotenv import load_dotenv
# Load variables from a local .env file before reading the API key.
load_dotenv()

# The key is looked up in the environment; it is None when the variable is unset.
pi_client = PageIndexClient(
    api_key=os.environ.get("PAGEINDEX_API_KEY"),
)

### Add one file

In [None]:
# Submit only the first downloaded report; the response carries the new doc_id.
first_report_path = saved_report[0]
resp = pi_client.submit_document(first_report_path)
print(resp)  # e.g. {'doc_id': 'pi-cmkuy164101lg09pgciqo5oqj'}

{'doc_id': 'pi-cmkuy164101lg09pgciqo5oqj'}


#### Get tree

In [12]:
doc_id = resp['doc_id']

# Fetch the tree only once the document is ready for retrieval;
# otherwise `tree` stays undefined and this cell must be re-run later.
if not pi_client.is_retrieval_ready(doc_id):
    print("Processing document, please try again later...")
else:
    tree_response = pi_client.get_tree(doc_id, node_summary=True)
    tree = tree_response['result']
    print('Simplified Tree Structure of the Document:')
    utils.print_tree(tree)

Simplified Tree Structure of the Document:
[{'title': '2025', 'node_id': '0000', 'summary': 'This document, the KAJIMA Integrated Rep...'},
 {'title': '23 countries and regions',
  'node_id': '0001',
  'prefix_summary': "The text outlines Kajima's global presen...",
  'nodes': [{'title': 'Carrying forward the Kajima DNA',
             'node_id': '0002',
             'summary': 'The text discusses the enduring relevanc...'},
            {'title': "The Kajima Group's Vision",
             'node_id': '0003',
             'summary': "The Kajima Group's vision, outlined in i..."},
            {'title': 'Awareness of the business environment',
             'node_id': '0004',
             'summary': 'The text discusses the increasing uncert...'},
            {'title': 'Review of FY2024',
             'node_id': '0005',
             'summary': 'The text reviews FY2024, highlighting in...'},
            {'title': 'Enhancing the profitability of our value...',
             'node_id': '0006',
   

In [13]:
tree  # display the raw tree, including each node's full 'text' (large output)

[{'title': '2025',
  'node_id': '0000',
  'page_index': 1,
  'summary': "This document, the KAJIMA Integrated Report 2025, outlines Kajima's corporate philosophy, emphasizing creative progress, humanitarian outlook, and societal contribution through its construction business since 1840. It details the Kajima Group Vision, which centers on embracing challenges with ideas and technology, and highlights core values of Openness, Diversity, and Initiative, alongside a strong Corporate Culture. The report also presents key financial and operational data at a glance, including consolidated revenues, operating income, net income, owners' equity, ROE, dividends, group company structure, employee numbers, and main business areas, all indicating growth and development.",
  'text': "# 2025\n\nKajima's Corporate Philosophy\n\nAs a group of individuals working together as one, we pursue creative progress and development founded on both rational, scientific principles and a humanitarian outlook, thro

In [15]:
# Build a compact view of the tree with the bulky 'text' field removed.
# NOTE(review): tree.copy() is a shallow copy — confirm utils.remove_fields
# does not mutate the nested node dicts of the original `tree`.
tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])
tree_without_text

[{'title': '2025',
  'node_id': '0000',
  'page_index': 1,
  'summary': "This document, the KAJIMA Integrated Report 2025, outlines Kajima's corporate philosophy, emphasizing creative progress, humanitarian outlook, and societal contribution through its construction business since 1840. It details the Kajima Group Vision, which centers on embracing challenges with ideas and technology, and highlights core values of Openness, Diversity, and Initiative, alongside a strong Corporate Culture. The report also presents key financial and operational data at a glance, including consolidated revenues, operating income, net income, owners' equity, ROE, dividends, group company structure, employee numbers, and main business areas, all indicating growth and development."},
 {'title': '23 countries and regions',
  'node_id': '0001',
  'page_index': 5,
  'prefix_summary': "The text outlines Kajima's global presence with subsidiaries in 23 countries and regions. It includes a message from President H

In [25]:
import openai
import os
from dotenv import load_dotenv
import json

# Load variables from a local .env file before constructing the client.
load_dotenv()

# NOTE(review): this rebinds `client`, which held the SearXNG client in the
# search cell; presumably credentials / base URL come from the environment — confirm.
client = openai.OpenAI()

def call_llm_openai(
    prompt: str,
    model: str = "gemini-2.5-flash",
    temperature: float = 0.2,
) -> str:
    """
    Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` to call the chat-completions endpoint.

    :param prompt: text sent as the only (user) message
    :param model: model name forwarded to the API
    :param temperature: sampling temperature forwarded to the API
    :return: first choice's message content with surrounding whitespace
             stripped, or "" when the API returns no text content
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
    )
    # `content` is nullable in the chat-completions response; guard so a
    # text-less reply does not crash with AttributeError on `.strip()`.
    content = response.choices[0].message.content
    return (content or "").strip()

In [29]:
def _parse_json_reply(reply: str):
    """Parse an LLM reply as JSON, tolerating code fences or text around the object."""
    text = reply.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Models often wrap the JSON in ```json fences or add a preamble;
        # retry on the outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def tree_search(
    query: str,
    tree: "list | dict",
):
    """
    Ask the LLM which nodes of a document tree are likely to answer ``query``.

    The 'text' field is removed from the tree before it is serialized into
    the prompt, so only the remaining node fields are sent.

    :param query: the question to answer
    :param tree: document tree as returned by the PageIndex client
    :return: parsed JSON reply; the prompt requests the keys "thinking" and "node_list"
    :raises json.JSONDecodeError: if no JSON object can be parsed from the reply
    """
    tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])

    search_prompt = f"""
    You are given a question and a tree structure of a document.
    Each node contains a node id, node title, and a corresponding summary.
    Your task is to find all nodes that are likely to contain the answer to the question.
    Not use markdown

    Question: {query}

    Document tree structure:
    {json.dumps(tree_without_text, indent=2)}

    Please reply in the following JSON format:
    {{
        "thinking": "<Your thinking process on which nodes are relevant to the question>",
        "node_list": ["node_id_1", "node_id_2", ..., "node_id_n"]
    }}
    Directly return the final JSON structure. Do not output anything else.
    """

    tree_search_result = call_llm_openai(search_prompt)
    return _parse_json_reply(tree_search_result)

# Ask the LLM which nodes of the tree are relevant to the question below.
tree_search_result = tree_search(query="Shareholder return policy", tree=tree)

print("Tree Search Result:")
tree_search_result


Tree Search Result:


{'thinking': "The user is asking for nodes related to 'Shareholder return policy'. I will look for nodes that explicitly mention shareholder/stockholder returns, dividends, share buybacks, or profit allocation policies. I will prioritize nodes that describe the *policy* or *strategy* rather than just reporting financial figures or general performance that might enable returns.",
 'node_list': ['0008', '0028', '0029', '0031', '0093', '0125']}

In [30]:
import pageindex.utils as utils

# Resolve the node ids selected by the LLM to their node entries.
node_map = utils.create_node_mapping(tree)
node_list = tree_search_result["node_list"]

# The LLM can return ids that are not in the tree; report and skip them
# instead of failing with KeyError.
# NOTE(review): assumes node_map supports `in` like a dict — confirm.
missing_ids = [node_id for node_id in node_list if node_id not in node_map]
if missing_ids:
    print(f"Skipping unknown node ids: {missing_ids}")

# Concatenate the text of every resolved node, in the order the LLM listed them.
relevant_content = "\n\n".join(
    node_map[node_id]["text"] for node_id in node_list if node_id in node_map
)

print('Retrieved Context:\n')
utils.print_wrapped(relevant_content)

Retrieved Context:

## In closing

As a company that provides safety and security to society, we are expected to realize fair and
appropriate management with the values and ethics required of being a member of society. While
performance figures are, of course, important, what is truly important for a company is its
employees' sense of accomplishment and satisfaction with the quality of their work and their
constant desire for these things. This healthy desire is the source of the Kajima Group's strength,
supporting the Company from its foundations and preventing quality defects

and compliance issues. My goal is to build a corporate group with this same high level of
engagement.

To achieve sustainable growth, our policy is to consider all stakeholders and provide appropriate
returns. In addition to rewarding our employees with wages, we are steadily advancing human capital
investment in areas such as improving the work environment, maintaining dormitories and company
housing, and prov