In [1]:
from pageindex import PageIndexClient
import pageindex.utils as utils
import os
from dotenv import load_dotenv
load_dotenv()

# Get your PageIndex API key from https://dash.pageindex.ai/api-keys

pi_client = PageIndexClient(api_key=os.getenv("PAGEINDEX_API_KEY"))

In [None]:
from ollama import chat
from google import genai
import os

def call_llm(prompt, model="llama3.1:8b", temperature=0):
    response = chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": temperature}
    )
    return response.message.content.strip()

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

def call_llm_gemini(
    prompt,
    model="gemini-3-flash-preview",
):
    response = client.models.generate_content(
        model=model,
        contents=prompt,
    )
    return response.text.strip()


In [3]:
import os, requests

# You can also use our GitHub repo to generate PageIndex tree
# https://github.com/VectifyAI/PageIndex

pdf_path = "data/123.pdf"

doc_id = pi_client.submit_document(pdf_path)["doc_id"]
print('Document Submitted:', doc_id)

Document Submitted: pi-cmkusk8ud00cu08pg5rblgz0j


In [46]:
if pi_client.is_retrieval_ready(doc_id):
    tree = pi_client.get_tree(doc_id, node_summary=True)['result']
    print('Simplified Tree Structure of the Document:')
    utils.print_tree(tree)
else:
    print("Processing document, please try again later...")

Processing document, please try again later...


In [38]:
tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])
print(tree_without_text)

[{'title': '2025', 'node_id': '0000', 'page_index': 1, 'summary': "This document, the KAJIMA Integrated Report 2025, outlines Kajima's corporate philosophy, emphasizing creative progress, humanitarian outlook, and societal contribution through its construction business since 1840. It details the Kajima Group Vision, which centers on embracing challenges with ideas and technology, and highlights core values of Openness, Diversity, and Initiative, alongside a strong Corporate Culture. The report also presents key financial and operational data at a glance, including consolidated revenues, operating income, net income, owners' equity, ROE, dividends, group company structure, employee numbers, and main business areas, all indicating growth and development."}, {'title': '23 countries and regions', 'node_id': '0001', 'page_index': 5, 'prefix_summary': "The text outlines Kajima's global presence with subsidiaries in 23 countries and regions. It includes a message from President Hiromasa Amano

In [39]:
import json

query = "Shareholder Return Policy"

tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])

search_prompt = f"""
You are given a question and a tree structure of a document.
Each node contains a node id, node title, and a corresponding summary.
Your task is to find all nodes that are likely to contain the answer to the question.
Not use markdown

Question: {query}

Document tree structure:
{json.dumps(tree_without_text, indent=2)}

Please reply in the following JSON format:
{{
    "thinking": "<Your thinking process on which nodes are relevant to the question>",
    "node_list": ["node_id_1", "node_id_2", ..., "node_id_n"]
}}
Directly return the final JSON structure. Do not output anything else.
"""

tree_search_result = call_llm_gemini(search_prompt)
print(tree_search_result)

{
    "thinking": "The question asks for the Shareholder Return Policy (referred to as 'Stockholder Return' in the document). Several nodes discuss this policy at different levels of detail. Node 0008 explicitly mentions detailing the stockholder return policy in its summary. Node 0028, 0029, and 0031 discuss the progress, strategy, and specific updates to this policy (including the 40% payout ratio and share buybacks) as part of the Medium-Term Business Plan and financial strategy. Node 0091 is the most direct section, titled 'Basic Profit Allocation Policy and Payment of Dividends', which outlines the dividend policy targets and allocation strategy. Node 0123 details subsequent events involving dividends and share buybacks aimed at improving shareholder returns.",
    "node_list": ["0008", "0028", "0029", "0031", "0091", "0123"]
}


In [40]:
node_map = utils.create_node_mapping(tree)
tree_search_result_json = json.loads(tree_search_result)

print('Reasoning Process:')
utils.print_wrapped(tree_search_result_json['thinking'])

print('\nRetrieved Nodes:')
for node_id in tree_search_result_json["node_list"]:
    node = node_map[node_id]
    print(f"Node ID: {node['node_id']}\t Page: {node['page_index']}\t Title: {node['title']}")

Reasoning Process:
The question asks for the Shareholder Return Policy (referred to as 'Stockholder Return' in the
document). Several nodes discuss this policy at different levels of detail. Node 0008 explicitly
mentions detailing the stockholder return policy in its summary. Node 0028, 0029, and 0031 discuss
the progress, strategy, and specific updates to this policy (including the 40% payout ratio and
share buybacks) as part of the Medium-Term Business Plan and financial strategy. Node 0091 is the
most direct section, titled 'Basic Profit Allocation Policy and Payment of Dividends', which
outlines the dividend policy targets and allocation strategy. Node 0123 details subsequent events
involving dividends and share buybacks aimed at improving shareholder returns.

Retrieved Nodes:
Node ID: 0008	 Page: 11	 Title: In closing
Node ID: 0028	 Page: 36	 Title: Progress on the Medium-Term Business Plan
Node ID: 0029	 Page: 38	 Title: Message from the General Manager of the Treasury Division


In [41]:
node_list = json.loads(tree_search_result)["node_list"]
relevant_content = "\n\n".join(node_map[node_id]["text"] for node_id in node_list)

print('Retrieved Context:\n')
utils.print_wrapped(relevant_content)

Retrieved Context:

## In closing

As a company that provides safety and security to society, we are expected to realize fair and
appropriate management with the values and ethics required of being a member of society. While
performance figures are, of course, important, what is truly important for a company is its
employees' sense of accomplishment and satisfaction with the quality of their work and their
constant desire for these things. This healthy desire is the source of the Kajima Group's strength,
supporting the Company from its foundations and preventing quality defects

and compliance issues. My goal is to build a corporate group with this same high level of
engagement.

To achieve sustainable growth, our policy is to consider all stakeholders and provide appropriate
returns. In addition to rewarding our employees with wages, we are steadily advancing human capital
investment in areas such as improving the work environment, maintaining dormitories and company
housing, and prov

In [None]:
# pi_client.delete_document(doc_id)
# print('Document deleted.')

Document deleted.
