In [1]:
from ollama import chat
import json
import re
import fitz  
from tqdm import tqdm
import os
from dotenv import load_dotenv
load_dotenv()

class PDFProcessor:
    """Summarise every page of one PDF into a short list of topics.

    Page text is extracted with ``fitz`` and sent to an Ollama chat model,
    which is asked (see ``SYSTEM_PROMPT``) to answer with a JSON array of
    topic strings.
    """

    def __init__(self, path_pdf, model_name: str = "qwen2.5:1.5b"):
        """Store the PDF path and the Ollama model name.

        Args:
            path_pdf: Path of the PDF to process.
            model_name: Model name passed to ``ollama.chat``.
        """
        self.path_pdf = path_pdf
        self.model_name = model_name
        self.SYSTEM_PROMPT  = '''
        You read ONE page of a financial report.

        Task:
        List the main topics discussed on this page.

        Rules:
        - Topics must be short noun phrases.
        - Only include topics clearly mentioned.
        - No explanations.

        Output:
        Return ONLY a JSON array of strings.
        '''

    def extract_features_clean(self, text: str) -> list[str]:
        """Turn the raw model answer into a de-duplicated list of topics.

        The answer is parsed as a JSON array of strings when possible, so
        escaped quotes and unicode escapes are decoded correctly. Anything
        else falls back to collecting every double-quoted fragment.

        Args:
            text: Raw text returned by the model.

        Returns:
            Stripped, non-empty topics in first-seen order, without duplicates.
        """
        topics = None
        start, end = text.find("["), text.rfind("]")
        if 0 <= start < end:
            try:
                parsed = json.loads(text[start:end + 1])
            except ValueError:
                parsed = None
            if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
                topics = parsed
        if topics is None:
            # Best-effort fallback for answers that are not a valid JSON array.
            topics = re.findall(r'"([^"]+)"', text)

        stripped = (topic.strip() for topic in topics)
        # dict.fromkeys keeps the first occurrence of each topic, in order;
        # whitespace-only entries are dropped instead of becoming "" topics.
        return list(dict.fromkeys(topic for topic in stripped if topic))
    
    def extract_page_topics(
        self,
        page_content: str,
        ) -> str:
        """Ask the Ollama model for the topics of one page.

        Args:
            page_content: Plain text of a single page.

        Returns:
            The model's raw (stripped) answer. A blank page returns ``"[]"``
            without calling the model, so no topics are invented for it.
        """
        if not page_content.strip():
            return "[]"

        response = chat(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"""
                    Page content:
                    \"\"\"
                    {page_content}
                    \"\"\"
                    """
                },
            ],
            options={
                "temperature": 0
            }
        )
        return response.message.content.strip()

    
    def read_pdf_by_page(self):
        """Extract the topics of every page of ``self.path_pdf``.

        Returns:
            A ``(results_list, result_text)`` tuple. ``results_list`` holds one
            ``{"page": <1-based index>, "text": [<topics>]}`` dict per page;
            ``result_text`` is the ``str()`` of those dicts, each preceded by a
            single space.
        """
        results_list = []

        # The context manager closes the document even if a model call fails.
        with fitz.open(self.path_pdf) as doc:
            for page_index, page in enumerate(
                tqdm(doc, desc="Reading PDF pages", unit="page"),
                start=1
            ):
                text = page.get_text().strip()
                results_list.append({
                    "page": page_index,
                    "text": self.extract_features_clean(
                        self.extract_page_topics(text)
                    )
                })

        result_text = "".join(" " + str(ingredient) for ingredient in results_list)
        return results_list, result_text

In [None]:
import google.generativeai as genai

class RetrievalWithPDFPage:
    """Ask a Gemini model which pages of a document are relevant to each indicator."""

    def __init__(self, model_name):
        """Store the Gemini model name used by ``run_gemini``."""
        self.model_name = model_name
        
    def run_gemini(
        self,
        user_prompt: str,
        ) -> str:
        """
        Send text to Gemini and return the raw response text.

        The request asks for the ``application/json`` MIME type, but the text
        is NOT parsed here: callers are expected to run ``json.loads`` on the
        returned string themselves.

        Args:
            user_prompt: Full prompt sent to the model.

        Returns:
            The stripped response text.
        """
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json"
            }
        )
        return response.text.strip()

    def retrieve_pages(self, list_indicators, context, descriptions) -> str:
        """Build the page-retrieval prompt and send it to Gemini.

        Args:
            list_indicators: Indicators to locate; interpolated into the prompt as-is.
            context: Document structure (topics per page); interpolated as-is.
            descriptions: Optional indicator descriptions; interpolated as-is.

        Returns:
            The raw string returned by ``run_gemini``. The prompt requests one
            JSON object mapping each indicator to ``thinking`` and ``page_list``.
        """
        user_prompt = f"""
        You are given a list of indicators and a document structure.
        Each page id describes the main information covered on that page.

        Your task is to identify, for EACH indicator, the pages that are most likely to contain information relevant to that indicator.

        It is normal and expected that the same page may be relevant to multiple indicators.
        If an indicator includes a description or explanation, you MUST use that description as the primary semantic reference when identifying relevant pages, and prioritize pages that explicitly match the described concept over pages that only loosely relate by title.

        Indicators:
        {list_indicators}

        Document structure:
        {context}

        Description:
        {descriptions}

        Output format (STRICT):
        Return ONLY a single JSON object in the following structure:

        {{
        "<indicator_1>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["x", "y"]
        }},
        "<indicator_2>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["a"]
        }}
        }}

        Do NOT output anything outside the JSON.
        """
        return self.run_gemini(user_prompt=user_prompt)


In [12]:
# PDF analysed in the single-file walkthrough below (relative path).
path_pdf = "data/kajima.co.jp/ir_e_all_2.pdf"

In [13]:
# Ollama model used to turn each page into a list of topics.
model_ollama = "qwen2.5:1.5b"
kajima_process = PDFProcessor(path_pdf, model_name=model_ollama)
# read_pdf_by_page returns a (results_list, result_text) tuple; `data` keeps
# both, and later cells read data[1] (the flattened text).
data = kajima_process.read_pdf_by_page()

Reading PDF pages: 100%|██████████| 168/168 [04:14<00:00,  1.52s/page]


In [28]:
# Ask Gemini which pages are relevant to each indicator.
retrieve = RetrievalWithPDFPage(model_name="gemini-2.5-flash")
list_indicator = ["Shareholder Return Policy", "IR Event Frequency", "Owner-managed Company"]
# data[1] is the result_text half of read_pdf_by_page(): the per-page topic
# dicts rendered as one string.
context = data[1]
# Optional definitions, keyed by indicator name; the whole dict is inserted
# into the prompt.
descriptions = {"Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"}

result = retrieve.retrieve_pages(list_indicator, context, descriptions)
# retrieve_pages returns the raw response text, so it is parsed here.
isolation_json = json.loads(result)
print(type(isolation_json))
print(isolation_json)

<class 'dict'>
{'Shareholder Return Policy': {'thinking': "This indicator refers to the company's approach to returning value to its shareholders. Relevant pages would discuss dividends, stock buybacks, equity, financial strategy, and overall stockholder returns. Pages 5, 36, 40, 41, 45, 91, 100, 101, 109, 115, and 137 directly mention these concepts, including 'Dividend', 'Stockholder Returns', 'Dividend Policy', 'Acquisition of Own Shares', 'ROE', 'financial_strategy', 'Stock Remuneration System', 'Performance-Linked Remuneration', 'Stockholder Dialogue', 'Basic Profit Allocation Policy', and 'Treasury Stock'.", 'page_list': ['5', '36', '40', '41', '45', '91', '100', '101', '109', '115', '137']}, 'IR Event Frequency': {'thinking': "This indicator relates to how often a company engages with investors through Investor Relations (IR) events. Pages that mention 'IR Activities', 'Financial Results Briefings', or 'Dialogue with Institutional Investors and Securities Analysts' are highly re

# Generate answers from the retrieved pages

In [29]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# Docling PDF pipeline options: the OCR flag is off and the table-structure
# flag is on.
pipeline_options = PdfPipelineOptions(
    do_ocr=False,      
    do_table_structure=True,    
)
# Module-level converter used by GenAnswerByPageContext.docling_parse_text;
# PDF input is configured with the pypdfium2 backend and the options above.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend
        )
    }
)

In [34]:
class GenAnswerByPageContext:
    """Answer each indicator from the Markdown text of its retrieved pages (one PDF)."""

    def __init__(self, model_name:str, path_pdf:str):
        """Store the Gemini model name and the PDF the pages are read from."""
        self.model_name = model_name
        self.path_pdf = path_pdf
        
    def run_gemini(self, user_prompt):
        """Send ``user_prompt`` to Gemini (temperature 0) and return the stripped response text."""
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
            }
        )
        return response.text.strip()

    def docling_parse_text(self, num_page):
        """Convert one page of ``self.path_pdf`` to Markdown.

        Uses the module-level ``doc_converter`` defined in an earlier cell.

        Args:
            num_page: Page number, passed as both ends of ``page_range``.
        """
        result = doc_converter.convert(
            source=self.path_pdf,
            page_range=(num_page, num_page)
        )
        return result.document.export_to_markdown()

    @staticmethod
    def _valid_page_numbers(page_list):
        """Return the unique positive integer page numbers of ``page_list``, in first-seen order."""
        numbers = []
        for raw_page in page_list:
            try:
                number = int(raw_page)
            except (TypeError, ValueError):
                print(f"Skipping invalid page id: {raw_page!r}")
                continue
            if number < 1:
                print(f"Skipping invalid page id: {raw_page!r}")
                continue
            if number not in numbers:
                numbers.append(number)
        return numbers

    def gen_answers(self, isolation_json, descriptions):
        """Generate one answer per indicator from its retrieved pages.

        Args:
            isolation_json: Mapping ``indicator -> {"page_list": [...], ...}``.
            descriptions: Indicator descriptions; interpolated into the prompt as-is.

        Returns:
            Mapping ``indicator -> {"answer", "page_list", "source"}``, where
            ``page_list`` is the list exactly as received.
        """
        answers = {}
        for indicator, value in isolation_json.items():
            print("Processing indicator:", indicator)

            # Page ids come from a model: one malformed id must not abort the
            # whole run, and a repeated id must not be parsed twice.
            page_numbers = self._valid_page_numbers(value['page_list'])
            context = "".join(
                self.docling_parse_text(number) + "\n" for number in page_numbers
            )

            # NOTE(review): "producted" and "NOT: AUTO GENERATE" look like typos
            # in the prompt; they are left unchanged so model input stays the same.
            user_prompt = f"""
            You are an expert financial analyst. 
            Based on the context provided, please extract information related to producted indicator.
            If the indicator includes a description or explanation, you MUST treat it as a strict definition and use it as the primary semantic reference when extracting information from the context.

            NOT: AUTO GENERATE ANYTHING THAT IS NOT IN THE CONTEXT.
            
            Indicator: {indicator}

            Descriptions:
            {descriptions}

            Context:
            {context}
            """
            answers[indicator] = {
                "answer": self.run_gemini(user_prompt),
                "page_list": value['page_list'],
                "source": self.path_pdf,
            }

        return answers

In [35]:
# Answer every indicator in isolation_json from its retrieved pages.
gen_answer_by_page_context = GenAnswerByPageContext(model_name="gemini-2.5-flash", path_pdf=path_pdf)
# NOTE(review): "finall_answers" looks like a misspelling of "final_answers";
# left unchanged here.
finall_answers = gen_answer_by_page_context.gen_answers(isolation_json, descriptions)

Processing indicator: Shareholder Return Policy
Processing indicator: IR Event Frequency
Processing indicator: Owner-managed Company


# Multi-file pipeline

In [8]:
from ollama import chat
import json
import re
import fitz  
from tqdm import tqdm
import os
import google.generativeai as genai
import time
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
class PDFProcessor:
    """Summarise every page of a PDF into a short list of topics.

    Page text is extracted with ``fitz`` and sent to an Ollama chat model,
    which is asked (see ``SYSTEM_PROMPT``) to answer with a JSON array of
    topic strings. The PDF path is supplied per call, so one instance can
    process several files.
    """

    def __init__(self, model_name: str = "qwen2.5:1.5b"):
        """Store the Ollama model name passed to ``ollama.chat``."""
        self.model_name = model_name
        self.SYSTEM_PROMPT  = '''
        You read ONE page of a financial report.

        Task:
        List the main topics discussed on this page.

        Rules:
        - Topics must be short noun phrases.
        - Only include topics clearly mentioned.
        - No explanations.

        Output:
        Return ONLY a JSON array of strings.
        '''

    def extract_features_clean(self, text: str) -> list[str]:
        """Turn the raw model answer into a de-duplicated list of topics.

        The answer is parsed as a JSON array of strings when possible, so
        escaped quotes and unicode escapes are decoded correctly. Anything
        else falls back to collecting every double-quoted fragment.

        Args:
            text: Raw text returned by the model.

        Returns:
            Stripped, non-empty topics in first-seen order, without duplicates.
        """
        topics = None
        start, end = text.find("["), text.rfind("]")
        if 0 <= start < end:
            try:
                parsed = json.loads(text[start:end + 1])
            except ValueError:
                parsed = None
            if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
                topics = parsed
        if topics is None:
            # Best-effort fallback for answers that are not a valid JSON array.
            topics = re.findall(r'"([^"]+)"', text)

        stripped = (topic.strip() for topic in topics)
        # dict.fromkeys keeps the first occurrence of each topic, in order;
        # whitespace-only entries are dropped instead of becoming "" topics.
        return list(dict.fromkeys(topic for topic in stripped if topic))
    
    def extract_page_topics(
        self,
        page_content: str,
        ) -> str:
        """Ask the Ollama model for the topics of one page.

        Args:
            page_content: Plain text of a single page.

        Returns:
            The model's raw (stripped) answer. A blank page returns ``"[]"``
            without calling the model, so no topics are invented for it.
        """
        if not page_content.strip():
            return "[]"

        response = chat(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"""
                    Page content:
                    \"\"\"
                    {page_content}
                    \"\"\"
                    """
                },
            ],
            options={
                "temperature": 0
            }
        )
        return response.message.content.strip()

    
    def read_pdf_by_page(self, path_pdf: str):
        """Extract the topics of every page of ``path_pdf``.

        Args:
            path_pdf: Path of the PDF to read.

        Returns:
            A ``(results_list, result_text)`` tuple. ``results_list`` holds one
            ``{"page": <1-based index>, "text": [<topics>]}`` dict per page;
            ``result_text`` is the ``str()`` of those dicts, each preceded by a
            single space.
        """
        results_list = []

        # The context manager closes the document even if a model call fails.
        with fitz.open(path_pdf) as doc:
            for page_index, page in enumerate(
                tqdm(doc, desc="Reading PDF pages", unit="page"),
                start=1
            ):
                text = page.get_text().strip()
                results_list.append({
                    "page": page_index,
                    "text": self.extract_features_clean(
                        self.extract_page_topics(text)
                    )
                })

        result_text = "".join(" " + str(ingredient) for ingredient in results_list)
        return results_list, result_text

In [10]:
class PDFPageStructure:
    """Ask a Gemini model which pages of a document are relevant to each indicator."""

    def __init__(self, model_name):
        """Store the Gemini model name used by ``run_gemini``."""
        self.model_name = model_name
        
    def run_gemini(
        self,
        user_prompt: str,
        ) -> str:
        """
        Send text to Gemini and return the raw response text.

        The request asks for the ``application/json`` MIME type, but the text
        is NOT parsed here: callers are expected to run ``json.loads`` on the
        returned string themselves.

        Args:
            user_prompt: Full prompt sent to the model.

        Returns:
            The stripped response text.
        """
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json"
            }
        )
        return response.text.strip()

    def isolated_pages(self, list_indicators, context, descriptions) -> str:
        """Build the page-retrieval prompt and send it to Gemini.

        Args:
            list_indicators: Indicators to locate; interpolated into the prompt as-is.
            context: Document structure (topics per page); interpolated as-is.
            descriptions: Optional indicator descriptions; interpolated as-is.

        Returns:
            The raw string returned by ``run_gemini``. The prompt requests one
            JSON object mapping each indicator to ``thinking`` and ``page_list``.
        """
        user_prompt = f"""
        You are given a list of indicators and a document structure.
        Each page id describes the main information covered on that page.

        Your task is to identify, for EACH indicator, the pages that are most likely to contain information relevant to that indicator.

        It is normal and expected that the same page may be relevant to multiple indicators.
        If an indicator includes a description or explanation, you MUST use that description as the primary semantic reference when identifying relevant pages, and prioritize pages that explicitly match the described concept over pages that only loosely relate by title.

        Indicators:
        {list_indicators}

        Document structure:
        {context}

        Description:
        {descriptions}

        Output format (STRICT):
        Return ONLY a single JSON object in the following structure:

        {{
        "<indicator_1>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["x", "y"]
        }},
        "<indicator_2>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["a"]
        }}
        }}

        Do NOT output anything outside the JSON.
        """
        return self.run_gemini(user_prompt=user_prompt)


In [11]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# Docling PDF pipeline options: the OCR flag is off and the table-structure
# flag is on.
# NOTE(review): this repeats the converter set-up from the single-file section.
pipeline_options = PdfPipelineOptions(
    do_ocr=False,      
    do_table_structure=True,    
)
# Module-level converter used by GenAnswerByPageContext.docling_parse_text;
# PDF input is configured with the pypdfium2 backend and the options above.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend
        )
    }
)

In [12]:
class GenAnswerByPageContext:
    """Answer each indicator from the Markdown text of its retrieved pages (several PDFs)."""

    def __init__(self, model_name:str):
        """Store the Gemini model name used by ``run_gemini``."""
        self.model_name = model_name
        
    def run_gemini(self, user_prompt):
        """Send ``user_prompt`` to Gemini (temperature 0) and return the stripped response text."""
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
            }
        )
        return response.text.strip()

    def docling_parse_text(self, num_page, path_pdf):
        """Convert one page of ``path_pdf`` to Markdown.

        Uses the module-level ``doc_converter`` defined in an earlier cell.

        Args:
            num_page: Page number, passed as both ends of ``page_range``.
            path_pdf: Path of the PDF to convert.
        """
        result = doc_converter.convert(
            source=path_pdf,
            page_range=(num_page, num_page)
        )
        return result.document.export_to_markdown()

    @staticmethod
    def _valid_page_numbers(page_list):
        """Return the unique positive integer page numbers of ``page_list``, in first-seen order."""
        numbers = []
        for raw_page in page_list:
            try:
                number = int(raw_page)
            except (TypeError, ValueError):
                print(f"Skipping invalid page id: {raw_page!r}")
                continue
            if number < 1:
                print(f"Skipping invalid page id: {raw_page!r}")
                continue
            if number not in numbers:
                numbers.append(number)
        return numbers

    def gen_answers(self, isolation_dic, descriptions):
        """Generate one answer per indicator from the pages retrieved in every file.

        Args:
            isolation_dic: Mapping ``indicator -> {pdf_path: [page ids]}``.
            descriptions: Indicator descriptions; interpolated into the prompt as-is.

        Returns:
            Mapping ``indicator -> {"information": {pdf_path: {"page_list",
            "context"}}, "answer": str}``. Each ``context`` holds only the text
            of that file's pages; the prompt receives all files' text
            concatenated in iteration order.
        """
        outputs = {}

        for indicator, meta in isolation_dic.items():
            information = {}
            path_contexts = []

            for path, page_list in meta.items():
                # Page ids come from a model: skip malformed or repeated ids
                # instead of aborting the run or parsing a page twice.
                page_numbers = self._valid_page_numbers(page_list)
                path_context = "".join(
                    self.docling_parse_text(number, path) + "\n"
                    for number in page_numbers
                )
                # Store this file's own text only, not the text accumulated
                # from the files processed before it.
                information[path] = {
                    "page_list": page_list,
                    "context": path_context,
                }
                path_contexts.append(path_context)

            context = "".join(path_contexts)

            # NOTE(review): "producted" and "NOT: AUTO GENERATE" look like typos
            # in the prompt; they are left unchanged so model input stays the same.
            user_prompt = f"""
            You are an expert financial analyst. 
            Based on the context provided, please extract information related to producted indicator.
            If the indicator includes a description or explanation, you MUST treat it as a strict definition and use it as the primary semantic reference when extracting information from the context.
            The context may not contain information relevant to the indicator. If the required information is NOT explicitly stated in the context, you MUST respond exactly with: "HAVE NOT INFORMATION".

            NOT: AUTO GENERATE ANYTHING THAT IS NOT IN THE CONTEXT.
            
            Indicator: {indicator}

            Descriptions:
            {descriptions}

            Context:
            {context}
            """
            outputs[indicator] = {
                "information": information,
                "answer": self.run_gemini(user_prompt),
            }

        return outputs 

In [13]:
class PDFRretrieval:
    """Run the full pipeline: page topics -> relevant pages per indicator -> answers."""

    def __init__(self, mini_model, big_model):
        """Create the three pipeline stages.

        Args:
            mini_model: Model name given to ``PDFProcessor`` (per-page topics).
            big_model: Model name given to ``PDFPageStructure`` and
                ``GenAnswerByPageContext``.
        """
        self.mini_model = mini_model
        self.big_model = big_model
        self.pdf_processor = PDFProcessor(model_name = mini_model)
        self.pdf_page_structure = PDFPageStructure(model_name = big_model)
        self.gen_answer_by_page_context = GenAnswerByPageContext(model_name=big_model)

    def run(self, file_paths, list_indicators, descriptions, type_report, target_site):
        """Process every PDF of one site/report type and answer each indicator.

        Args:
            file_paths: Nested mapping ``site -> report type -> [pdf paths]``.
            list_indicators: Indicators to answer.
            descriptions: Indicator descriptions forwarded to both prompts.
            type_report: Report-type key looked up in ``file_paths[target_site]``.
            target_site: Site key looked up in ``file_paths``.

        Returns:
            ``(RESULTS, TREES, ISOLATED_DIC)``: the generated answers, the
            topic text per PDF path, and ``indicator -> {path: [page ids]}``.

        Raises:
            KeyError: If ``target_site`` or ``type_report`` is not in ``file_paths``.
        """
        use_paths = file_paths[target_site][type_report]
        TREES = {}
        ISOLATED_DIC = {indicator: {} for indicator in list_indicators}

        print("Start processing PDF pages - Gentree")
        time_gentree = 0

        for path in use_paths:
            start = time.perf_counter()
            # Index 1 is the flattened topic text returned by read_pdf_by_page.
            TREES[path] = self.pdf_processor.read_pdf_by_page(path)[1]
            time_value = time.perf_counter() - start
            time_gentree += time_value

            print(f"Gentree: {path}", round(time_value, 2), "seconds")
        print("--"*20)
        print("Total Gentree time:", round(time_gentree, 2), "seconds")
        print()

        print("Start isolated")
        start = time.perf_counter()

        for path, context in TREES.items():
            result = self.pdf_page_structure.isolated_pages(list_indicators, context, descriptions)
            isolation_json = json.loads(result)
            for key_indicator, value in isolation_json.items():
                # The model may return a key that was never requested; skip it
                # instead of raising KeyError and losing the whole run.
                if key_indicator not in ISOLATED_DIC:
                    print(f"Skipping unrequested indicator returned by the model: {key_indicator!r}")
                    continue
                pages = value.get('page_list', []) if isinstance(value, dict) else []
                ISOLATED_DIC[key_indicator].setdefault(path, []).extend(pages)

        end = time.perf_counter()
        print("Total isolated pages: ", round(end - start, 2), "seconds")
        print()

        print("Generate answer from context")
        start = time.perf_counter()
        RESULTS = self.gen_answer_by_page_context.gen_answers(ISOLATED_DIC, descriptions)
        end = time.perf_counter()
        print("Total generate answer from context: ", round(end - start, 2), "seconds")
        print()

        print("------Finished processing------")

        return RESULTS, TREES, ISOLATED_DIC

In [None]:
# Keys used to select the list of PDFs inside `file_paths` below.
target_site = "toyota"
type_report = "ir_report"

# Nested mapping: site -> report type -> list of PDF paths to process.
# The commented-out entries are kajima.co.jp files that are currently disabled.
file_paths = {
    "toyota": 
        {"ir_report":
            [
                # "data/kajima.co.jp/20250514-fs.pdf",
                # "data/kajima.co.jp/ir_e_all_2.pdf",
                # "data/kajima.co.jp/ir_e_p03-04.pdf",
                # "data/kajima.co.jp/ir_e_p05-10.pdf",
                # "data/kajima.co.jp/ir_e_p13-22.pdf",
                # "data/kajima.co.jp/ir_e_p23-32.pdf",
                # "data/kajima.co.jp/ir_e_p63-104.pdf",
                # "data/kajima.co.jp/ir_e_p105-106.pdf",
                # "data/kajima.co.jp/ir_e_p107-108.pdf"
                # "data/kajima.co.jp/20251111-pm.pdf",
                # "data/kajima.co.jp/ir_e_all_2.pdf",
                # "data/kajima.co.jp/ir_e_p128.pdf"
                # NOTE(review): "toyata" may be a misspelling of "toyota" —
                # confirm against the file name on disk.
                "data/ir_toyata.pdf"
            ]
        }
}

# Indicators to answer; the whole list is sent to the page-retrieval prompt.
list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
            "Major Shareholder Structure",
            "Controlling Shareholder"
        ]

# Optional indicator definitions (currently empty); the dict is inserted into
# both prompts.
descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

# mini_model summarises pages; big_model selects pages and writes the answers.
main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
# run() returns (answers, topic text per file, pages per indicator per file).
RESULTS, TREES, ISOLATED_DIC = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

In [29]:
# Show the pages selected per indicator and per file.
# NOTE(review): the saved output lists kajima.co.jp files and an
# "Owner-managed Company" indicator, which do not match the configuration cell
# above — it appears to come from an earlier run; re-run to refresh.
ISOLATED_DIC

{'Shareholder Return Policy': {'data/kajima.co.jp/20251111-pm.pdf': ['2',
   '14'],
  'data/kajima.co.jp/ir_e_all_2.pdf': ['5',
   '36',
   '40',
   '41',
   '45',
   '91',
   '109',
   '115',
   '121',
   '122',
   '137',
   '156'],
  'data/kajima.co.jp/ir_e_p128.pdf': []},
 'IR Event Frequency': {'data/kajima.co.jp/20251111-pm.pdf': ['1',
   '2',
   '3',
   '13',
   '14',
   '16',
   '27',
   '30'],
  'data/kajima.co.jp/ir_e_all_2.pdf': ['13', '109'],
  'data/kajima.co.jp/ir_e_p128.pdf': ['1']},
 'Owner-managed Company': {'data/kajima.co.jp/20251111-pm.pdf': [],
  'data/kajima.co.jp/ir_e_all_2.pdf': ['94', '110'],
  'data/kajima.co.jp/ir_e_p128.pdf': []}}

In [28]:
# Print each indicator with its generated answer, then a 100-dash ruler
# surrounded by blank lines.
for indicator, meta in RESULTS.items():
    print(
        f"Indicator: {indicator}",
        f"Answer: {meta['answer']}",
        "",
        "-" * 100,
        "",
        sep="\n",
    )

Indicator: Shareholder Return Policy
Answer: The Shareholder Return Policy is as follows:

*   **Basic Policy:** The company's basic policy is to allocate profits taking into account a balance between growth investment and stockholder returns, with the aim of achieving sustainable growth and increasing corporate value, while maintaining financial soundness.
*   **Dividends:**
    *   The company plans to pay dividends with a target payout ratio of 40%.
    *   The target dividend payout ratio was raised from 30% to 40% in the current Medium-Term Business Plan.
    *   Dividends have been increased for five consecutive fiscal years, with the dividend per share gradually increasing from ¥50 in FY2019 to ¥104.
    *   Following the upward revision of financial results forecast, the full-year dividend per share has been revised upward by 20 yen, from 112 yen to 132 yen.
    *   For FY2026, the full-year dividend will be maintained at a minimum of ¥132 per share.
    *   The company aims to