## Base

In [12]:
from ollama import chat
import json
import re
import fitz  
from tqdm import tqdm
import os
from dotenv import load_dotenv
load_dotenv()

class PDFProcessor:
    def __init__(self, path_pdf, model_name: str = "qwen2.5:1.5b"):
        self.path_pdf = path_pdf
        self.model_name = model_name
        self.SYSTEM_PROMPT  = '''
        You read ONE page of a financial report.

        Task:
        List the main topics discussed on this page.

        Rules:
        - Topics must be short noun phrases.
        - Only include topics clearly mentioned.
        - No explanations.

        Output:
        Return ONLY a JSON array of strings.
        '''

    def extract_features_clean(self, text: str) -> list[str]:
        return list(dict.fromkeys(
            s.strip() for s in re.findall(r'"([^"]+)"', text)
        ))
    
    def extract_page_topics(
        self,
        page_content: str,
        ) -> list[str]:

        response = chat(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"""
                    Page content:
                    \"\"\"
                    {page_content}
                    \"\"\"
                    """
                },
            ],
            options={
                "temperature": 0
            }
        )
        raw = response.message.content.strip()
        return raw

    
    def read_pdf_by_page(self):
        doc = fitz.open(self.path_pdf)
        results_list = []
        result_text = ""

        for page_index, page in enumerate(
            tqdm(doc, desc="Reading PDF pages", unit="page"),
            start=1
        ):
            text = page.get_text().strip()
            # print()
            # print(text)
            # print()
            ingredient = {
                "page": page_index,
                "text": self.extract_features_clean(
                    self.extract_page_topics(text)
                )
            }
            results_list.append(ingredient)
            result_text += " " + str(ingredient)

        return results_list, result_text

In [13]:
import google.generativeai as genai

class RetrievalWithPDFPage:
    def __init__(self, model_name):
        self.model_name = model_name
        
    def run_gemini(
        self,
        user_prompt: str,
        ) -> dict:
        """
        Send text to Gemini and return parsed JSON output.
        """

        model = genai.GenerativeModel(
            model_name=self.model_name,
        )

        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json"
            }
        )

        raw = response.text.strip()
        return raw

    def retrieve_pages(self, list_indicators, context, descriptions):
        user_prompt = f"""
        You are given a list of indicators and a document structure.
        Each page id describes the main information covered on that page.

        Your task is to identify, for EACH indicator, the pages that are most likely to contain information relevant to that indicator.

        It is normal and expected that the same page may be relevant to multiple indicators.
        If an indicator includes a description or explanation, you MUST use that description as the primary semantic reference when identifying relevant pages, and prioritize pages that explicitly match the described concept over pages that only loosely relate by title.

        Indicators:
        {list_indicators}

        Document structure:
        {context}

        Description:
        {descriptions}

        Output format (STRICT):
        Return ONLY a single JSON object in the following structure:

        {{
        "<indicator_1>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["x", "y"]
        }},
        "<indicator_2>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["a"]
        }}
        }}

        Do NOT output anything outside the JSON.
        """
        return self.run_gemini(user_prompt=user_prompt)


In [15]:
path_pdf = "data/123.pdf"
model_ollama = "qwen2.5:1.5b"
kajima_process = PDFProcessor(path_pdf, model_name=model_ollama)
data = kajima_process.read_pdf_by_page()

Reading PDF pages: 100%|██████████| 168/168 [03:32<00:00,  1.27s/page]


In [20]:
retrieve = RetrievalWithPDFPage(model_name="gemini-2.5-flash")
list_indicator = ["Shareholder Return Policy", "IR Event Frequency", "Major Shareholder Structure"]
context = data[1]
descriptions = {}

result = retrieve.retrieve_pages(list_indicator, context, descriptions)
isolation_json = json.loads(result)
print(type(isolation_json))
print(isolation_json)

<class 'dict'>
{'Shareholder Return Policy': {'thinking': "This indicator refers to the company's strategies and actions to return value to its shareholders. Pages mentioning 'Dividend', 'Stockholder Returns', 'Dividend Policy', 'Acquisition of Own Shares', 'Treasury Stock', 'Profit Allocation Policy', and 'ROE' (as a key performance metric for shareholder value) are directly relevant.", 'page_list': ['5', '36', '41', '45', '91', '115', '121', '122', '137', '156', '157']}, 'IR Event Frequency': {'thinking': "This indicator refers to the company's engagement with investors through various events and communications. Pages explicitly mentioning 'IR Activities', 'Financial Results Briefings', 'Stockholder Dialogue', and 'Dialogue with Institutional Investors and Securities Analysts' are most relevant.", 'page_list': ['13', '109']}, 'Major Shareholder Structure': {'thinking': "This indicator describes the composition and distribution of the company's ownership among its major shareholders. 

In [None]:
{
    "Shareholder Return Policy": {
        "thinking": "This indicator refers to the company's strategies and actions to return value to its shareholders. Pages mentioning 'Dividend', 'Stockholder Returns', 'Dividend Policy', 'Acquisition of Own Shares', 'Treasury Stock', 'Profit Allocation Policy', and 'ROE' (as a key performance metric for shareholder value) are directly relevant.",
        "page_list": [
            "5",
            "36",
            "41",
            "45",
            "91",
            "115",
            "121",
            "122",
            "137",
            "156",
            "157",
        ],
    },
    "IR Event Frequency": {
        "thinking": "This indicator refers to the company's engagement with investors through various events and communications. Pages explicitly mentioning 'IR Activities', 'Financial Results Briefings', 'Stockholder Dialogue', and 'Dialogue with Institutional Investors and Securities Analysts' are most relevant.",
        "page_list": [
            "13", 
            "109"
        ],
    },
    "Major Shareholder Structure": {
        "thinking": "This indicator describes the composition and distribution of the company's ownership among its major shareholders. Pages that explicitly mention 'Stock Ownership Breakdown' or 'Shareholdings' are relevant.",
        "page_list": [
            "94", 
            "110"
        ],
    },
}

# Gen Answer

In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions(
    do_ocr=False,      
    do_table_structure=True,    
)
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend
        )
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
class GenAnswerByPageContext:
    def __init__(self, model_name:str, path_pdf:str):
        self.model_name = model_name
        self.path_pdf = path_pdf
        
    def run_gemini(self, user_prompt):
        
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
            }
        )
        raw = response.text.strip()
        return raw

    def docling_parse_text(self, num_page):
        result = doc_converter.convert(
            source=self.path_pdf,
            page_range=(num_page, num_page)
        )
        return result.document.export_to_markdown()

    def gen_answers(self, isolation_json, descriptions):
        answers = {}
        for indicator, value in isolation_json.items():
            
            answers[indicator] = {}

            print("Processing indicator:", indicator)
            context = ""
            for page in value['page_list']:
                context += self.docling_parse_text(int(page)) + "\n"
            print(context)
            user_prompt = f"""
            You are an expert financial analyst. 
            Based on the context provided, please extract information related to producted indicator.
            If the indicator includes a description or explanation, you MUST treat it as a strict definition and use it as the primary semantic reference when extracting information from the context.

            NOT: AUTO GENERATE ANYTHING THAT IS NOT IN THE CONTEXT.
            
            Indicator: {indicator}

            Descriptions:
            {descriptions}

            Context:
            {context}
            """
            answer = self.run_gemini(user_prompt)
            
            answers[indicator]["answer"] = answer
            answers[indicator]["page_list"] = value['page_list']
            answers[indicator]["source"] = self.path_pdf

        return answers

In [23]:
gen_answer_by_page_context = GenAnswerByPageContext(model_name="gemini-2.5-flash", path_pdf=path_pdf)
finall_answers = gen_answer_by_page_context.gen_answers(isolation_json, descriptions)

Processing indicator: Shareholder Return Policy
Established

1840

Consolidated revenues

¥

2,911.8 billion

(Up ¥

246.6 billion from FY2023)

Revenue ratio by region

Europe

6.0%

Oceania

13.0%

Asia

19.2%

Group companies

322

(as of March 31, 2025)

Consolidated subsidiaries

181

18 in Japan

Equity-method affiliates

141

90 in Japan

163 outside Japan

51 outside Japan

¥

Overseas revenues

1,116.8 billion

UP

Consolidated operating income

151.8 billion

UP

¥

(Up ¥

15.6 billion from FY2023)

Other regions

0.3%

North

America

61.5%

Overseas

¥

Net income attributable to owners of the parent

125.8 billion

(Up ¥

10.7 billion from FY2023)

Consolidated revenues

38.4%

¥

2,911.8 billion

Number of employees

Consolidated

Group companies outside Japan

6,789

Consolidated

Group companies

in Japan

7,356

UP

March 31, 2025

25,339

Japan

61.6%

Non-consolidated

11,194

¥104

UP

(¥90 in FY2023)

<!-- image -->
## Progress on the Medium-Term Business Plan

| M

In [26]:
for key, value in finall_answers.items():
    print(f"Answer: {value['answer']}")
    print("--------------------------------------------------")

Answer: **Shareholder Return Policy**

*   **Basic Policy:** The Company's basic policy is to allocate profits taking into account the balance between growth investment and stockholder returns, aiming for sustainable growth and increased corporate value while maintaining financial soundness.
*   **Dividends:**
    *   The Company plans to pay dividends with a target payout ratio of 40%.
    *   It aims to increase dividends in line with profit growth.
    *   The target dividend payout ratio was raised from 30% to 40% in the current Medium-Term Business Plan.
    *   Dividends have been increased for five consecutive fiscal years, with the dividend per share more than doubling from ¥50 in FY2019 to ¥104.
    *   A further dividend increase is planned for the current fiscal year (FY2025), in line with profit growth.
    *   For the fiscal year ended March 31, 2025, an annual dividend of ¥104 per share was paid (¥59 year-end, ¥45 interim).
    *   For the fiscal year ending March 31, 202

# Multi file

In [1]:
from ollama import chat
import json
import re
import fitz  
from tqdm import tqdm
import os
import google.generativeai as genai
import time
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


True

In [2]:
class PDFProcessor:
    def __init__(self, model_name: str = "qwen2.5:1.5b"):
        self.model_name = model_name
        self.SYSTEM_PROMPT  = '''
        You read ONE page of a financial report.

        Task:
        List the main topics discussed on this page.

        Rules:
        - Topics must be short noun phrases.
        - Only include topics clearly mentioned.
        - No explanations.

        Output:
        Return ONLY a JSON array of strings.
        '''

    def extract_features_clean(self, text: str) -> list[str]:
        return list(dict.fromkeys(
            s.strip() for s in re.findall(r'"([^"]+)"', text)
        ))
    
    def extract_page_topics(
        self,
        page_content: str,
        ) -> list[str]:

        response = chat(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"""
                    Page content:
                    \"\"\"
                    {page_content}
                    \"\"\"
                    """
                },
            ],
            options = {
                "temperature": 0.0,

                "num_ctx": 8192,        
                "num_predict": 256,   

                "top_p": 0.9,
                "repeat_penalty": 1.1,
            }
        )
        raw = response.message.content.strip()
        return raw

    
    def read_pdf_by_page(self, path_pdf: str):
        doc = fitz.open(path_pdf)
        results_list = []
        result_text = ""

        for page_index, page in enumerate(
            tqdm(doc, desc="Reading PDF pages", unit="page"),
            start=1
        ):
            text = page.get_text().strip()
            ingredient = {
                "page": page_index,
                "text": self.extract_features_clean(
                    self.extract_page_topics(text)
                )
            }
            results_list.append(ingredient)
            result_text += " " + str(ingredient)

        return results_list, result_text

In [3]:
class PDFPageStructure:
    def __init__(self, model_name):
        self.model_name = model_name
        
    def run_gemini(
        self,
        user_prompt: str,
        ) -> dict:
        """
        Send text to Gemini and return parsed JSON output.
        """

        model = genai.GenerativeModel(
            model_name=self.model_name,
        )

        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
                "response_mime_type": "application/json"
            }
        )

        raw = response.text.strip()
        return raw

    def isolated_pages(self, list_indicators, context, descriptions):
        user_prompt = f"""
        You are given a list of indicators and a document structure.
        Each page id describes the main information covered on that page.

        Your task is to identify, for EACH indicator, the pages that are most likely to contain information relevant to that indicator.

        It is normal and expected that the same page may be relevant to multiple indicators.
        If an indicator includes a description or explanation, you MUST use that description as the primary semantic reference when identifying relevant pages, and prioritize pages that explicitly match the described concept over pages that only loosely relate by title.

        Indicators:
        {list_indicators}

        Document structure:
        {context}

        Description:
        {descriptions}

        Output format (STRICT):
        Return ONLY a single JSON object in the following structure:

        {{
        "<indicator_1>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["x", "y"]
        }},
        "<indicator_2>": {{
            "thinking": "<Why these pages are relevant to this indicator>",
            "page_list": ["a"]
        }}
        }}

        Do NOT output anything outside the JSON.
        """
        return self.run_gemini(user_prompt=user_prompt)


In [4]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions(
    do_ocr=False,      
    do_table_structure=True,    
)
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend
        )
    }
)

In [5]:
class GenAnswerByPageContext:
    def __init__(self, model_name:str):
        self.model_name = model_name
        
    def run_gemini(self, user_prompt):
        
        model = genai.GenerativeModel(
            model_name=self.model_name,
        )
        response = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0,
            }
        )
        raw = response.text.strip()
        return raw

    def docling_parse_text(self, num_page: int, path_pdf: str):
        # Lấy tổng số page của PDF
        doc = fitz.open(path_pdf)
        total_pages = doc.page_count
        doc.close()

        # Clamp page range
        start_page = max(1, num_page - 1)
        end_page = min(total_pages, num_page + 1)

        if start_page > end_page:
            return ""

        result = doc_converter.convert(
            source=path_pdf,
            page_range=(start_page, end_page)
        )

        return result.document.export_to_markdown()

    def gen_answers(self, isolation_dic, descriptions):
        
        outputs = {}
        
        for indicator, meta in isolation_dic.items():
            outputs[indicator] = {}
            outputs[indicator]["information"] = {}
            
            context = ""
            
            for path, page_list in meta.items():
                if path not in outputs[indicator]["information"]:
                    outputs[indicator]["information"][path] = {
                        "page_list": [],
                        "context": ""
                    }
                outputs[indicator]["information"][path]["page_list"] = page_list
                if len(page_list) != 0:
                    for page in page_list:
                        context += self.docling_parse_text(int(page), path) + "\n"
                    outputs[indicator]["information"][path]["context"] = context
                
            user_prompt = f"""
            You are an expert financial analyst. 
            Based on the context provided, please extract information related to producted indicator.
            If the indicator includes a description or explanation, you MUST treat it as a strict definition and use it as the primary semantic reference when extracting information from the context.
            The context may not contain information relevant to the indicator. If the required information is NOT explicitly stated in the context, you MUST respond exactly with: "HAVE NOT INFORMATION".

            NOT: AUTO GENERATE ANYTHING THAT IS NOT IN THE CONTEXT.
            
            Indicator: {indicator}

            Descriptions:
            {descriptions}

            Context:
            {context}
            """
            answer = self.run_gemini(user_prompt)
            outputs[indicator]["answer"] = answer

        return outputs 

In [6]:
class PDFRretrieval:
    def __init__(self, mini_model, big_model):
        self.mini_model = mini_model
        self.big_model = big_model
        self.pdf_processor = PDFProcessor(model_name = mini_model)
        self.pdf_page_structure = PDFPageStructure(model_name = big_model)
        self.gen_answer_by_page_context = GenAnswerByPageContext(model_name=big_model)

    def run(self, file_paths, list_indicators, descriptions, type_report, target_site):
        use_paths = file_paths[target_site][type_report]
        TREES = {}
        ISOLATED_DIC = {}

        print("Start processing PDF pages - Gentree")
        time_gentree = 0

        for path in use_paths:
            start = time.time()
            data = self.pdf_processor.read_pdf_by_page(path)
            TREES[path] = data[1]
            end = time.time()
            time_value = end - start
            time_gentree += time_value
            
            print(f"Gentree: {path}", round(time_value, 2), "seconds")
        print("--"*20)
        print("Total Gentree time:", round(time_gentree, 2), "seconds")
        print()

        print("Start isolated")
        start = time.time()
        
        for indicator in list_indicators:
            ISOLATED_DIC[indicator] = {}
            
        for path, context in TREES.items():
            result = self.pdf_page_structure.isolated_pages(list_indicators, context, descriptions)
            isolation_json = json.loads(result)
            for key_indicator, value in isolation_json.items():
                if path not in ISOLATED_DIC[key_indicator]:
                    ISOLATED_DIC[key_indicator][path] = []
                ISOLATED_DIC[key_indicator][path].extend(value['page_list'])

        end = time.time()
        print("Total isolated pages: ", round(end - start, 2), "seconds")
        print()
        
        print("Generate answer from context")
        start = time.time()
        RESULTS = self.gen_answer_by_page_context.gen_answers(ISOLATED_DIC, descriptions)
        end = time.time()
        print("Total generate answer from context: ", round(end - start, 2), "seconds")
        print()
        
        print("------Finished processing------")

        return RESULTS, TREES, ISOLATED_DIC

In [7]:
import os
import re
import requests
from urllib.parse import urlparse


def make_safe_filename(url: str) -> str:
    """
    Generate unique, readable filename from URL
    Example:
    https://ssl4.eir-parts.net/doc/6920/tdnet/2691142/00.pdf
    -> 6920_tdnet_2691142_00.pdf
    """
    parsed = urlparse(url)
    parts = parsed.path.strip("/").split("/")

    # Remove leading 'doc' if exists
    if parts and parts[0] == "doc":
        parts = parts[1:]

    filename = "_".join(parts)

    # Keep safe characters only
    filename = re.sub(r"[^a-zA-Z0-9._-]", "_", filename)

    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"

    return filename


def download_reports(report_urls, save_dir="reports", timeout=30):
    """
    Download list of report URLs to local machine
    and return list of saved file paths
    """
    os.makedirs(save_dir, exist_ok=True)
    saved_files = []

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "application/pdf",
        "Referer": "https://ssl4.eir-parts.net/",
    }

    for url in report_urls:
        url = str(url)

        try:
            filename = make_safe_filename(url)
            save_path = os.path.join(save_dir, filename)

            # Skip if already downloaded
            if os.path.exists(save_path):
                print(f"⏭️ Skipped (exists): {filename}")
                saved_files.append(save_path)
                continue

            print(f"⬇️ Downloading: {filename}")

            r = requests.get(
                url,
                stream=True,
                timeout=timeout,
                headers=headers,
            )
            r.raise_for_status()

            with open(save_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            saved_files.append(save_path)
            print(f"✅ Saved: {save_path}")

        except Exception as e:
            print(f"❌ Failed: {url}")
            print(f"   Reason: {e}")

    return saved_files

## No_1 - E24050 - ＥＮＥＯＳホールディングス株式会社 - ENEOS Holdings,Inc.

In [8]:
# save_dir = "data/" + "hd.eneos.co.jp"
# report_urls = [
#     "https://ssl4.eir-parts.net/doc/5020/ir_material_for_fiscal_ym9/190074/00.pdf",
#     "https://www.hd.eneos.co.jp/esgdb/pdf/system_governance_report.pdf"
# ]
# saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: 00.pdf
❌ Failed: https://ssl4.eir-parts.net/doc/5020/ir_material_for_fiscal_ym9/190074/00.pdf
   Reason: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
⬇️ Downloading: system_governance_report.pdf
✅ Saved: data/hd.eneos.co.jp/system_governance_report.pdf


In [15]:
target_site = "hd.eneos.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/hd.eneos.co.jp/00.pdf",
            ],
        "governance_report":
            [
                "data/hd.eneos.co.jp/system_governance_report.pdf"
            ]
        }
}

In [16]:
type_report = "ir_report"
list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 88/88 [02:00<00:00,  1.37s/page]


Gentree: data/hd.eneos.co.jp/00.pdf 120.3 seconds
----------------------------------------
Total Gentree time: 120.3 seconds

Start isolated
Total isolated pages:  5.33 seconds

Generate answer from context
Total generate answer from context:  16.53 seconds

------Finished processing------


In [17]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/hd.eneos.co.jp/00.pdf': ['10',
   '18',
   '23',
   '81']},
 'IR Event Frequency': {'data/hd.eneos.co.jp/00.pdf': ['48', '64', '79']}}

In [18]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: As stated in the Shareholder Return Policy, the company has positioned the return of profits to shareholders as an important management issue and strives to continue to provide stable dividends with the basic policy of implementing returns reflecting medium-term consolidated business performance and forecasts. The company will continue to enhance shareholder returns, keeping in mind the pursuit of an optimal capital structure and the utilization of allocation management.

Specific actions and plans include:
*   **Third Medium-Term Management Plan (FY2023–FY2024):**
    *   Completed share buybacks totaling approximately 250.0 billion yen.
    *   Increased the annual dividend by 4 yen per share, from 22 yen to 26 yen.
    *   The total payout ratio during this period was 77%.
*   **Fourth Medium-Term Management Plan (FY2025 onwards):**
    *   Decided to further increase the annual dividend by 4 yen per share, from 26 yen to 30 yen, in fisca

In [19]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 18/18 [00:38<00:00,  2.12s/page]


Gentree: data/hd.eneos.co.jp/system_governance_report.pdf 38.18 seconds
----------------------------------------
Total Gentree time: 38.18 seconds

Start isolated
Total isolated pages:  6.32 seconds

Generate answer from context
Total generate answer from context:  9.21 seconds

------Finished processing------


In [20]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/hd.eneos.co.jp/system_governance_report.pdf': ['4',
   '5']}}

In [21]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## 【大株主の状況】

| 氏名又は名称                                   | 所有株式数（株）   |   割合（％） |
|------------------------------------------------|--------------------|--------------|
| 日本マスタートラスト信託銀行株式会社（信託口） | 503,965,300        |        18.66 |
| 株式会社日本カストディ銀行（信託口）           | 171,881,400        |         6.36 |
| STATE STREET BANK WEST CLIENT － TREATY 505234 | 59,616,294         |         2.2  |
| STATE STREET BANK AND TRUST COMPANY505001      | 57,684,408         |         2.13 |
| JPモルガン証券株式会社                         | 54,923,159         |         2.03 |
| 高知信用金庫                                   | 44,540,350         |         1.64 |
| JP MORGAN CHASE BANK 385781                    | 38,647,696         |         1.43 |
| 日本証券金融株式会社                           | 26,983,317         |         0.99 |
| ENEOSグループ従業員持株会                      | 25,512,782         |         0.94 |
| SMBC日興証券株式会社                           | 24,840,789         |         0.92 |



## No_2 - E02520 - 三菱商事株式会社 - Mitsubishi Corporation

In [34]:
save_dir = "data/" + "mitsubishicorp.com"
report_urls = [
    "https://www.mitsubishicorp.com/jp/en/ir/library/ar/assets_r24/pdf/areport/2025/all.pdf?20251216_01",
    "https://www.mitsubishicorp.com/jp/ja/ir/library/assets_r24/pdf/governance_report_j.pdf?2025102201"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: all.pdf
✅ Saved: data/mitsubishicorp.com/all.pdf
⬇️ Downloading: governance_report_j.pdf
✅ Saved: data/mitsubishicorp.com/governance_report_j.pdf


In [35]:
target_site = "mitsubishicorp.com"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/mitsubishicorp.com/all.pdf",
            ],
        "governance_report":
            [
                "data/mitsubishicorp.com/governance_report_j.pdf"
            ]
        }
}

In [25]:
type_report = "ir_report"
list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 56/56 [01:29<00:00,  1.60s/page]


Gentree: data/mitsubishicorp.com/all.pdf 89.41 seconds
----------------------------------------
Total Gentree time: 89.41 seconds

Start isolated
Total isolated pages:  6.79 seconds

Generate answer from context
Total generate answer from context:  18.54 seconds

------Finished processing------


In [26]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/mitsubishicorp.com/all.pdf': ['6',
   '9',
   '13',
   '14',
   '21',
   '23',
   '24']},
 'IR Event Frequency': {'data/mitsubishicorp.com/all.pdf': ['6',
   '9',
   '16',
   '29',
   '33',
   '42']}}

In [27]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: The shareholder return policy consists of progressive dividends and flexible share buybacks.

Specifics of the policy include:
*   **Progressive Dividends:**
    *   Dividends were increased to ¥110 per share for FY2025, a ¥10 increase over FY2024.
    *   The total dividend payout over the three-year period of CS 2027 is projected to be ¥1.4 trillion+.
    *   The company has steadily increased dividends since adopting progressive dividends in FY2016, from ¥27 in FY2016 to ¥110 in FY2025.
    *   Future dividend increases will be flexibly considered based on improvements in earning power and underlying operating CF.
*   **Flexible Share Buybacks:**
    *   A ¥1 trillion share buyback program was announced in April 2025 for FY2025.
    *   The decision for the ¥1 trillion share buyback was driven by the need to rebalance accumulated capital, reflecting improved financial soundness, and to demonstrate commitment to capital-efficient managemen

In [36]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 24/24 [00:54<00:00,  2.25s/page]


Gentree: data/mitsubishicorp.com/governance_report_j.pdf 54.06 seconds
----------------------------------------
Total Gentree time: 54.06 seconds

Start isolated
Total isolated pages:  4.68 seconds

Generate answer from context
Total generate answer from context:  9.41 seconds

------Finished processing------


In [37]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/mitsubishicorp.com/governance_report_j.pdf': ['7']}}

In [38]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: | 日本マスタートラスト信託銀行株式会社（信託口）                             | 619,726,000   |   15.50 |
|----------------------------------------------------------------------------|---------------|---------|
| BNYM AS AGT／CLTS 10 PERCENT                                               | 411,385,693   |   10.29 |
| 株式会社日本カストディ銀行（信託口）                                       | 212,905,700   |    5.32 |
| 明治安田生命保険相互会社                                                   | 140,084,605   |    3.5  |
| 日本マスタートラスト信託銀行株式会社（退職給付信託口・議決権受託者行使型） | 96,830,184    |    2.42 |
| 東京海上日動火災保険株式会社                                               | 85,851,615    |    2.14 |
| STATE STREET BANK WEST CLIENT  -  TREATY 505234                            | 69,802,979    |    1.74 |
| ＪＰモルガン証券株式会社                                                   | 58,603,583    |    1.46 |
| STATE STREET BANK AND TRUST COMPANY 505001                                 | 58,285,334    |    1.45 |
| JP MORGAN CHASE BA

## No_3 - E01991 - レーザーテック株式会社 - Lasertec corporation

In [2]:
save_dir = "data/" + "lasertec.co.jp"
report_urls = [
    "https://ssl4.eir-parts.net/doc/6920/ir_material_for_fiscal_ym15/192733/00.pdf",
    "https://ssl4.eir-parts.net/doc/6920/tdnet/2691142/00.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: 6920_ir_material_for_fiscal_ym15_192733_00.pdf
✅ Saved: data/lasertec.co.jp/6920_ir_material_for_fiscal_ym15_192733_00.pdf
⬇️ Downloading: 6920_tdnet_2691142_00.pdf
✅ Saved: data/lasertec.co.jp/6920_tdnet_2691142_00.pdf


In [13]:
target_site = "lasertec.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/lasertec.co.jp/6920_ir_material_for_fiscal_ym15_192733_00.pdf",
            ],
        "governance_report":
            [
                "data/lasertec.co.jp/6920_tdnet_2691142_00.pdf"
            ]
        }
}

In [27]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 58/58 [02:07<00:00,  2.20s/page]


Gentree: data/lasertec.co.jp/6920_ir_material_for_fiscal_ym15_192733_00.pdf 127.79 seconds
----------------------------------------
Total Gentree time: 127.79 seconds

Start isolated
Total isolated pages:  6.41 seconds

Generate answer from context
Total generate answer from context:  26.78 seconds

------Finished processing------


In [28]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/lasertec.co.jp/6920_ir_material_for_fiscal_ym15_192733_00.pdf': ['52',
   '55']},
 'IR Event Frequency': {'data/lasertec.co.jp/6920_ir_material_for_fiscal_ym15_192733_00.pdf': ['6']}}

In [29]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: 当社の株主還元に関する基本的な考え方は、事業成長をしっかりと果たし、その成果を株主・投資家の皆様にお返しする「成長の結果としての還元」です。

具体的な基準として、連結配当性向35％を目安に、業績に応じた弾力的な配当を実施していく方針です。

2026年6月期は、一時的に売上高や利益が減少する見通しですが、2025年6月期と同額の1株当たり配当（2025年6月期実績は329.0円）とする予定です。これは、安定的な株主還元を継続するとともに、今後の持続的成長に向けた当社の強い意志と、それを可能にする確かな技術があることを示すためです。この結果、2026年6月期の連結配当性向は49.5％となる見込みですが、従来の方針は変えておらず、その枠組みの中での弾力的な運用を検討した結果です。

さらに、資本効率の向上と株主への還元のため、当社としては17年ぶりとなる総額120億円規模の自己株式取得を実施することとしました。

これらの決定は、取締役会で「成長投資と株主還元のバランス」について徹底的に議論した結果、財務健全性を維持しながら、今後の成長に必要な投資余力を確保し、株主・投資家の皆様に還元していくための最適解として、配当維持と自己株式取得を決定したものです。

----------------------------------------------------------------------------------------------------

Indicator: IR Event Frequency
Answer: HAVE NOT INFORMATION

----------------------------------------------------------------------------------------------------



In [30]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 15/15 [00:26<00:00,  1.78s/page]


Gentree: data/lasertec.co.jp/6920_tdnet_2691142_00.pdf 26.63 seconds
----------------------------------------
Total Gentree time: 26.63 seconds

Start isolated
Total isolated pages:  5.96 seconds

Generate answer from context
Total generate answer from context:  9.58 seconds

------Finished processing------


In [31]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/lasertec.co.jp/6920_tdnet_2691142_00.pdf': ['5']}}

In [32]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: 【大株主の状況】

| 氏名又は名称                                   | 所有株式数（株）   |   割合（％） |
|------------------------------------------------|--------------------|--------------|
| 日本マスタートラスト信託銀行株式会社（信託口） | 16,505,300         |        18.3  |
| 株式会社日本カストディ銀行（信託口）           | 6,509,670          |         7.21 |
| 内山 洋                                        | 2,813,200          |         3.11 |
| 内山 秀                                        | 2,788,558          |         3.09 |
| 前田　せつ子                                   | 2,734,300          |         3.03 |
| STATE STREET BANKWEST CLIENT-TREATY 505234     | 1,672,000          |         1.85 |
| 株式会社三菱UFJ銀行                            | 1,504,000          |         1.66 |
| UCHIYAMA　HOLDINGS株式会社                     | 1,477,200          |         1.63 |
| 高橋　はる香                                   | 1,440,000          |         1.59 |
| 楽天証券株式会社                               | 1,044,300          |         1.15 

## No_4 - 伊藤忠商事株式会社 - ITOCHU Corporation

In [12]:
save_dir = "data/" + "itochu.co.jp"
report_urls = [
    "https://www.itochu.co.jp/ja/ir/download/__icsFiles/afieldfile/2025/09/05/ar2025J.pdf",
    "https://www.itochu.co.jp/ja/files/corporate_governance.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf
✅ Saved: data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf
⏭️ Skipped (exists): ja_files_corporate_governance.pdf


In [13]:
target_site = "itochu.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf",
            ],
        "governance_report":
            [
                "data/itochu.co.jp/ja_files_corporate_governance.pdf"
            ]
        }
}

In [14]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 50/50 [02:36<00:00,  3.14s/page]


Gentree: data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf 156.91 seconds
----------------------------------------
Total Gentree time: 156.91 seconds

Start isolated
Total isolated pages:  6.38 seconds

Generate answer from context
Total generate answer from context:  35.56 seconds

------Finished processing------


In [15]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf': ['12',
   '15',
   '17',
   '18']},
 'IR Event Frequency': {'data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf': ['50']}}

In [16]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: **2024年度実績:**
*   総還元性向: 約50%
*   1株当たり配当: 200円（40円の増配）
*   自己株式取得: 1,500億円

**2025年度経営計画:**
*   総還元性向: 50%目途
*   配当: 1株当たり200円または配当性向30%の何れか高い方（今後の利益積み上がりの進捗を踏まえて、早期での増配を目指す所存）
*   自己株式取得: 約1,700億円

**長期的な経営方針:**
*   総還元性向: 40%以上
*   配当性向: 30%、または、1株当たり配当200円の何れか高い方
*   株主還元の持続的な拡充を重視し、基礎収益力の向上にあわせて配当額を着実に増額すると共に、10年連続となる機動的・継続的な自己株式取得を継続し、株主還元を拡充。
*   「3つのバランス（成長投資、株主還元、有利子負債コントロール）」に基づいた財務基盤の堅持。

----------------------------------------------------------------------------------------------------

Indicator: IR Event Frequency
Answer: IR Event Frequency:

**2022年度実績 (2022 results):**
*   アナリスト・機関投資家向け個別ミーティング: 333回
*   アナリスト・機関投資家向け決算説明会: 4回
*   アナリスト・機関投資家向け事業説明会／サステナビリティ説明会: 1回
*   アナリスト・機関投資家向け施設見学会: 0回
*   海外ロードショー: 5回
*   証券会社主催コンファレンス: 6回
*   個人投資家向け説明会: 3回

**2023年度実績 (2023 results):**
*   アナリスト・機関投資家向け個別ミーティング: 426回
*   アナリスト・機関投資家向け決算説明会: 4回
*   アナリスト・機関投資家向け事業説明会／サステナビリティ説明会: 1回
*   アナリスト・機関投資家向け施設見学会: 1回
*   海外ロードショー: 5回
*   証券会

In [19]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 42/42 [01:38<00:00,  2.35s/page]


Gentree: data/itochu.co.jp/ja_files_corporate_governance.pdf 98.73 seconds
----------------------------------------
Total Gentree time: 98.73 seconds

Start isolated
Total isolated pages:  6.04 seconds

Generate answer from context
Total generate answer from context:  8.64 seconds

------Finished processing------


In [20]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/itochu.co.jp/ja_files_corporate_governance.pdf': ['1',
   '10']}}

In [21]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: 【大株主の状況】

| 氏名又は名称                                   | 所有株式数（株）   | 割合（％）   |
|------------------------------------------------|--------------------|--------------|
| 日本マスタートラスト信託銀行株式会社（信託口） | 232,181,600        | 16.36        |
| BNYM AS AGT/CLTS 10 PERCENT                    | 146,102,576        | 10.29        |
| 株式会社日本カストディ銀行（信託口）           | 72,943,700         | 5.14         |
| JPモルガン証券株式会社                         | 36,655,679         | 2.58         |
| 日本生命保険相互会社                           | 34,056,023         | 2.40         |
| 株式会社みずほ銀行                             | 31,200,000         | 2.20         |
| STATE STREET BANK AND TRUST COMPANY 505001     | 31,073,941         | 2.19         |
| STATE STREET BANK WEST CLIENT － TREATY 505234 | 24,439,951         | 1.72         |
| JP MORGAN CHASE BANK 385781                    | 18,941,688         | 1.33         |
| 朝日生命保険相互会社                           | 18,720,500         | 1.32         |

## No_5 - カシオ計算機株式会社 - CASIO COMPUTER CO.,LTD.

In [None]:
save_dir = "data/" + "casio.com"
report_urls = [
    "https://www.casio.co.jp/content/dam/casio/global/corporate/ir/library/annual/2025/integrated-2025.pdf",
    "https://finance-frontend-pc-dist.west.edge.storage-yahoo.jp/disclosure/20251224/20251218522473.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf
✅ Saved: data/itochu.co.jp/ja_ir_download___icsFiles_afieldfile_2025_09_05_ar2025J.pdf
⏭️ Skipped (exists): ja_files_corporate_governance.pdf


In [None]:
target_site = "casio.com"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/casio.com/content_dam_casio_global_corporate_ir_library_annual_2025_integrated-2025.pdf",
            ],
        "governance_report":
            [
                "data/casio.com/disclosure_20251224_20251218522473.pdf"
            ]
        }
}

In [None]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

In [None]:
ISOLATED_DIC_ir

In [None]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

In [None]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

In [None]:
ISOLATED_DIC_governance

In [None]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

## No_6 - 株式会社バンク・オブ・イノベーション - Bank of Innovation, Inc.

In [None]:
save_dir = "data/" + "boi.jp"
report_urls = [
    "https://contents.xj-storage.jp/xcontents/AS80485/6a8dfa8d/7071/47c1/ac15/9a4be8a876b8/140120251113500915.pdf",
    "https://contents.xj-storage.jp/xcontents/AS80485/2e4d59dd/a617/4b86/b05d/2fbfda9ec6a2/S100XD6Y.pdf"
    "https://shikiho.toyokeizai.net/files/tdnet/140120251119506073.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⏭️ Skipped (exists): xcontents_AS80485_6a8dfa8d_7071_47c1_ac15_9a4be8a876b8_140120251113500915.pdf
⏭️ Skipped (exists): xcontents_AS80485_2e4d59dd_a617_4b86_b05d_2fbfda9ec6a2_S100XD6Y.pdf
⏭️ Skipped (exists): files_tdnet_140120251119506073.pdf


In [35]:
target_site = "boi.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/boi.jp/xcontents_AS80485_6a8dfa8d_7071_47c1_ac15_9a4be8a876b8_140120251113500915.pdf",
                "data/boi.jp/xcontents_AS80485_2e4d59dd_a617_4b86_b05d_2fbfda9ec6a2_S100XD6Y.pdf"
            ],
        "governance_report":
            [
                "data/boi.jp/files_tdnet_140120251119506073.pdf"
            ]
        }
}

In [36]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 13/13 [00:12<00:00,  1.04page/s]


Gentree: data/boi.jp/xcontents_AS80485_6a8dfa8d_7071_47c1_ac15_9a4be8a876b8_140120251113500915.pdf 12.55 seconds


Reading PDF pages: 100%|██████████| 85/85 [01:44<00:00,  1.23s/page]


Gentree: data/boi.jp/xcontents_AS80485_2e4d59dd_a617_4b86_b05d_2fbfda9ec6a2_S100XD6Y.pdf 104.59 seconds
----------------------------------------
Total Gentree time: 117.14 seconds

Start isolated
Total isolated pages:  14.54 seconds

Generate answer from context
Total generate answer from context:  56.63 seconds

------Finished processing------


In [None]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/casio.com/content_dam_casio_global_corporate_ir_library_annual_2025_integrated-2025.pdf': ['12',
   '20',
   '51',
   '53']},
 'IR Event Frequency': {'data/casio.com/content_dam_casio_global_corporate_ir_library_annual_2025_integrated-2025.pdf': ['41',
   '47',
   '53']}}

In [None]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: カシオ計算機株式会社の株主還元方針は以下の通りです。

*   **基本方針**: 財務面の中期的視点から、3年間で創出するキャッシュのうち、事業投資と株主還元の双方の強化を重視します。
*   **株主還元額**: 3年間で、配当などの株主還元に300億円超を充てる計画です。また、追加株主還元として150億円＋αを確保しています。
*   **配当方針**: DOE（Dividend on Equity）5%水準での安定配当の維持を目指します。
*   **柔軟性**: 戦略投資枠の未使用分は追加株主還元を検討します。
*   **ROE改善との連動**: ROE改善を目指し、最適資本構成を意識しながら、戦略投資枠の進捗も考慮の上で株主還元を進めます。

----------------------------------------------------------------------------------------------------

Indicator: IR Event Frequency
Answer: 機関投資家や証券アナリストの方々に向けた決算説明会を四半期ごとに開催

----------------------------------------------------------------------------------------------------



In [37]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 10/10 [00:17<00:00,  1.72s/page]


Gentree: data/boi.jp/files_tdnet_140120251119506073.pdf 17.21 seconds
----------------------------------------
Total Gentree time: 17.21 seconds

Start isolated
Total isolated pages:  6.25 seconds

Generate answer from context
Total generate answer from context:  14.58 seconds

------Finished processing------


In [38]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/boi.jp/files_tdnet_140120251119506073.pdf': ['1',
   '2',
   '3']}}

In [39]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: 【大株主の状況】

| 氏名又は名称                                                                       | 所有株式数（株）   |   割合（％） |
|------------------------------------------------------------------------------------|--------------------|--------------|
| 樋口　智裕                                                                         | 1,743,100          |        43.85 |
| 田中　大介                                                                         | 280,000            |         7.04 |
| 楽天証券株式会社                                                                   | 152,700            |         3.84 |
| 株式会社SBI証券                                                                    | 147,964            |         3.72 |
| 株式会社Cygames                                                                    | 79,100             |         1.99 |
| 須田　忠雄                                                                         | 52,300             |         1.31 |
| NOMURA IN

## No_7 - 株式会社　商船三井 - Mitsui O.S.K. Lines, Ltd.

In [40]:
save_dir = "data/" + "mol.co.jp"
report_urls = [
    "https://ir.mol.co.jp/ja/ir/library/integrated_report/main/01/teaserItems2/0/linkList/0/link/(J)MOL%20REPORT_2025.pdf",
    "https://www.mol.co.jp/sustainability/governance/corporate/policy/pdf/governance-report.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf
✅ Saved: data/mol.co.jp/ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf
⬇️ Downloading: sustainability_governance_corporate_policy_pdf_governance-report.pdf
✅ Saved: data/mol.co.jp/sustainability_governance_corporate_policy_pdf_governance-report.pdf


In [9]:
target_site = "mol.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/mol.co.jp/ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf"
            ],
        "governance_report":
            [
                "data/mol.co.jp/sustainability_governance_corporate_policy_pdf_governance-report.pdf"
            ]
        }
}

In [42]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 70/70 [02:13<00:00,  1.91s/page]


Gentree: data/mol.co.jp/ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf 133.75 seconds
----------------------------------------
Total Gentree time: 133.75 seconds

Start isolated
Total isolated pages:  9.33 seconds

Generate answer from context
Total generate answer from context:  40.24 seconds

------Finished processing------


In [43]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/mol.co.jp/ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf': ['8',
   '10',
   '14',
   '22',
   '23',
   '24',
   '40',
   '42',
   '43',
   '69',
   '70']},
 'IR Event Frequency': {'data/mol.co.jp/ja_ir_library_integrated_report_main_01_teaserItems2_0_linkList_0_link__J_MOL_20REPORT_2025.pdf': ['8',
   '40',
   '42',
   '69',
   '70']}}

In [44]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: The Shareholder Return Policy of the company is as follows:

*   **Overall Goal**: To enhance corporate value by achieving stable profits and stable dividends, fostering trust and expectations. The company aims to be a trusted and anticipated corporate group in the long term, with a goal of increasing PBR to 1.2-1.5 times.
*   **Dividend Payout Ratio**: In Phase 1, the dividend payout ratio was 30%. For Phase 2, the company is considering strengthening shareholder returns. At the beginning of the fiscal year, they were considering raising the dividend payout ratio from the current 30% to 40% due to the increase in equity. However, after listening to investors' voices, they are also considering setting an absolute dividend amount and increasing the dividend as performance improves, and will discuss internally what the optimal return policy supported by investors is.
*   **Minimum Dividend**: A minimum dividend of 150 yen per share was introdu

In [10]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 22/22 [00:46<00:00,  2.10s/page]


Gentree: data/mol.co.jp/sustainability_governance_corporate_policy_pdf_governance-report.pdf 46.15 seconds
----------------------------------------
Total Gentree time: 46.15 seconds

Start isolated
Total isolated pages:  9.76 seconds

Generate answer from context
Total generate answer from context:  12.81 seconds

------Finished processing------


In [None]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/boi.jp/files_tdnet_140120251119506073.pdf': ['1',
   '2',
   '3']}}

In [None]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: 【大株主の状況】

| 氏名又は名称                                                                       | 所有株式数（株）   |   割合（％） |
|------------------------------------------------------------------------------------|--------------------|--------------|
| 樋口　智裕                                                                         | 1,743,100          |        43.85 |
| 田中　大介                                                                         | 280,000            |         7.04 |
| 楽天証券株式会社                                                                   | 152,700            |         3.84 |
| 株式会社SBI証券                                                                    | 147,964            |         3.72 |
| 株式会社Cygames                                                                    | 79,100             |         1.99 |
| 須田　忠雄                                                                         | 52,300             |         1.31 |
| NOMURA IN

## No_8 - 任天堂株式会社	Nintendo Co., Ltd.

In [13]:
save_dir = "data/" + "nintendo.co.jp"
report_urls = [
    "https://www.nintendo.co.jp/ir/pdf/2025/annual2503e.pdf",
    "https://www.nintendo.co.jp/ir/en/management/governance.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: ir_pdf_2025_annual2503e.pdf
✅ Saved: data/nintendo.co.jp/ir_pdf_2025_annual2503e.pdf
⬇️ Downloading: ir_en_management_governance.pdf
✅ Saved: data/nintendo.co.jp/ir_en_management_governance.pdf


In [14]:
target_site = "nintendo.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/nintendo.co.jp/ir_pdf_2025_annual2503e.pdf"
            ],
        "governance_report":
            [
                "data/nintendo.co.jp/ir_en_management_governance.pdf"
            ]
        }
}

In [15]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 103/103 [01:50<00:00,  1.07s/page]


Gentree: data/nintendo.co.jp/ir_pdf_2025_annual2503e.pdf 110.2 seconds
----------------------------------------
Total Gentree time: 110.2 seconds

Start isolated
Total isolated pages:  10.05 seconds

Generate answer from context
Total generate answer from context:  74.24 seconds

------Finished processing------


In [16]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/nintendo.co.jp/ir_pdf_2025_annual2503e.pdf': ['2',
   '33',
   '34',
   '39',
   '65',
   '66',
   '67',
   '76',
   '77']},
 'IR Event Frequency': {'data/nintendo.co.jp/ir_pdf_2025_annual2503e.pdf': ['44',
   '103']}}

In [17]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: The Company's basic policy is to internally provide capital for future growth (including capital investments) and maintain a strong, liquid financial position. Direct profit returns to shareholders are made through dividends based on profit levels achieved in each fiscal period.

The Company's basic policy is to distribute surplus twice per year as an interim dividend and a year-end dividend. The interim dividend is determined by the Board of Directors, and the year-end dividend by the General Meeting of Shareholders.

The annual dividend per share is set at the higher of:
1.  33% of consolidated operating profit divided by the total number of outstanding shares (excluding treasury shares) as of the fiscal year-end, rounded up to the nearest yen.
2.  50% consolidated profit standard, rounded up to the nearest yen.

The interim dividend per share is calculated by dividing 33% of consolidated operating profit of the six-month period by the tot

In [18]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 24/24 [00:21<00:00,  1.10page/s]


Gentree: data/nintendo.co.jp/ir_en_management_governance.pdf 21.84 seconds
----------------------------------------
Total Gentree time: 21.84 seconds

Start isolated
Total isolated pages:  5.47 seconds

Generate answer from context
Total generate answer from context:  9.32 seconds

------Finished processing------


In [19]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/nintendo.co.jp/ir_en_management_governance.pdf': ['1',
   '7']}}

In [20]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## [Status of Major Shareholders] New

| Name / Company Name                                                                       | Number of Shares Owned   |   Percentage (%) |
|-------------------------------------------------------------------------------------------|--------------------------|------------------|
| The Master Trust Bank of Japan, Ltd. (Trust Account)                                      | 194,088,600              |            16.67 |
| Custody Bank of Japan, Ltd. (Trust Account)                                               | 64,986,449               |             5.58 |
| The Bank of Kyoto, Ltd.                                                                   | 48,802,000               |             4.19 |
| JP Morgan Chase Bank 380815                                                               | 43,244,600               |             3.71 |
| The Nomura Trust and Banking Co., Ltd. (MUFG Bank,  Ltd. Retiree Allowan

## No_9 - トヨタ自動車株式会社	TOYOTA MOTOR CORPORATION

In [22]:
save_dir = "data/" + "global.toyota"
report_urls = [
    "https://global.toyota/pages/global_toyota/ir/library/annual/2024_001_integrated_en.pdf",
    "https://shikiho.toyokeizai.net/files/tdnet/140120250721517384.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⏭️ Skipped (exists): pages_global_toyota_ir_library_annual_2024_001_integrated_en.pdf
⏭️ Skipped (exists): files_tdnet_140120250721517384.pdf


In [23]:
target_site = "global.toyota"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/global.toyota/pages_global_toyota_ir_library_annual_2024_001_integrated_en.pdf"
            ],
        "governance_report":
            [
                "data/global.toyota/files_tdnet_140120250721517384.pdf"
            ]
        }
}

In [24]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 138/138 [04:26<00:00,  1.93s/page]


Gentree: data/global.toyota/pages_global_toyota_ir_library_annual_2024_001_integrated_en.pdf 266.24 seconds
----------------------------------------
Total Gentree time: 266.24 seconds

Start isolated
Total isolated pages:  6.23 seconds

Generate answer from context
Total generate answer from context:  26.93 seconds

------Finished processing------


In [25]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/global.toyota/pages_global_toyota_ir_library_annual_2024_001_integrated_en.pdf': ['101',
   '102',
   '134']},
 'IR Event Frequency': {'data/global.toyota/pages_global_toyota_ir_library_annual_2024_001_integrated_en.pdf': ['91',
   '92']}}

In [26]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: The Company deems meeting the expectations of its shareholders as an important element of its management policy, and its basic policy for shareholder returns is to reward long-term shareholders.

Specifically, the Company strives to ensure stable and continuous dividend payments. This includes a policy of implementing stable and continuous dividend increases and a shifting of focus to dividends to reward long-term shareholders. In the financial results briefing for November 2024, a forecast for a year-end dividend was announced to further reassure shareholders of this commitment. For fiscal 2024, Toyota issued annual dividend payments of ¥75 per share, comprising an interim dividend of ¥30 per share and a year-end dividend of ¥45 per share, representing a year-on-year increase of ¥15 per share.

The Company also flexibly repurchases its common stock while comprehensively considering factors such as the price of common stock. Repurchased shar

In [27]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 36/36 [01:07<00:00,  1.87s/page]


Gentree: data/global.toyota/files_tdnet_140120250721517384.pdf 67.46 seconds
----------------------------------------
Total Gentree time: 67.46 seconds

Start isolated
Total isolated pages:  16.9 seconds

Generate answer from context
Total generate answer from context:  45.43 seconds

------Finished processing------


In [28]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/global.toyota/files_tdnet_140120250721517384.pdf': ['6',
   '9',
   '11',
   '22',
   '23',
   '24',
   '25',
   '26',
   '27',
   '28']}}

In [29]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## 【大株主の状況】

| 氏名又は名称                                                                                                          | 所有株式数（株）   |   割合（％） |
|-----------------------------------------------------------------------------------------------------------------------|--------------------|--------------|
| 日本マスタートラスト信託銀行株式会社                                                                                  | 180,560,474        |        13.84 |
| 株式会社豊田自動織機                                                                                                  | 119,233,092        |         9.14 |
| 株式会社日本カストディ銀行                                                                                            | 811,647,360        |         6.22 |
| 日本生命保険相互会社                                                                                                  | 633,220,965        |         4.85 |
| ステートストリートバンクアンドトラストカンパニー （常任代理人（株）みずほ銀行決済営業部）               

## No_10 - 株式会社ディー・エヌ・エー	DeNA Co., Ltd

In [41]:
save_dir = "data/" + "dena.com"
report_urls = [
    # "https://asset.dena.com/files/intl/ir/pdf/report/00_2025_en.pdf"
    "https://shikiho.toyokeizai.net/files/tdnet/140120251112598427.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⬇️ Downloading: files_tdnet_140120251112598427.pdf
✅ Saved: data/dena.com/files_tdnet_140120251112598427.pdf


In [42]:
target_site = "dena.com"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/dena.com/00_2025_en.pdf"
            ],
        "governance_report":
            [
                "data/dena.com/files_tdnet_140120251112598427.pdf"
            ]
        }
}

In [43]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 106/106 [01:47<00:00,  1.01s/page]


Gentree: data/dena.com/00_2025_en.pdf 107.46 seconds
----------------------------------------
Total Gentree time: 107.46 seconds

Start isolated
Total isolated pages:  11.68 seconds

Generate answer from context
Total generate answer from context:  34.66 seconds

------Finished processing------


In [44]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/dena.com/00_2025_en.pdf': ['1',
   '4',
   '14',
   '20',
   '102',
   '104',
   '106']},
 'IR Event Frequency': {'data/dena.com/00_2025_en.pdf': ['1',
   '4',
   '71',
   '77',
   '106']}}

In [45]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: DeNA positions shareholder returns as one of its key management policies.

For FY2024, the company implemented a special cash dividend in addition to its regular cash dividend, resulting in a consolidated payout ratio of 30%, due to significant growth in revenue and Non-GAAP operating profit, and a profit in IFRS operating profit.

Future dividends will be comprehensively determined by considering:
*   The balance with growth investments in the mid-term strategy.
*   Business performance trends.
*   Financial status.

The company also engages in share buybacks, with 10.9 billion yen in FY2021 and 15.0 billion yen in FY2022. Consistent dividends are paid based on the company's dividend policy.

----------------------------------------------------------------------------------------------------

Indicator: IR Event Frequency
Answer: HAVE NOT INFORMATION

------------------------------------------------------------------------------------------

In [46]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 54/54 [00:50<00:00,  1.06page/s]


Gentree: data/dena.com/files_tdnet_140120251112598427.pdf 50.8 seconds
----------------------------------------
Total Gentree time: 50.8 seconds

Start isolated
Total isolated pages:  8.97 seconds

Generate answer from context
Total generate answer from context:  9.41 seconds

------Finished processing------


In [47]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/dena.com/files_tdnet_140120251112598427.pdf': ['15',
   '16']}}

In [48]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## [Status of Major Shareholders] [Updated]

| Name / Company Name                                                                                                                                 | Number of Shares Owned   | Percentage (%) |
|-----------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------|
| Tomoko Namba                                                                                                                                        | 19,826,698               | 17.78          |
| Nintendo Co., Ltd.                                                                                                                                  | 15,081,000               | 13.53          |
| The Master Trust Bank of Japan, Ltd.                                                                                      

## No_11 - ＬＩＮＥヤフー株式会社	LY Corporation

In [8]:
save_dir = "data/" + "lycorp.co.jp"
report_urls = [
    # "https://www.lycorp.co.jp/ja/ir/library/report/main/00/teaserItems2/01111/linkList/0/link/integrated_report_FY2024_jp.pdf"
    "https://shikiho.toyokeizai.net/files/tdnet/140120251226527170.pdf"
]
saved_report = download_reports(report_urls, save_dir = save_dir)

⏭️ Skipped (exists): files_tdnet_140120251226527170.pdf


In [9]:
target_site = "lycorp.co.jp"

file_paths = {
    target_site: 
        {"ir_report":
            [
                "data/lycorp.co.jp/integrated_report_FY2024_jp.pdf"
            ],
        "governance_report":
            [
                "data/lycorp.co.jp/files_tdnet_140120251226527170.pdf"
            ]
        }
}

In [10]:
type_report = "ir_report"

list_indicators = [
            "Shareholder Return Policy",
            "IR Event Frequency",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_ir, TREES_ir, ISOLATED_DIC_ir = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 59/59 [02:18<00:00,  2.35s/page]


Gentree: data/lycorp.co.jp/integrated_report_FY2024_jp.pdf 138.85 seconds
----------------------------------------
Total Gentree time: 138.85 seconds

Start isolated
Total isolated pages:  6.4 seconds

Generate answer from context
Total generate answer from context:  32.36 seconds

------Finished processing------


In [11]:
ISOLATED_DIC_ir

{'Shareholder Return Policy': {'data/lycorp.co.jp/integrated_report_FY2024_jp.pdf': ['20',
   '21',
   '23',
   '51']},
 'IR Event Frequency': {'data/lycorp.co.jp/integrated_report_FY2024_jp.pdf': ['51',
   '54']}}

In [12]:
for indicator, meta in RESULTS_ir.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Shareholder Return Policy
Answer: LINEヤフーグループの株主還元政策は以下の通りです。

*   **資本政策の考え方:**
    *   利益成長を意識した安定配当
    *   機動的な自己株式取得
    *   今後5年間（2025年度～2029年度）で累計総還元性向70%以上を目指す

*   **具体的な施策（2024年度）:**
    *   プライム市場への上場維持を目的とした約1,500億円規模の自己株式取得を実施。
    *   1株当たり配当を5.56円から7.00円に増配。

*   **今後の配当政策:**
    *   当期利益の成長率を鑑み、安定配当を継続しつつ、増配も検討。
    *   ROE向上など資本効率の改善を目的として、適切なタイミングで自己株式の取得を実施。

----------------------------------------------------------------------------------------------------

Indicator: IR Event Frequency
Answer: 投資家フィードバック・決算説明方針・決算報告：毎四半期

----------------------------------------------------------------------------------------------------



In [None]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
            "Controlling Shareholder""
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 40/40 [00:36<00:00,  1.11page/s]


Gentree: data/lycorp.co.jp/files_tdnet_140120251226527170.pdf 36.18 seconds
----------------------------------------
Total Gentree time: 36.18 seconds

Start isolated
Total isolated pages:  6.35 seconds

Generate answer from context
Total generate answer from context:  11.61 seconds

------Finished processing------


In [14]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/lycorp.co.jp/files_tdnet_140120251226527170.pdf': ['1',
   '15',
   '17',
   '18']}}

In [15]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## Major Shareholder Structure

**Status of Major Shareholders (as of March 31, 2025):**

| Name or Company Name                                    | Number of Shares Owned   | Percentage (%)   |
|---------------------------------------------------------|--------------------------|------------------|
| A Holdings Corporation                                  | 4,467,326,675            | 62.44            |
| The Master Trust Bank of Japan, Ltd. (Trust Account)    | 508,913,300              | 7.11             |
| STATE STREET BANK AND TRUST COMPANY 505325              | 235,044,681              | 3.29             |
| Custody Bank of Japan, Ltd. (Trust account)             | 208,661,700              | 2.92             |
| STATE STREET BANK AND TRUST COMPANY 505001              | 97,103,019               | 1.36             |
| STATE STREET BANK WEST CLIENT-TREATY 505234             | 56,668,849               | 0.79             |
| STATE STREET 

In [16]:
type_report = "governance_report"
list_indicators = [
            "Major Shareholder Structure",
            "Controlling Shareholder",
        ]

descriptions = {
    # "Owner-managed Company": "A company run by one or a few individuals, including the founder or family members, and holding more than 50% of the shares -> Family company"
    }

main = PDFRretrieval(mini_model="qwen2.5:1.5b", big_model="gemini-2.5-flash")
RESULTS_governance, TREES_governance, ISOLATED_DIC_governance = main.run(file_paths= file_paths, list_indicators=list_indicators, descriptions=descriptions, type_report=type_report, target_site=target_site)

Start processing PDF pages - Gentree


Reading PDF pages: 100%|██████████| 40/40 [00:36<00:00,  1.09page/s]


Gentree: data/lycorp.co.jp/files_tdnet_140120251226527170.pdf 36.85 seconds
----------------------------------------
Total Gentree time: 36.85 seconds

Start isolated
Total isolated pages:  4.72 seconds

Generate answer from context
Total generate answer from context:  27.44 seconds

------Finished processing------


In [17]:
ISOLATED_DIC_governance

{'Major Shareholder Structure': {'data/lycorp.co.jp/files_tdnet_140120251226527170.pdf': ['1',
   '15',
   '17',
   '18',
   '19',
   '22']},
 'Controlling Shareholder': {'data/lycorp.co.jp/files_tdnet_140120251226527170.pdf': ['1',
   '15',
   '17',
   '18',
   '22']}}

In [18]:
for indicator, meta in RESULTS_governance.items():
    print(f"Indicator: {indicator}")
    print(f"Answer: {meta['answer']}")
    print()
    print("-" * 100)
    print()

Indicator: Major Shareholder Structure
Answer: ## Status of Major Shareholders

| Name or Company Name                                    | Number of Shares Owned   | Percentage (%)   |
|---------------------------------------------------------|--------------------------|------------------|
| A Holdings Corporation                                  | 4,467,326,675            | 62 . 44          |
| The Master Trust Bank of Japan, Ltd. (Trust Account)    | 508,913,300              | 7.11             |
| STATE STREET BANK AND TRUST COMPANY 505325              | 235,044,681              | 3.29             |
| Custody Bank of Japan, Ltd. (Trust account)             | 208,661,700              | 2.92             |
| STATE STREET BANK AND TRUST COMPANY 505001              | 97,103,019               | 1.36             |
| STATE STREET BANK WEST CLIENT-TREATY 505234             | 56,668,849               | 0.79             |
| STATE STREET BANK AND TRUST COMPANY 505223              | 48,958,854  