In [None]:
import json
import os
import os.path
import re
import sys
import zipfile
from pathlib import Path

import openpyxl
import pandas as pd
import requests

from dotenv import load_dotenv

load_dotenv()

sys.path.append("..")


ADOBE PDF Extractions API

In [None]:
from function.adobe import adobeLoader, extract_text_from_file_adobe, get_dict_xlsx

client_id = os.getenv("ADOBE_CLIENT_ID")
client_secret = os.getenv("ADOBE_CLIENT_SECRET")
# Adobe output zip file path
input_pdf = "../data/raw/Interim-Report-2024.pdf"
output_zip_path = "../data/processed/adobe_result/Interim-Report-2024/sdk.zip"
output_zipextract_folder = "../data/processed/adobe_result/Interim-Report-2024/"
# Run adobe API
adobeLoader(
    input_pdf,
    output_zip_path,
)
# unzip adobe output zipfile, extract text and table from adobe output zip file
json_strings = extract_text_from_file_adobe(output_zip_path, output_zipextract_folder)

output zipextract folder: ../data/processed/adobe_result/Interim-Report-2024/
output zip path: ../data/processed/adobe_result/Interim-Report-2024/sdk.zip
2025-08-24 21:42:19 JSON file already exists. Skipping extraction.
2025-08-24 21:42:19 open json file
2025-08-24 21:42:19 extract text


In [3]:
output_folder = "../data/processed/adobe_result/Interim-Report-2024/tables"
xlsx_files = [f for f in os.listdir(output_folder) if f.endswith(".xlsx")]
for xlsx_file in xlsx_files:
    print(f"\nTable from {xlsx_file}:")
    data_dict = get_dict_xlsx(output_folder, xlsx_file)
    df = pd.DataFrame(data_dict)
    print(df.to_markdown(index=False))
    print("\n---")


Table from fileoutpart10.xlsx:
| Amounts in millions of euros                             | June 30, 2024    | December 31, 2023    |
|:---------------------------------------------------------|:-----------------|:---------------------|
| Gross carrying amount loans and advances to customers    | 467,704          | 453,550              |
| Hedge accounting adjustment                              | (11,175)         | (11,379)             |
| Impairment allowances on loans and advances to customers | (2,897)          | (2,909)              |
| Total loans and advances to customers                    | 453,632          | 439,262              |

---

Table from fileoutpart6.xlsx:
| Amounts in millions of euros                                   |   First half-year 2024  | First half-year 2023    |
|:---------------------------------------------------------------|------------------------:|:------------------------|
| Additions and releases of provisions                           |          

In [4]:
# create output md file
md_output_path = "../data/processed/adobe_result/Interim-Report-2024/markdown_tables.md"
with open(md_output_path, "w") as f:
    for xlsx_file in xlsx_files:
        f.write(f"\n## Table from {xlsx_file}:\n\n")
        data_dict = get_dict_xlsx(output_folder, xlsx_file)
        df = pd.DataFrame(data_dict)
        f.write(df.to_markdown(index=False))


Agentic document extraction Landing AI

In [22]:
api_key = os.getenv("LANDINGAI_API_KEY")
input_pdf = "../data/raw/Our-Impact-in-2023.pdf"

url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"
files = {"pdf": open(f"{input_pdf}", "rb")}
headers = {
    "Authorization": f"Basic {api_key}",
}
response = requests.post(url, files=files, headers=headers)

print(response.json())

{'data': {'markdown': 'Introduction <!-- text, from page 0 (l=0.034,t=0.022,r=0.078,b=0.042), with ID ee87f48a-031d-48ea-91de-9912ddd4b830 -->\n\nAbout This Report <!-- text, from page 0 (l=0.106,t=0.022,r=0.166,b=0.043), with ID 6dcdd2ae-563b-414f-91dd-861f75420d5f -->\n\nForeword <!-- text, from page 0 (l=0.193,t=0.023,r=0.228,b=0.042), with ID 9cbd804d-1ae0-48b5-ad12-94c3e41192fa -->\n\nRabobank at a Glance <!-- text, from page 0 (l=0.255,t=0.023,r=0.326,b=0.042), with ID 80db21b2-6410-4bca-88c4-c27d992d1bbc -->\n\nAbout Us <!-- text, from page 0 (l=0.353,t=0.023,r=0.387,b=0.042), with ID 998525a8-074f-4d3b-875e-75c933244c5b -->\n\nValue Creation <!-- text, from page 0 (l=0.414,t=0.023,r=0.464,b=0.042), with ID e109f1b9-b78c-4822-b3b4-c3462e0fe26a -->\n\nBetter World <!-- text, from page 0 (l=0.492,t=0.023,r=0.535,b=0.042), with ID 3554e185-2c3b-49b9-8183-c632c1103242 -->\n\nBetter Bank <!-- text, from page 0 (l=0.563,t=0.023,r=0.603,b=0.042), with ID 6be8fe16-3a02-4b2c-810d-713f3ac

In [24]:
# Extract the markdown content from the response
markdown_content = response.json()["data"]["markdown"]
output_md = "../data/processed/landingai_result/Our-Impact-in-2023.md"
os.makedirs(os.path.dirname(output_md), exist_ok=True)

# Save the markdown output
with open(output_md, "w") as f:
    f.write(markdown_content)

Docling

In [None]:
from docling.document_converter import DocumentConverter

# input and output paths
input_pdf = "../data/raw/Interim-Report-2024.pdf"
output_md = "../data/processed/docling_result/Interim-Report-2024.md"

converter = DocumentConverter()
result = converter.convert(input_pdf)

# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(output_md), exist_ok=True)
# save  md output
with open(output_md, "w") as f:
    f.write(result.document.export_to_markdown())


In [46]:
import os
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

# Configure OCR and image processing
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True 
pipeline_options.do_table_structure = True 
pipeline_options.table_structure_options.do_cell_matching = True

pdf_format_options = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=PyPdfiumDocumentBackend 
)

# Create converter with image processing enabled
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: pdf_format_options,
    }
)

# Input and output paths
input_pdf = "../data/raw/Our-Impact-in-2023.pdf"
output_md = "../data/processed/docling_result/Our-Impact-in-2023.md"

# Convert with OCR and image processing
result = converter.convert(input_pdf)

# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(output_md), exist_ok=True)

# Save markdown output (now with OCR'd content from images)
with open(output_md, "w") as f:
    f.write(result.document.export_to_markdown())

print(f"Docling output with OCR saved to {output_md}")

Docling output with OCR saved to ../data/processed/docling_result/Our-Impact-in-2023.md


MISTRAL OCR

In [None]:
from mistralai import DocumentURLChunk, Mistral
from mistralai.models import OCRResponse

api_key = os.getenv("MISTRAL_API_KEY")
client = Mistral(api_key=api_key)


def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str


def get_combined_markdown(ocr_response: OCRResponse) -> str:
    markdowns: list[str] = []
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)


# input and output paths
input_pdf = "../data/raw/Interim-Report-2024.pdf"
output_md = "../data/processed/mistral_result/Interim-Report-2024.md"

os.makedirs(os.path.dirname(output_md), exist_ok=True)
with open(input_pdf, "rb") as f:
    uploaded_file = client.files.upload(
        file={
            "file_name": Path(input_pdf).stem,
            "content": f.read(),
        },
        purpose="ocr",
    )

# get signed URL
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

# process with OCR
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True,
)

# generate markdown
markdown = get_combined_markdown(pdf_response)

# save the markdown output
with open(output_md, "w") as f:
    f.write(markdown)

print(f"Mistral output saved to {output_md}")

Mistral output saved to ../data/processed/mistral_result/Interim-Report-2024.md


In [None]:
import os
from huggingface_hub import snapshot_download
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import (
    ConversionResult,
    DocumentConverter,
    InputFormat,
    PdfFormatOption,
)

source = "../data/raw/Our-Impact-in-2023.pdf"

print("Downloading RapidOCR models")
download_path = snapshot_download(repo_id="SWHL/RapidOCR")

det_model_path = os.path.join(
    download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
)
rec_model_path = os.path.join(
    download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
)
cls_model_path = os.path.join(
    download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
)

ocr_options = RapidOcrOptions(
    det_model_path=det_model_path,
    rec_model_path=rec_model_path,
    cls_model_path=cls_model_path,
)

pipeline_options = PdfPipelineOptions(
    ocr_options=ocr_options,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        ),
    },
)

conversion_result: ConversionResult = converter.convert(source=source)
doc = conversion_result.document
md = doc.export_to_markdown()
print(md) # => still skip parsing images

Downloading RapidOCR models




Introduction

About This Report

Foreword

Rabobank at a Glance

About Us

Value Creation

<!-- image -->

Introduction

About This Report

Foreword

Rabobank at a Glance

About Us

Value Creation

Better World

Better Bank

Appendix

Disclaimer

PCAF's Data Quality Scores (score 1-5) are used to qualify the level of precision of the estimates we disclose per PCAF asset class. Data quality scores are specific to each asset class. Data with PCAF scores 1/2 , (verified and unverified respectively) company-level reported emissions data, represent the highest data quality and results in the most accurate emissions tial real estimates. This is followed by physical activity-based emissions (PCAF score 3), and PCAF scores 4/5, given for an economic activity-based emissions approach. We strive to include the best data quality score based on data availability and feasibility. s Details on the average data quality score per portfolio can be found in Appendix 3 of the Annual Report s for each of 

In [48]:
import pytesseract
from PIL import Image

image = Image.open("../data/raw/figures/Our-Impact-in-2023.jpg")
text = pytesseract.image_to_string(image)
print(text)

Introduction About This Report

Financed emissions in Mt CO2e

48.8 51.5

Financed Financed
emissions emissions
2022 2021

Foreword

2022 2021

5 3 Mt CO2e

Avoided emissions
via renewable energy
portfolio

2021: 4.8 Mt CO2e

Our Impact in 2023 - Better World

—_

Rabobank at a Glance

About Us Value Creation

Loans to private individuals

MM 19 2022

| 2.0 2021

Residential
real estate

Dutch business clients
9.3 2022
stekeme 2021
Trade, Industry 2.3
& Services ya §=2021

Commercial 0.3 2022
Real Estate 0.3 2021

Food &
Agriculture

Wholesale & Rural

Wholesale wae 2022

clients PA 2021

Rural ewe 2022
clients eke) 2021
Leasing international

DLL tractors 1.6 2022

assets 1.5 2021

DLL transport 0.4 2022

aunens 0.4 2021

Other assets

Sovereign |] 1.4 2022
Bonds 1.8 2021
aides | 0.2 2022
Investments 0.1 2021

Better World

Better Bank Appendix Disclaimer

PCAF's Data Quality Scores (score 1-5) are used to qualify the level of precision of the estimates we disclose per PCAF asset
clas

### Sending documents directly to LLM's API

In [60]:
import pathlib

from google import genai
from google.genai import types

client = genai.Client()

# Retrieve and encode the PDF byte
input_pdf = "../data/raw/Our-Impact-in-2023.pdf"
filepath = pathlib.Path(input_pdf)

prompt = "Extract all the content from the document, including the tables and images"
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(
            data=filepath.read_bytes(),
            mime_type="application/pdf",
        ),
        prompt,
    ],
)
print(response.text)

**Page 1**

**Header Navigation:**
Introduction | About This Report | Foreword | Rabobank at a Glance | About Us | Value Creation | Better World | Better Bank | Appendix | Disclaimer | [Hamburger menu icon]

**Main Content:**

**Financed emissions in Mt CO2e**

**Left Chart: Bar Chart - Financed Emissions**
*   **48.8** Financed emissions **2022**
*   **51.5** Financed emissions **2021**
*   (Bar for 2022 is slightly shorter than for 2021)

**Left Text (below chart):**
**5.3 Mt CO2e**
Avoided emissions via renewable energy portfolio
2021: 4.8 Mt CO2e
(Image: A wind turbine with solar panels at its base and a small figure standing next to them.)

**Right Chart: Bar Charts by Loan Category**

**Loans to private individuals**
*   Residential real estate:
    *   1.9 **2022**
    *   2.0 **2021**

**Dutch business clients**
*   Food & Agriculture:
    *   9.3 **2022**
    *   10.0 **2021**
*   Trade, Industry & Services:
    *   2.3 **2022**
    *   2.9 **2021**
*   Commercial Real Estate:

In [None]:
import base64

from openai import OpenAI

client = OpenAI()
input_pdf = "../data/raw/Our-Impact-in-2023.pdf"

with open(input_pdf, "rb") as f:
    data = f.read()

base64_string = base64.b64encode(data).decode("utf-8")

completion = client.chat.completions.create(
    model="gpt-5",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "filename": "draconomicon.pdf",
                        "file_data": f"data:application/pdf;base64,{base64_string}",
                    },
                },
                {
                    "type": "text",
                    "text": "Extract all the content from the document, including the tables and images",
                },
            ],
        },
    ],
)

print(completion.choices[0].message.content)

Below is a complete extraction of the document’s content, including all headings, narrative text, tables, and what is visible in the page images. I’ve organized it by page, reflecting the layout shown in the images.

Page 1 (as shown in the image for page 1)
Top navigation/header (visible in the page image):
Foreword Management Report Interim Financial Statements 2024

Footer/page reference (visible in the page image):
Interim Report 2024 - Management Report 27

Section: Balance Sheet Developments

Balance Sheet
Amounts in billions of euros    06-30-2024    12-31-2023
Cash and cash equivalents       80.6          90.5
Loans and advances to customers 453.6         439.3
Financial assets                24.2          20.4
Loans and advances to banks     25.9          26.5
Derivatives                     21.0          22.0
Other assets                    16.3          15.1
Total assets                    621.6         613.8

Deposits from customers         401.6         391.4
Debt securiti