In [None]:
import os
import uuid
import logging
from dotenv import load_dotenv

from utils import (
    extract_pdf_content_streams,
    build_pdf_prompt,
    call_llm_generate_streams,
    build_full_pdf_from_streams
)

# Logging: INFO-level root configuration plus a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the local .env file, then pick up the Ollama endpoint and the model
# name from the environment (each is None when its variable is not set)
load_dotenv(dotenv_path=".env")
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL")
LLM_NAME = os.environ.get("LLM")

# Reusable function for PDF accessibility processing
def process_pdf_for_accessibility(input_pdf_path, metadata="{}", output_dir="/tmp", retries=2):
    """Rewrite one PDF's content streams through the LLM and save the result.

    The content streams are extracted from ``input_pdf_path``, embedded in an
    instruction prompt, and sent to the model configured by ``LLM_NAME`` at
    ``OLLAMA_BASE_URL``. The streams returned by the model are then assembled
    into a new PDF named ``accessible_<original basename>`` inside
    ``output_dir``.

    Args:
        input_pdf_path: Path of the PDF to process.
        metadata: Accepted for backward compatibility; not used by this
            function.
        output_dir: Directory that receives the generated PDF. It is created
            if it does not exist yet.
        retries: Retry count forwarded to ``call_llm_generate_streams``.

    Returns:
        The path of the generated PDF.

    Raises:
        Exceptions raised by the ``utils`` helpers propagate unchanged; this
        function does no error handling of its own.
    """
    logger.info("📄 Processing file: %s", input_pdf_path)

    logger.info("🔍 Extracting PDF content streams...")
    content_streams = extract_pdf_content_streams(input_pdf_path)
    logger.info("📄 Extracted %d content stream(s)", len(content_streams))

    logger.info("🧾 Building LLM prompt")
    prompt_lines = [
        "You are a PDF cleaner. Your task is to rewrite PDF content streams for accessibility.",
        "Each input is a low-level PDF page content stream (BT/ET block).",
        "For each stream, return an improved version using Helvetica font, 12pt size, and correctly positioned text.",
        "Ensure the syntax remains valid PDF content streams with BT/ET blocks.",
        "Respond only with updated PDF streams — no explanation or formatting.",
        "Each stream must be returned between `<< /Length L >>` and `endstream`, where L is the byte length of the stream body.",
        "Do not wrap the response in code fences or markdown.",
    ]

    prompt = build_pdf_prompt(prompt_lines, content_streams)
    # Only the first 500 characters are logged to keep debug output readable.
    logger.debug("📨 Prompt preview:\n%s", prompt[:500])

    logger.info("🔁 Calling LLM with up to %d retries", retries)
    improved_streams = call_llm_generate_streams(
        prompt, model=LLM_NAME, base_url=OLLAMA_BASE_URL, retries=retries
    )
    logger.info("✅ LLM returned %d improved stream(s)", len(improved_streams))

    # Make sure the destination exists so a direct call with a fresh directory
    # does not depend on the PDF builder creating it. An empty output_dir
    # means "current directory" for os.path.join and must not reach makedirs.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"accessible_{os.path.basename(input_pdf_path)}")
    logger.info("🛠️  Building PDF to: %s", output_path)
    build_full_pdf_from_streams(improved_streams, output_path)

    return output_path

# Batch processing loop
def process_pdf_directory(input_dir, output_dir, retries=2):
    """Run the accessibility rewrite over every PDF file in a directory.

    Entries of ``input_dir`` whose name ends in ``.pdf`` (case-insensitive)
    and that are regular files are processed one by one, in sorted name
    order. A failure on one file is reported on stdout and does not stop the
    remaining files from being processed.

    Args:
        input_dir: Directory to scan (not recursive) for PDF files.
        output_dir: Directory for the generated PDFs; created if missing.
        retries: Retry count forwarded to ``process_pdf_for_accessibility``.

    Returns:
        None. One status line per processed file is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    # os.listdir() yields entries in arbitrary order; sorting makes the batch
    # order (and therefore the printed report) reproducible between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(".pdf"):
            continue
        input_path = os.path.join(input_dir, filename)
        if not os.path.isfile(input_path):
            # Skip e.g. a sub-directory whose name happens to end in ".pdf".
            continue
        try:
            output_path = process_pdf_for_accessibility(input_path, output_dir=output_dir, retries=retries)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"❌ Failed: {filename} — {e}")
        else:
            print(f"✅ Processed: {filename} → {output_path}")
# 🔧 Folders to read source PDFs from and to write rewritten PDFs into
INPUT_DIR = "./ipynb-input"
OUTPUT_DIR = "./ipynb-output"

# 🏃 Kick off the batch run over the PDFs found in INPUT_DIR
process_pdf_directory(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR)


INFO:__main__:📄 Processing file: ./ipynb-input/2018-08_Poster-Ersteinschaetzung_UT-1.pdf
INFO:__main__:🔍 Extracting PDF content streams...
INFO:utils:📂 Opening PDF for content stream extraction: ./ipynb-input/2018-08_Poster-Ersteinschaetzung_UT-1.pdf
INFO:utils:📃 Page 1
INFO:utils:📄 Extracted content (page 1):

q
0 0 1190.551 1683.78 re
W
n
/GS0 gs
/CS0 cs
.898 0 .312 scn
1 0 0 1 70.984299 918.553589 cm
0 0 m
0 -41.858 l
0 -48.486 5.373 -53.858 12 -53.858 c
449.02 -53.858 l
455.647 -53.858 461.02 -48.486 461.02 -41.858 c
461.02 0 l
f
Q
BT
/GS0 gs
/CS0 cs
1 1 1 scn
/F0 1 Tf
32 0 0 32 249.4554 879.6245 Tm
(Sofort)Tj
ET
q
0 0 1190.551 1683.78 re
W
n
1 .833 0 scn
1 0 0 1 70.984299 790.994629 cm
0 0 m
0 -41.858 l
0 -48.486 5.373 -53.858 12 -53.858 c
449.02 -53.858 l
455.647 -53.858 461.02 -48.486 461.02 -41.858 c
461.02 0 l
f
Q
BT
/GS1 gs
0 0 0 scn
32 0 0 32 230.6035 752.0655 Tm
(Dringend)Tj
ET
q
0 0 1190.551 1683.78 re
W
n
/GS2 gs
.964 .626 0 scn
1 0 0 1 70.984299 854.774109 cm
0 0 m
0 -41

❌ Failed: 2018-08_Poster-Ersteinschaetzung_UT-1.pdf — LLM failed to return valid PDF stream content after 2 attempts.
