In [None]:
import os
import logging
import subprocess
from pathlib import Path
from dotenv import load_dotenv
import fitz  # PyMuPDF
import requests
from fpdf import FPDF

# Setup logging: configure the root logger at INFO level and create the
# module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load env: read settings from a ".env" file in the current working directory
# (python-dotenv) before the os.getenv() lookups below.
load_dotenv(".env")
# Base URL of the LLM server; "/api/generate" is appended when the request is
# sent. This is None when the variable is not set.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL")
# Model name sent in the request payload. This is None when "LLM" is not set.
LLM_NAME = os.getenv("LLM")

# Number of LLM attempts passed to process_pdf_directory() by the entry point
# at the bottom of the file. int() raises ValueError on a non-numeric value.
MAX_RETRIES_LLM_LOOP_DEFAULT = int(os.getenv("MAX_RETRIES_LLM_LOOP_DEFAULT", "5"))

# Optional hyperparameters
# LIMIT_LINES is True only when the variable equals "true" (case-insensitive);
# any other value, or no value at all, leaves truncation disabled.
LIMIT_LINES = os.getenv("LIMIT_LINES", "False").lower() == "true"
# Maximum number of lines kept per stream when LIMIT_LINES is enabled.
LINES_LIMIT = int(os.getenv("LINES_LIMIT", "1000"))

# Log the effective configuration once at start-up.
logger.info("🤖 Using LLM model: %s", LLM_NAME)
logger.info("🔧 LIMIT_LINES=%s, LINES_LIMIT=%d", LIMIT_LINES, LINES_LIMIT)

# --- QPDF decompression ---
def uncompress_pdf(input_pdf_path: Path, output_pdf_path: Path):
    """Rewrite a PDF in qpdf's QDF form with object streams disabled.

    Runs the external ``qpdf`` executable, which must be available on PATH.

    Args:
        input_pdf_path: PDF file to read.
        output_pdf_path: Destination of the rewritten PDF.

    Raises:
        subprocess.CalledProcessError: If qpdf exits with a status other
            than 0 (success) or 3 (success with warnings).
        FileNotFoundError: If the ``qpdf`` executable cannot be started.
    """
    qpdf_warning_status = 3

    logger.info("🔓 Uncompressing: %s", input_pdf_path)
    command = [
        "qpdf", "--qdf", "--object-streams=disable",
        str(input_pdf_path), str(output_pdf_path),
    ]
    completed = subprocess.run(command, check=False)

    if completed.returncode == qpdf_warning_status:
        # qpdf reports status 3 when it only emitted warnings; the output
        # file is still written, so this must not be treated as a failure.
        logger.warning("⚠️ qpdf reported warnings for: %s", input_pdf_path)
    elif completed.returncode != 0:
        raise subprocess.CalledProcessError(completed.returncode, command)

    logger.info("💾 Saved uncompressed to: %s", output_pdf_path)

# --- Extract content streams from PDF ---
def extract_pdf_content_streams(path: str, limit_lines=False, max_lines=1000) -> list[str]:
    """Return one decoded content-stream string per page of a PDF.

    Every content stream object of a page is concatenated and decoded as
    UTF-8, silently dropping bytes that are not valid UTF-8. A page without
    any content stream contributes the placeholder ``"% No content"``.

    Args:
        path: Path of the PDF to open.
        limit_lines: When true, keep only the first ``max_lines`` lines of
            each page's stream.
        max_lines: Line cap applied when ``limit_lines`` is true.

    Returns:
        A list with exactly one entry per page, in page order.
    """
    logger.info("📂 Extracting from PDF: %s", path)
    streams = []
    doc = fitz.open(path)
    try:
        for i, page in enumerate(doc):
            xrefs = page.get_contents()
            if not xrefs:
                logger.warning("⚠️ No content stream on page %d", i + 1)
                streams.append("% No content")
                continue
            # PyMuPDF documents xref_stream() as returning None for objects
            # that are not streams; treat those as empty so join() cannot fail.
            raw = b"".join(doc.xref_stream(x) or b"" for x in xrefs)
            # errors="ignore" never raises, so no exception handling is needed.
            decoded = raw.decode("utf-8", errors="ignore")
            if limit_lines:
                decoded = "\n".join(decoded.splitlines()[:max_lines])
            streams.append(decoded)
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
    logger.info("✅ Extracted %d stream(s)", len(streams))
    return streams


# --- Build prompt using examples and target streams ---
def build_prompt_with_examples(example_pairs, target_streams, limit_lines=False, max_lines=1000):
    """Assemble the few-shot prompt text sent to the LLM.

    The prompt consists of a fixed instruction header, one numbered
    "Input/Output" section per example pair, the line
    "Now improve the following:", and finally every target stream.

    Args:
        example_pairs: Iterable of ``(non_accessible, accessible)`` stream
            strings used as examples.
        target_streams: Iterable of stream strings to be improved.
        limit_lines: When true, every stream is cut to its first
            ``max_lines`` lines after surrounding whitespace is stripped.
        max_lines: Line cap applied when ``limit_lines`` is true.

    Returns:
        The complete prompt as a single string.
    """
    def _prepare(text):
        # Strip surrounding whitespace, then optionally cap the line count.
        trimmed = text.strip()
        if limit_lines:
            return "\n".join(trimmed.splitlines()[:max_lines])
        return trimmed

    instructions = (
        "You are a low-level PDF code generator.",
        "Given raw PDF page content streams (e.g., BT/ET text drawing blocks), your job is to rewrite each stream for accessibility.",
        "Only return the improved PDF stream. Do not explain or comment.",
        "Do not interpret the data. Do not describe the content. Do not wrap in markdown or text blocks.",
        "Each input stream will be followed by your output stream.",
        "Here are some examples of non-barrier-free PDF source code, and its barrier-free counterpart, respectively:",
    )

    sections = ["\n".join(instructions)]
    for number, (inaccessible, accessible) in enumerate(example_pairs, start=1):
        source_text = _prepare(inaccessible)
        improved_text = _prepare(accessible)
        sections.append(
            f"\n\n% Example {number}:\nInput:\n{source_text}\nOutput:\n{improved_text}"
        )
    sections.append("\n\nNow improve the following:")
    sections.extend(f"\n\n{_prepare(target)}" for target in target_streams)
    return "".join(sections)

# --- Call LLM via POST ---
def call_llm_generate_streams(prompt: str, model: str, base_url: str, retries: int = 2) -> list[str]:
    """Ask the LLM for improved streams and split its reply into blocks.

    The prompt is POSTed to ``<base_url>/api/generate`` with streaming
    disabled. An attempt is discarded and the next one started when the
    request fails, the body is not usable JSON, the reply contains a
    Markdown code fence, or no non-empty block remains after splitting.

    Args:
        prompt: Full prompt text.
        model: Model name placed in the request payload.
        base_url: Server base URL; a trailing slash is tolerated.
        retries: Maximum number of attempts.

    Returns:
        The reply split on ``"endstream"``, each non-empty chunk stripped
        and terminated with ``"\\nendstream"``.

    Raises:
        RuntimeError: If no attempt produced a usable reply.
    """
    logger.info("🚀 Calling LLM model '%s' at %s", model, base_url)
    # Avoid a doubled slash when the configured base URL ends with "/".
    endpoint = str(base_url).rstrip("/") + "/api/generate"
    payload = {"model": model, "prompt": prompt, "stream": False}

    for attempt in range(1, retries + 1):
        logger.info("🧠 LLM call attempt %d/%d", attempt, retries)
        try:
            r = requests.post(endpoint, json=payload)
            r.raise_for_status()
            # Parse inside the try block so a malformed body is retried
            # instead of aborting the whole retry loop.
            body = r.json()
        except Exception as e:  # deliberate best-effort: any failure means "retry"
            logger.error("🚨 LLM request failed: %s", e)
            continue

        response = body.get("response", "") if isinstance(body, dict) else None
        if not isinstance(response, str):
            # A null or non-text field cannot be sliced or split below.
            logger.warning("⚠️ LLM reply has no textual 'response' field.")
            continue
        logger.info("💬 LLM response (first 500 chars): %s", response[:500].replace("\n", "\\n"))

        if "```" in response:
            logger.warning("⚠️ Code block markers found in response — skipping.")
            continue

        parts = [blk.strip() + "\nendstream" for blk in response.split("endstream") if blk.strip()]
        if not parts:
            logger.warning("⚠️ No usable stream blocks found.")
            continue

        logger.info("✅ Parsed %d stream blocks", len(parts))
        return parts

    raise RuntimeError("LLM failed to return valid stream content after retries.")

# --- Write PDF with fpdf ---
def build_full_pdf_from_streams(streams: list[str], output_path: str, limit_lines=False, max_lines=1000):
    """Write a PDF that shows each stream as plain text, one page per stream.

    Every line of a stream is printed in its own cell, truncated to 100
    characters, using 12 pt Helvetica.

    Args:
        streams: Stream texts to render; each one starts a new page.
        output_path: Path of the PDF file to write.
        limit_lines: When true, print only the first ``max_lines`` lines of
            each stream.
        max_lines: Line cap applied when ``limit_lines`` is true.
    """
    pdf = FPDF()
    pdf.set_font("Helvetica", size=12)
    for stream in streams:
        pdf.add_page()
        lines = stream.splitlines()
        if limit_lines:
            lines = lines[:max_lines]
        for line in lines:
            # Helvetica is a built-in core font limited to Latin-1. Replace
            # anything outside that range with "?" so LLM output containing
            # other characters cannot abort PDF generation.
            printable = line[:100].encode("latin-1", errors="replace").decode("latin-1")
            pdf.cell(0, 10, printable, ln=True)
    pdf.output(output_path)
    logger.info("📄 PDF written to: %s", output_path)

# --- Collect example PDF pairs ---
def collect_example_pairs(input_dir: Path, max_pages=1, limit_lines=False, max_lines=1000):
    """Build (original, barrier-free) stream pairs from an example directory.

    A file named ``"<name> bf.pdf"`` is treated as the barrier-free version
    of ``"<name>.pdf"`` in the same directory. Both files are uncompressed
    into ``/tmp/pdf_examples`` and the streams of their first ``max_pages``
    pages are paired page by page.

    Args:
        input_dir: Directory that holds the example PDFs.
        max_pages: Number of leading pages taken from each document.
        limit_lines: Forwarded to the stream extraction.
        max_lines: Forwarded to the stream extraction.

    Returns:
        A list of ``(original_stream, barrier_free_stream)`` tuples.
    """
    bf_suffix = " bf.pdf"
    tmp_dir = Path("/tmp/pdf_examples")
    pairs = []
    # Sort the matches: glob order is unspecified, and the order of the
    # examples determines the prompt text.
    for bf_path in sorted(input_dir.glob("* bf.pdf")):
        bf_name = bf_path.name
        if not bf_name.endswith(bf_suffix):
            # Guards against matches that differ in case, which would
            # otherwise make the file its own counterpart.
            continue
        # Strip only the trailing marker; str.replace would also rewrite an
        # earlier occurrence of " bf.pdf" inside the name.
        orig_path = input_dir / (bf_name[: -len(bf_suffix)] + ".pdf")
        if not orig_path.exists():
            continue
        logger.info("📚 Pair: %s + %s", orig_path.name, bf_path.name)

        tmp_dir.mkdir(parents=True, exist_ok=True)
        orig_u = tmp_dir / f"{orig_path.stem}_u.pdf"
        bf_u = tmp_dir / f"{bf_path.stem}_u.pdf"
        uncompress_pdf(orig_path, orig_u)
        uncompress_pdf(bf_path, bf_u)

        orig_streams = extract_pdf_content_streams(str(orig_u), limit_lines=limit_lines, max_lines=max_lines)[:max_pages]
        bf_streams = extract_pdf_content_streams(str(bf_u), limit_lines=limit_lines, max_lines=max_lines)[:max_pages]
        # zip() stops at the shorter list, so only pages present in both
        # documents are paired.
        pairs.extend(zip(orig_streams, bf_streams))

    logger.info("🧾 Loaded %d example stream pairs", len(pairs))
    return pairs

# --- Process a single PDF with prompt and output ---
def process_pdf_for_accessibility(input_pdf_path, example_pairs=None, output_dir="/tmp", retries=2):
    """Run the full uncompress → prompt → LLM → PDF pipeline for one file.

    The following files are written into ``output_dir``:
    ``<stem>_uncompressed<suffix>``, ``<stem>.prompt.txt``,
    ``<stem>.response.txt`` and the result ``<stem> bf<suffix>``.

    Args:
        input_pdf_path: Path of the PDF to process.
        example_pairs: Example stream pairs for the prompt; ``None`` means
            no examples.
        output_dir: Directory for all generated files; created if missing.
        retries: Maximum number of LLM attempts.

    Returns:
        The path of the generated PDF as a string.
    """
    input_path = Path(input_pdf_path)
    out_dir = Path(output_dir)
    # The batch runner creates this directory, but a direct call must not
    # depend on that.
    out_dir.mkdir(parents=True, exist_ok=True)

    uncompressed_path = out_dir / f"{input_path.stem}_uncompressed{input_path.suffix}"
    uncompress_pdf(input_path, uncompressed_path)

    content_streams = extract_pdf_content_streams(str(uncompressed_path), limit_lines=LIMIT_LINES, max_lines=LINES_LIMIT)
    prompt = build_prompt_with_examples(example_pairs or [], content_streams, limit_lines=LIMIT_LINES, max_lines=LINES_LIMIT)

    # Keep the exact prompt next to the output for later inspection.
    prompt_file = out_dir / f"{input_path.stem}.prompt.txt"
    prompt_file.write_text(prompt, encoding="utf-8")

    improved_streams = call_llm_generate_streams(prompt, model=LLM_NAME, base_url=OLLAMA_BASE_URL, retries=retries)

    if LIMIT_LINES:
        response_text = "\n\n".join(["\n".join(s.splitlines()[:LINES_LIMIT]) for s in improved_streams])
    else:
        response_text = "\n\n".join(improved_streams)
    response_file = out_dir / f"{input_path.stem}.response.txt"
    response_file.write_text(response_text, encoding="utf-8")

    output_filename = f"{input_path.stem} bf{input_path.suffix}"
    output_path = out_dir / output_filename
    build_full_pdf_from_streams(improved_streams, str(output_path), limit_lines=LIMIT_LINES, max_lines=LINES_LIMIT)

    return str(output_path)

# --- Batch directory runner ---
def process_pdf_directory(input_dir, output_dir, example_dir, retries=2):
    """Process every PDF in a directory that has no barrier-free version yet.

    Files whose name ends with ``" bf.pdf"`` are skipped, as are files for
    which ``"<stem> bf.pdf"`` already exists in ``input_dir``. A failure on
    one file is reported and does not stop the remaining files.

    Args:
        input_dir: Directory containing the PDFs to process.
        output_dir: Directory for generated files; created if missing.
        example_dir: Directory with the example PDF pairs for the prompt.
        retries: Maximum number of LLM attempts per file.
    """
    os.makedirs(output_dir, exist_ok=True)
    input_dir = Path(input_dir)
    example_dir = Path(example_dir)

    example_pairs = collect_example_pairs(example_dir, limit_lines=LIMIT_LINES, max_lines=LINES_LIMIT)

    # sorted() fixes the processing order, which glob leaves unspecified.
    for file in sorted(input_dir.glob("*.pdf")):
        # Only a trailing " bf.pdf" marks a barrier-free result; a substring
        # test would also skip names that merely contain that text.
        if file.name.endswith(" bf.pdf"):
            continue
        if (input_dir / f"{file.stem} bf.pdf").exists():
            continue
        try:
            output_path = process_pdf_for_accessibility(
                str(file), example_pairs=example_pairs, output_dir=output_dir, retries=retries
            )
            print(f"✅ Processed: {file.name} → {output_path}")
        except Exception as e:  # keep the batch going when a single file fails
            print(f"❌ Failed: {file.name} — {e}")

# --- Entry point ---
# Directories are relative to the current working directory.
INPUT_DIR = "./ipynb-input"      # PDFs to be processed
OUTPUT_DIR = "./ipynb-output"    # receives prompts, responses and generated PDFs
EXAMPLE_DIR = "./ipynb-examples" # "<name>.pdf" / "<name> bf.pdf" example pairs

# Runs immediately at module top level (there is no __main__ guard); the
# retry count comes from the MAX_RETRIES_LLM_LOOP_DEFAULT environment setting.
process_pdf_directory(INPUT_DIR, OUTPUT_DIR, EXAMPLE_DIR, MAX_RETRIES_LLM_LOOP_DEFAULT)