# In [None]:
# transcribe_pdf.py
# Usage: from transcribe_pdf import transcribe_pdf_to_text
# Requirements (pip): PyPDF2, pdf2image, pytesseract, pillow
# Note: pdf2image requires poppler; pytesseract requires Tesseract OCR installed.

from typing import Optional, Iterable
import os

def _try_import(name: str):
    """Import and return the module called *name*.

    Unlike a bare ``__import__`` call, a dotted name such as ``"os.path"``
    yields the submodule itself rather than its top-level package.

    Args:
      name: absolute module name, optionally dotted.

    Returns:
      The imported module object.

    Raises:
      ImportError: if the import fails for any reason; the original
        exception is attached as ``__cause__``.
    """
    import importlib  # function-scope import keeps this helper self-contained

    try:
        return importlib.import_module(name)
    except Exception as e:
        # Any failure raised while importing (not only a missing package) is
        # reported uniformly as ImportError, with the root cause chained.
        raise ImportError(f"Missing dependency '{name}': {e}") from e

# Resolved once at module import time: if PyPDF2 cannot be imported,
# _try_import raises ImportError and this module fails to load.
PyPDF2 = _try_import("PyPDF2")

def transcribe_pdf_to_text(
    pdf_path: str,
    ocr: bool = False,
    pages: Optional[Iterable[int]] = None,
    dpi: int = 300,
    write_to: Optional[str] = None,
) -> str:
    """
    Extract text from a PDF file.

    Direct text extraction (PyPDF2) is always tried first. OCR
    (pdf2image + pytesseract) runs only when ``ocr`` is true AND direct
    extraction produced no non-whitespace text for the selected pages.

    Args:
      pdf_path: path to the PDF file
      ocr: allow the OCR fallback when no text is found; it does not force
        OCR on a PDF whose text can be extracted directly
      pages: iterable of 0-based page indices to process (default: all);
        indices outside ``0 <= p < page count`` are silently ignored
      dpi: resolution for OCR page images
      write_to: optional path; when given, the resulting text is written
        there as UTF-8

    Returns:
      The extracted text as a single string, page texts separated by a
      blank line.

    Raises:
      FileNotFoundError: if ``pdf_path`` is not an existing file.
      RuntimeError: if the PDF is encrypted and the empty password does not
        unlock it.
      ImportError: if OCR is needed but pdf2image or pytesseract cannot be
        imported.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(pdf_path)

    reader = PyPDF2.PdfReader(pdf_path)
    if getattr(reader, "is_encrypted", False):
        locked_msg = "PDF is encrypted and cannot be decrypted automatically."
        try:
            unlocked = reader.decrypt("")  # try empty password
        except Exception as exc:
            raise RuntimeError(locked_msg) from exc
        # decrypt() reports a wrong password through a falsy return value
        # (0 / PasswordType.NOT_DECRYPTED) rather than by raising, so the
        # result has to be checked explicitly.
        if not unlocked:
            raise RuntimeError(locked_msg)

    # Normalize pages selection; the list form also lets a one-shot iterator
    # be walked twice (direct extraction, then OCR).
    total = len(reader.pages)
    if pages is None:
        page_indices = range(total)
    else:
        page_indices = [p for p in pages if 0 <= p < total]

    # 1) Try direct text extraction
    text_parts = []
    for i in page_indices:
        try:
            txt = reader.pages[i].extract_text() or ""
        except Exception:
            # Best effort: a page that cannot be parsed counts as having no
            # text instead of aborting the whole document.
            txt = ""
        text_parts.append(txt)

    full_text = "\n\n".join(part for part in text_parts if part.strip())

    # 2) If no text found and OCR allowed, do OCR. An empty page selection
    #    has nothing to rasterize, so it skips OCR entirely.
    if ocr and page_indices and not full_text.strip():
        pdf2image = _try_import("pdf2image")
        pytesseract = _try_import("pytesseract")

        # Rasterize only the span that covers the requested pages instead of
        # the whole document; first_page/last_page are 1-based and inclusive.
        first, last = min(page_indices), max(page_indices)
        images = pdf2image.convert_from_path(
            pdf_path, dpi=dpi, first_page=first + 1, last_page=last + 1
        )
        # images[0] corresponds to page `first`, hence the offset.
        full_text = "\n\n".join(
            pytesseract.image_to_string(images[i - first]) for i in page_indices
        )

    if write_to:
        with open(write_to, "w", encoding="utf-8") as f:
            f.write(full_text)

    return full_text


# Command-line entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    from argparse import ArgumentParser

    cli = ArgumentParser(description="Transcribe PDF to text (with optional OCR).")
    cli.add_argument("pdf", help="path to PDF")
    cli.add_argument("--ocr", help="use OCR fallback", action="store_true")
    cli.add_argument("--out", help="write extracted text to file")
    cli.add_argument("--dpi", help="DPI for OCR images", type=int, default=300)
    opts = cli.parse_args()

    extracted = transcribe_pdf_to_text(
        opts.pdf,
        ocr=opts.ocr,
        dpi=opts.dpi,
        write_to=opts.out,
    )
    # Echo at most the first 1000 characters so a large document does not
    # flood the terminal.
    preview = extracted[:1000]
    print(preview)