# In [11]:
import os
import base64
from io import BytesIO
from pathlib import Path
from pdf2image import convert_from_path
from openai import AzureOpenAI

# === Azure OpenAI Client ===
# The API key is read from the environment only. A real key must never be
# committed as a fallback default: anyone with read access to the source could
# use it. Fail early with a clear message instead of sending a bogus key.
_api_key = os.getenv("AZURE_OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export it before running this script."
    )

client = AzureOpenAI(
    api_key=_api_key,
    api_version="2024-02-01",
    # The endpoint is not a secret, so the original default is kept.
    azure_endpoint=os.getenv(
        "AZURE_OPENAI_ENDPOINT", "https://synthetic-data-test.openai.azure.com"
    ),
)

# === Read the OCR instructions from the prompt file ===
PROMPT_FILE = "prompt.txt"
# Whole file as UTF-8 text, with leading/trailing whitespace removed.
prompt_text = Path(PROMPT_FILE).read_text(encoding="utf-8").strip()

# === OCR function that processes all pages at once ===
def _image_to_data_url(image) -> str:
    """Encode one PIL image as a base64 JPEG ``data:`` URL."""
    # JPEG cannot store alpha or palette data, and Pillow raises when asked to
    # save such modes (e.g. RGBA, LA, P). Normalise everything that is not
    # already plain RGB or greyscale.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return f"data:image/jpeg;base64,{encoded}"


def perform_ocr(images) -> str:
    """
    Send the prompt plus every page image to GPT-4o in a single request.

    Args:
        images: Iterable of PIL.Image objects (one per PDF page). Each image
            is JPEG-encoded and embedded in the request as a data URL.

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no text content.
    """
    # The message content starts with the instruction text, followed by one
    # image part per page, in page order.
    content = [{"type": "text", "text": prompt_text}]
    content.extend(
        {"type": "image_url", "image_url": {"url": _image_to_data_url(image)}}
        for image in images
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=4096,
        temperature=0.1,
        top_p=0.7,
    )

    # `content` may be None on the response message; keep the declared
    # `str` return type by falling back to an empty string.
    return response.choices[0].message.content or ""

# === Function to run OCR on a PDF ===
def run_ocr_on_pdf(pdf_path_str: str, *, max_pages: int = 20, dpi: int = 200) -> str:
    """
    Convert a PDF to page images, OCR all pages in one request, and return the text.

    Args:
        pdf_path_str: Path to the PDF file.
        max_pages: Maximum number of pages accepted; longer PDFs are skipped.
        dpi: Resolution used when rasterising the pages.

    Returns:
        The OCR text, or an empty string when the PDF is skipped (too many
        pages, or no pages could be rendered).

    Raises:
        FileNotFoundError: If ``pdf_path_str`` does not point to an existing file.
    """
    pdf_path = Path(pdf_path_str)
    # is_file() also rejects directories, which exists() would let through.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Local import: keeps this block self-contained without touching the
    # file-level import list (pdf2image is already a dependency of this file).
    from pdf2image import pdfinfo_from_path

    print(f"📄 Loading PDF: {pdf_path.name}")

    # Read the page count from the PDF metadata first, so an oversized
    # document is rejected before any page is rasterised.
    page_count = int(pdfinfo_from_path(str(pdf_path))["Pages"])
    if page_count > max_pages:
        print(f"⚠️ Skipping {pdf_path.name}: too many pages ({page_count}). Limit is {max_pages}.")
        return ""

    pages = convert_from_path(str(pdf_path), dpi=dpi)
    if not pages:
        # Nothing to OCR; avoid sending a prompt-only request to the model.
        print(f"⚠️ Skipping {pdf_path.name}: no pages could be rendered.")
        return ""

    print(f"🧠 Performing OCR on {len(pages)} pages together...")
    ocr_result = perform_ocr(pages)
    print("\n✅ OCR Output:\n")
    print(ocr_result)
    return ocr_result

# === Example Usage: Change path as needed ===
# Guarded so that importing this file does not trigger a PDF conversion and an
# API call; running it as a script or in a notebook cell still executes it.
if __name__ == "__main__":
    pdf_file_path = "start/onesource.pdf"
    ocr_output = run_ocr_on_pdf(pdf_file_path)


# Output: FileNotFoundError: PDF file not found: start/onesource.pdf