In [1]:
import os
import base64
from io import BytesIO
from pathlib import Path
from pdf2image import convert_from_path
from openai import AzureOpenAI

# === Azure OpenAI Client ===
# SECURITY: the API key is read from the environment only. A real key must never
# be committed as a fallback default; the key that used to be hard-coded here
# should be treated as leaked and rotated.
# A missing AZURE_OPENAI_API_KEY raises KeyError immediately, which is preferable
# to silently authenticating with a shared credential.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-01",
    # The endpoint is not a secret, so a default is acceptable here.
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://synthetic-data-test.openai.azure.com"),
)

# === Load prompt from file ===
# Path of the text file holding the instruction prompt sent with every request.
PROMPT_FILE = "prompt_start.txt"
# Read the whole file as UTF-8 and drop leading/trailing whitespace.
prompt_text = Path(PROMPT_FILE).read_text(encoding="utf-8").strip()

# === OCR Function ===
def perform_ocr(images) -> str:
    """Send the prompt and all page images to the chat model in one request.

    Args:
        images: Iterable of image objects exposing ``save(fp, format=...)``
            (presumably the PIL images produced by pdf2image; verify
            against the caller).

    Returns:
        ``message.content`` of the first choice in the completion response.
    """

    def _to_data_url(page) -> str:
        # Serialise the page to JPEG in memory and wrap it as a base64 data URL.
        buffer = BytesIO()
        page.save(buffer, format="JPEG")
        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{encoded}"

    # The text prompt comes first, followed by one image part per page, in order.
    parts = [{"type": "text", "text": prompt_text}]
    parts.extend(
        {"type": "image_url", "image_url": {"url": _to_data_url(page)}}
        for page in images
    )

    user_message = {"role": "user", "content": parts}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        max_tokens=4096,
        temperature=0.1,
        top_p=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# === OCR Runner for a Single PDF ===
def run_ocr_on_pdf(pdf_path: Path, max_pages: int = 20) -> None:
    """Run OCR on one PDF and print the result.

    The page count is read from the PDF metadata *before* any page is
    rasterised, so a document over the limit is skipped without paying the
    time and memory cost of rendering every page at 200 dpi.

    Args:
        pdf_path: Path of the PDF file to process.
        max_pages: Maximum number of pages accepted; longer documents are
            skipped with a warning. Defaults to 20.

    Returns:
        None. The OCR text, the skip warning, or the error message is printed.
        Any exception raised while processing is caught and reported, so one
        bad file does not abort a batch run.
    """
    # Local import: pdf2image is already a dependency of this file, and this
    # keeps the block usable without touching the top-of-file imports.
    from pdf2image import pdfinfo_from_path

    print(f"📄 Processing: {pdf_path.name}")
    try:
        # Cheap metadata lookup; avoids rendering pages only to discard them.
        page_count = int(pdfinfo_from_path(str(pdf_path))["Pages"])
        if page_count > max_pages:
            print(f"⚠️ Skipping {pdf_path.name}: too many pages ({page_count}). Limit is {max_pages}.")
            return
        pages = convert_from_path(str(pdf_path), dpi=200)
        ocr_result = perform_ocr(pages)
        print(f"\n✅ OCR for {pdf_path.name}:\n")
        print(ocr_result)
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        print(f"❌ Error processing {pdf_path.name}: {e}")

# === Main Loop: All PDFs in Folder ===
INPUT_FOLDER = Path("start")  # Change this to your folder
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
# makes the processing order (and therefore the printed output) reproducible.
for pdf_file in sorted(INPUT_FOLDER.glob("*.pdf")):
    run_ocr_on_pdf(pdf_file)


📄 Processing: 5.pdf

✅ OCR for 5.pdf:

Here is the extracted layout-aware OCR data from the provided images, mapped to the possible fields:

---

### **General Information**
- **invoice_id**: 2054227  
- **invoice_date**: 17 December 2024  
- **due_date**: 30 January 2025  
- **currency**: USD  
- **total_amount_due**: $9,829.52  
- **discount_amount**: $818.92  
- **total_discount_amount**: $7,098.96  
- **gross_amount**: $17,747.40  
- **net_amount**: $9,829.52  
- **tax_amount**: $0.00  
- **tax_rate**: 0%  

---

### **Bill To**
- **bill_to_name**: Performant Financial Corporation  
- **bill_to_address**: 4309 Hacienda Drive, Suite 110, Pleasanton CA 94588, United States  

---

### **Ship To**
- **ship_to_name**: Performant Financial Corporation  
- **ship_to_address**: 4309 Hacienda Drive, Suite 110, Pleasanton CA 94588, United States  

---

### **Line Items**
1. **line_item/description**: NetSuite SuiteSuccess Financials First Standard Cloud Service  
   - **line_item/coverage_