In [7]:
# -----------------------------------------------------------------------------
# PDF AUDIT AGENT SCRIPT
# Purpose: Compares a local JSON data entry against the content of a PDF
# (acting as the ground truth) using the Gemini API's enforced JSON output.
# The script determines if the local data needs correction and allows the
# user to persist the changes.
# -----------------------------------------------------------------------------

# --- Standard Library Imports ---
import os
import json
import time
import fitz # PyMuPDF library for efficient PDF reading
import re
from typing import Dict, Any, Optional
# Dict, Any and Optional (imported above) are used for the type hints on the
# helper functions and on the response schema defined in this script.

# --- Third-Party Library Imports ---
from google import genai
from google.genai.errors import APIError

# -----------------------
# Configuration & Paths
# -----------------------
# --- API and Model Setup ---
# The API key is read from the GEMINI_API_KEY environment variable; when that
# variable is unset, the literal placeholder string below is used instead.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "ENTER YOUR API KEY")
# The client is constructed at module level, i.e. as soon as this file is run
# or imported, using whatever key value was resolved above.
client = genai.Client(api_key=GEMINI_API_KEY)
# Model name passed to the generate_content call made by the auditor.
MODEL = "gemini-2.5-flash"

# --- File Paths and Operational Settings ---
JSON_PATH = "scheme_details.json" # Path to the local data entry to be audited
PDF_PATH = "scheme_pdf.pdf" # Path to the PDF file (the ground truth)
MAX_RETRIES = 3 # Maximum number of attempts for the LLM API call before giving up

# --- RESPONSE SCHEMA DEFINITION ---
# This schema defines the structure the LLM MUST return (JSON Mode) for
# guaranteed validation. It includes the corrected data and a summary of changes.
# Field-by-field schema of the audited record. Every field is either a plain
# string or a list of strings; the insertion order here is also the order used
# for the "required" list below.
_ENTRY_PROPERTIES: Dict[str, Any] = {
    "headline": {"type": "string"},
    "summary": {"type": "string"},
    "important_points": {
        "type": "array",
        "items": {"type": "string"},
    },
    "eligibility_rules": {
        "type": "array",
        "items": {"type": "string"},
    },
    "category": {"type": "string"},
    "date": {"type": "string"},
    "source_url": {"type": "string"},
}

# Top-level response: the corrected record plus a free-text summary of what
# was changed. Both keys are required, and every record field is required too.
RESPONSE_SCHEMA: Dict[str, Any] = {
    "type": "object",
    "properties": {
        "corrected_entry": {
            "type": "object",
            "properties": _ENTRY_PROPERTIES,
            # All declared fields are mandatory, in declaration order.
            "required": list(_ENTRY_PROPERTIES),
        },
        "changes_summary": {"type": "string"},
    },
    "required": ["corrected_entry", "changes_summary"],
}
# ----------------------------------

# -----------------------
# Data I/O Utilities (Persistence Layer)
# -----------------------
def load_json(path: str) -> Optional[Dict[str, Any]]:
    """
    Load a single JSON entry (a JSON object) from `path`.

    The file may contain either one object or a list of objects; for a list,
    only the first element is returned.

    Args:
        path: Location of the JSON file to read (decoded as UTF-8).

    Returns:
        The entry as a dictionary, or None when the file is missing, cannot be
        read or decoded, is an empty list, or does not hold a JSON object.
        Every failure is reported with a printed message instead of raising.
    """
    try:
        with open(path, "r", encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found at {path}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {path}: {e}")
        return None
    except (OSError, UnicodeDecodeError) as e:
        # e.g. the path is a directory, permission is denied, or the bytes are
        # not valid UTF-8 -- report it the same way as the other failures.
        print(f"Error reading JSON file {path}: {e}")
        return None

    # Handles the case where the JSON file contains a list of objects.
    if isinstance(data, list):
        if not data:
            print(f"Error: no entries found in {path}")
            return None
        data = data[0]

    # Callers rely on receiving a dictionary; reject scalars, strings and
    # nested lists rather than handing them on.
    if not isinstance(data, dict):
        print(f"Error: expected a JSON object in {path}, found {type(data).__name__}")
        return None
    return data

def save_json(data: Dict[str, Any], path: str):
    """
    Write the corrected entry to the JSON file at `path`.

    A dictionary is wrapped in a one-element list before saving; any other
    value is written as given. Output is UTF-8, indented by two spaces, with
    non-ASCII characters kept as-is.

    Args:
        data: The entry (or already-wrapped list) to persist.
        path: Destination file; it is overwritten.

    Raises:
        TypeError / ValueError: if `data` cannot be serialized to JSON. The
            destination file is left untouched in that case.
        OSError: if the file cannot be opened or written.
    """
    # Saving as a list containing a single object for consistency
    content_to_save = [data] if isinstance(data, dict) else data

    # Serialize BEFORE opening the file: mode "w" truncates on open, so a
    # serialization error raised mid-write would otherwise wipe existing data.
    serialized = json.dumps(content_to_save, indent=2, ensure_ascii=False)

    with open(path, "w", encoding='utf-8') as f:
        f.write(serialized)

# -----------------------
# PDF Parsing Utilities (Ground Truth Layer)
# -----------------------
def prune_text(text: str) -> str:
    """
    Normalize whitespace in raw text extracted from the PDF.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is removed.
    """
    # split() with no arguments breaks on runs of whitespace and discards
    # empty leading/trailing pieces, so re-joining yields the cleaned text.
    words = text.split()
    return ' '.join(words)

def fetch_pdf_text(file_path: str) -> Optional[str]:
    """
    Read the text of every page of a local PDF with PyMuPDF (fitz) and clean it.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The whitespace-normalized text of all pages concatenated in page
        order, or None when the file does not exist or cannot be read. Errors
        are reported with a printed message instead of being raised.
    """
    print(f"   üìÑ Reading text from PDF: {file_path}")

    # Report a missing file directly instead of raising an exception only to
    # catch it a few lines later; the printed message is unchanged.
    if not os.path.exists(file_path):
        print(f"‚ùå Error reading PDF: PDF not found at: {file_path}")
        return None

    try:
        # The context manager closes the document even if a page fails to
        # extract, which the manual open()/close() pair did not guarantee.
        with fitz.open(file_path) as doc:
            full_text = "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: this is a best-effort reader, so any failure
        # while opening or parsing the PDF is reported and turned into None.
        print(f"‚ùå Error reading PDF: {e}")
        return None

    return prune_text(full_text)

# -----------------------
# LLM Auditor Agent (Correction Engine)
# -----------------------
# Prompt template sent to the model. It is filled in with str.format using two
# named fields: {local_data} (the JSON-serialized local entry) and
# {ground_truth} (the cleaned text extracted from the PDF). Because str.format
# is used, any literal braces added to this template must be doubled.
AUDIT_PROMPT = """
You are a Strict Data Auditor. Your task is to compare the 'Incorrect Local Entry' with the 'Official Ground Truth' extracted from a PDF. You must generate a corrected entry.

CRITICAL INSTRUCTIONS:
1. **ONLY CHANGE WHAT IS FACTUALLY WRONG.** If a value (like a headline or a point in a list) is correct according to the Official Ground Truth, you MUST leave it exactly as it is in the Incorrect Local Entry.
2. If you cannot find a certain detail (e.g., a specific date), **DO NOT CHANGE** the existing value in the Local Entry.
3. The final output must be a valid JSON object with the keys "corrected_entry" and "changes_summary".
4. The content of "corrected_entry" must be a perfect, item-for-item replication of the "Incorrect Local Entry", with **ONLY** the necessary factual corrections applied based on the "Official Ground Truth".

Incorrect Local Entry:
{local_data}

Official Ground Truth (Raw Text from PDF):
{ground_truth}
"""

def run_llm_audit(local_entry: Dict[str, Any], pdf_text: str) -> Optional[Dict[str, Any]]:
    """
    Send the local entry and the PDF text to the Gemini API for correction.

    The request asks for JSON output constrained by RESPONSE_SCHEMA. Any
    failure (API error, unparseable response, response without a
    "corrected_entry" object, or any other exception) is printed and retried,
    pausing one second between attempts, up to MAX_RETRIES attempts in total.

    Args:
        local_entry: The dictionary containing the data to be audited.
        pdf_text: The ground truth text extracted from the PDF.

    Returns:
        The parsed response dictionary, which contains at least a
        "corrected_entry" object, or None if every attempt failed.
    """
    # Cap the amount of PDF text placed in the prompt. Note that this limit is
    # a number of characters, not tokens.
    MAX_TRUNCATE_SIZE = 12000
    if len(pdf_text) > MAX_TRUNCATE_SIZE:
        # Make the truncation visible: the audit only sees part of the
        # ground truth in this case.
        print(f"   Note: PDF text truncated from {len(pdf_text)} to {MAX_TRUNCATE_SIZE} characters for the audit.")
        pdf_text = pdf_text[:MAX_TRUNCATE_SIZE] + " [TRUNCATED]"

    # Format the prompt with the local data and ground truth
    prompt = AUDIT_PROMPT.format(
        local_data=json.dumps(local_entry, indent=2, ensure_ascii=False),
        ground_truth=pdf_text
    )

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Call the Gemini API, requesting JSON output that follows the schema
            response = client.models.generate_content(
                model=MODEL,
                contents=prompt,
                config=genai.types.GenerateContentConfig(
                    temperature=0.0, # Low temperature for factual, deterministic changes
                    response_mime_type="application/json",
                    response_schema=RESPONSE_SCHEMA
                ),
            )
            parsed = json.loads(response.text)
        except (APIError, json.JSONDecodeError) as e:
            # Handle API or JSON structure errors
            print(f"‚ùå LLM Error (Attempt {attempt}): {e}")
        except Exception as e:
            # Handle other unexpected errors (e.g. a response with no text)
            print(f"General Error (Attempt {attempt}): {e}")
        else:
            # Do not trust the payload blindly: the caller needs a
            # "corrected_entry" object, so anything else counts as a failure.
            if isinstance(parsed, dict) and isinstance(parsed.get("corrected_entry"), dict):
                return parsed
            print(f"General Error (Attempt {attempt}): response is missing a 'corrected_entry' object")

        # Wait before retrying, but not after the final attempt.
        if attempt < MAX_RETRIES:
            time.sleep(1)

    return None

# -----------------------
# Main Audit Agent (Orchestrator)
# -----------------------
def run_agent(interactive: bool = True):
    """
    Orchestrate the audit: load the local entry, read the PDF, ask the LLM for
    a corrected entry, and save it when it differs from the local one.

    Args:
        interactive: If True, prompts the user before saving changes; if
            False, a detected change is saved without asking.

    Returns:
        None. Progress and errors are reported via printed messages; the
        function returns early when any stage fails.
    """
    # NOTE(review): the status-icon characters in the messages below look
    # mis-encoded (probably emoji damaged by an encoding round-trip). They are
    # left untouched here -- confirm the intended characters before changing.
    # NOTE(review): if JSON_PATH holds a list with several entries, only the
    # first one is audited, and saving appears to rewrite the file as a
    # one-element list -- confirm that single-entry files are the only use case.

    # 1. Load Data (using Data I/O utility)
    local_entry = load_json(JSON_PATH)
    if not local_entry:
        return # Exit if data cannot be loaded
    if not isinstance(local_entry, dict):
        print(f"Error: local entry loaded from {JSON_PATH} is not a JSON object.")
        return

    # 2. Load Ground Truth (using PDF parsing utility)
    pdf_text = fetch_pdf_text(PDF_PATH)
    if not pdf_text:
        print("üõë Critical: PDF text could not be loaded. Aborting audit.")
        return # Exit if PDF text is unavailable

    print("\nüîç Starting Audit and Correction...")

    # 3. Run LLM Audit (calling the LLM Auditor Agent)
    audit_result = run_llm_audit(local_entry, pdf_text)

    # Treat a missing or malformed "corrected_entry" like a failed call
    # instead of crashing with KeyError / AttributeError.
    corrected_fields = audit_result.get("corrected_entry") if isinstance(audit_result, dict) else None
    if not isinstance(corrected_fields, dict):
        print("   ‚ùå Failed to get a valid correction result from LLM. Aborting.")
        return # Exit if LLM call fails

    summary = audit_result.get("changes_summary", "No summary provided.")

    # The response schema only covers a fixed set of fields. Overlay the
    # corrected fields on the local entry so that any additional local keys
    # are neither reported as a change nor dropped when saving.
    corrected_entry = {**local_entry, **corrected_fields}

    # 4. Compare and Decide on Update
    # sort_keys=True makes the comparison insensitive to dictionary key order
    # only; a different order of items inside a list still counts as a change.
    if json.dumps(local_entry, sort_keys=True) != json.dumps(corrected_entry, sort_keys=True):
        print("\n   ‚ö° CHANGE DETECTED!")
        print(f"   Summary of changes: {summary}")

        if interactive:
            confirm = input("\n      Apply this correction? (y/n): ").lower().strip()
        else:
            confirm = "y"

        # Accept the spelled-out answer as well as the single letter.
        if confirm in ("y", "yes"):
            # Update the entry using the Data I/O utility
            save_json(corrected_entry, JSON_PATH)
            print(f"\n      ‚úÖ Saved corrected data to {JSON_PATH}.")
        else:
            print("      ‚úã Skipped. File not modified.")
    else:
        print("\n   ‚úÖ Local data is already correct. No changes detected.")

if __name__ == "__main__":
    # Warn when no real API key is configured. The check must match the
    # fallback actually used when the GEMINI_API_KEY environment variable is
    # unset ("ENTER YOUR API KEY"); the older placeholder text is still
    # recognized in case it was pasted into the environment or the code.
    placeholder_keys = ("ENTER YOUR API KEY", "YOUR_GEMINI_API_KEY_HERE")
    if not GEMINI_API_KEY.strip() or any(p in GEMINI_API_KEY for p in placeholder_keys):
        print("üö® WARNING: Please set the GEMINI_API_KEY environment variable or replace the placeholder in the code.")
    # As before, this is only a warning: the audit is still attempted.
    run_agent(interactive=True)

   üìÑ Reading text from PDF: scheme_pdf.pdf

üîç Starting Audit and Correction...

   ‚úÖ Local data is already correct. No changes detected.
