In [23]:
import os
import logging
from flask import Flask, request, jsonify
from flask_cors import CORS
from pdf2image import convert_from_bytes
import pytesseract
from typing import List, Dict
import numpy as np
from PIL import Image
import io
import traceback
from typing import Optional
from dotenv import load_dotenv
import json
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from datetime import datetime

In [24]:
# Load variables from a .env file (if any) into the process environment, then
# read the API key used for the Claude models; the value is None when the
# variable is not set.
load_dotenv()
CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")

In [25]:
# Location of the Tesseract binary. It can be overridden with the
# TESSERACT_CMD environment variable (or a .env entry, since load_dotenv()
# runs earlier) so the notebook is not tied to one machine; the fallback is
# the previous hard-coded Windows install path.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

# Log to both a per-day file and the notebook/console stream. The level is
# INFO, so logger.debug(...) calls elsewhere are not emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII characters in log messages do not depend
        # on the platform's default encoding.
        logging.FileHandler(
            f'ocr_server_{datetime.now().strftime("%Y%m%d")}.log',
            encoding='utf-8',
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [35]:
# dspy is not imported by the notebook's import cell, so import it here to keep
# this cell runnable on a fresh kernel (Restart & Run All).
import dspy

# Model identifiers in "<provider>/<model>" form; `sonnet` is the one used
# below, `haiku` is kept available as an alternative.
sonnet = "anthropic/claude-3-7-sonnet-20250219"
haiku = "anthropic/claude-3-5-haiku-20241022"

# Build the language-model client and register it as dspy's default LM.
lm = dspy.LM(sonnet, api_key=CLAUDE_API_KEY)
dspy.configure(lm=lm)

In [27]:
def process_pdf(pdf_bytes: bytes, min_confidence: float = 0.0) -> List[Dict]:
    """Run OCR over every page of a PDF and return text entries with bounding boxes.

    Each page is rasterised with ``convert_from_bytes`` and passed to
    ``pytesseract.image_to_data``. Every OCR entry with non-blank text and a
    confidence of at least ``min_confidence`` becomes one result dict.

    Args:
        pdf_bytes: Raw content of the PDF file.
        min_confidence: Entries whose OCR confidence is below this value are
            dropped. The default of 0 reproduces the original ``conf < 0``
            filter.

    Returns:
        A list of dicts shaped like
        ``{'text': str, 'bbox': {'x1', 'y1', 'x2', 'y2', 'page'}}``.
        Coordinates are divided by the page image's width/height, and
        ``page`` is 0-based.

    Raises:
        ValueError: If ``pdf_bytes`` is empty or ``None``.
        Exception: Any error raised during PDF conversion or OCR is logged
            (with traceback) and re-raised unchanged.
    """
    # Fail fast with a clear message; callers may hand over None when reading
    # the file failed.
    if not pdf_bytes:
        raise ValueError("pdf_bytes is empty or None; nothing to process")

    logger.info("Starting PDF processing with paragraph-based analysis and bounding boxes")

    results = []

    try:
        logger.debug("Converting PDF to images")
        images = convert_from_bytes(pdf_bytes)
        logger.info(f"Converted PDF to {len(images)} images")

        for i, img in enumerate(images, 1):
            logger.info(f"Processing image {i} of {len(images)}")

            # Page size in pixels, used to normalise the box coordinates.
            width, height = img.size

            logger.debug(f"Performing OCR with bounding boxes on image {i}")

            # pytesseract accepts PIL images directly, so the page is passed
            # as-is instead of being PNG-encoded and decoded first.
            ocr_data = pytesseract.image_to_data(
                img,
                output_type=pytesseract.Output.DICT,
                config=r'--psm 3'
            )

            for j, text in enumerate(ocr_data['text']):
                # Skip entries that carry no text.
                if not text.strip():
                    continue

                # Skip entries below the confidence threshold.
                conf = float(ocr_data['conf'][j])
                if conf < min_confidence:
                    continue

                left = ocr_data['left'][j]
                top = ocr_data['top'][j]

                results.append({
                    'text': text,
                    'bbox': {
                        'x1': left / width,
                        'y1': top / height,
                        'x2': (left + ocr_data['width'][j]) / width,
                        'y2': (top + ocr_data['height'][j]) / height,
                        'page': i - 1  # 0-based page numbering
                    }
                })

    except Exception as e:
        # logger.exception records the traceback through the configured
        # handlers (file + stream) instead of printing it separately.
        logger.exception(f"Error processing PDF: {str(e)}")
        raise

    return results

In [None]:
# DSPy signature for the task "guidelines text -> dict of {rule: rule text}".
# NOTE(review): DSPy presumably sends the class docstring below to the LM as the
# task instructions, so rewording it changes the prompt, not just the docs - confirm
# against the DSPy Signature documentation before editing.
class GuidelinesProcessor(dspy.Signature):
    """Given a guidelines text outlining rules for classifying and declassifying national security information, parse it and return a dictionary of rules stated by the document and their corresponding text. It should simply be a dict of {string: string} where the keys are all of the rules, and the values are the text of the rules. Stay as true to ground-truth document as possible, replicating the wording as much as possible."""
    
    # Input field: the guidelines text to extract rules from.
    guidelines: str = dspy.InputField(desc="Input text containing guidelines to be processed")
    # Output field: annotated as dict, while its desc calls it a "JSON-formatted string".
    # TODO(review): reconcile the annotation and the description.
    json_output: dict = dspy.OutputField(desc="JSON-formatted string representation of the guidelines, of format: {rule: text}")

In [37]:
def process_guidelines_to_json(guidelines_text: str, output_file: str = "guidelines.json") -> "dspy.Prediction":
    """
    Extract rules from guidelines text with the LM and write them to a JSON file.

    The text is run through a ``dspy.ChainOfThought`` module built on
    ``GuidelinesProcessor``; the resulting ``json_output`` is saved as
    ``{"guidelines": <json_output>}``.

    Args:
        guidelines_text (str): The guidelines text to process
        output_file (str): Path to the output JSON file (default: "guidelines.json")

    Returns:
        dspy.Prediction: The prediction object returned by the module (not a
        JSON string); its ``json_output`` attribute holds the data that was
        written to the file.

    Raises:
        ValueError: If ``guidelines_text`` is empty or whitespace only.
        TypeError: If ``json_output`` cannot be serialised to JSON; in that
            case ``output_file`` is not touched.
    """
    # Avoid a pointless LM call on blank input.
    if not guidelines_text or not guidelines_text.strip():
        raise ValueError("guidelines_text is empty; nothing to process")

    # Create an instance of the processor and run it on the text
    processor = dspy.ChainOfThought(GuidelinesProcessor)
    result = processor(guidelines=guidelines_text)

    # Serialise before opening the file: opening with 'w' truncates it, so a
    # serialisation error afterwards would destroy an existing output file.
    serialized = json.dumps({"guidelines": result.json_output}, indent=2)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(serialized)

    return result

In [30]:
def pdf_path_to_bytes(pdf_path):
    """
    Read the file at ``pdf_path`` and return its raw content.

    Failures are reported with ``print`` rather than raised.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        bytes: The file content, or None if the file is missing or any other
        error occurs while reading it
    """
    try:
        with open(pdf_path, 'rb') as handle:
            payload = handle.read()
    except Exception as err:
        # A missing file gets its own message; everything else is reported generically.
        if isinstance(err, FileNotFoundError):
            message = f"Error: The file at {pdf_path} was not found."
        else:
            message = f"Error: {err}"
        print(message)
        return None
    return payload

In [31]:
# Sample guidelines document used for this run (path relative to the notebook).
pdf_path = "../test_data/guideline_2.pdf"

test_pdf = pdf_path_to_bytes(pdf_path)

# pdf_path_to_bytes prints the cause and returns None on failure; stop here
# with a clear message instead of passing None on to the OCR step.
if test_pdf is None:
    raise RuntimeError(f"Could not read PDF at {pdf_path!r}")

# OCR every page; `results` is a list of {'text', 'bbox'} entries.
results = process_pdf(test_pdf)

2025-04-04 08:11:50,570 - __main__ - INFO - Starting PDF processing with paragraph-based analysis and bounding boxes
2025-04-04 08:11:54,743 - __main__ - INFO - Converted PDF to 27 images
2025-04-04 08:11:54,743 - __main__ - INFO - Processing image 1 of 27
2025-04-04 08:11:56,080 - __main__ - INFO - Processing image 2 of 27
2025-04-04 08:11:58,094 - __main__ - INFO - Processing image 3 of 27
2025-04-04 08:11:59,516 - __main__ - INFO - Processing image 4 of 27
2025-04-04 08:12:01,217 - __main__ - INFO - Processing image 5 of 27
2025-04-04 08:12:03,120 - __main__ - INFO - Processing image 6 of 27
2025-04-04 08:12:04,751 - __main__ - INFO - Processing image 7 of 27
2025-04-04 08:12:06,212 - __main__ - INFO - Processing image 8 of 27
2025-04-04 08:12:07,970 - __main__ - INFO - Processing image 9 of 27
2025-04-04 08:12:09,549 - __main__ - INFO - Processing image 10 of 27
2025-04-04 08:12:11,046 - __main__ - INFO - Processing image 11 of 27
2025-04-04 08:12:12,856 - __main__ - INFO - Process

In [32]:
# Concatenate the text of every OCR entry, in order, into one space-separated string.
final_string = " ".join(entry["text"] for entry in results)

In [38]:
# Run the rule extraction on the OCR text and write the result to the default
# output file; the returned value is shown as this cell's output.
process_guidelines_to_json(guidelines_text=final_string)

[92m08:16:53 - LiteLLM:INFO[0m: utils.py:2975 - 
LiteLLM completion() model= claude-3-7-sonnet-20250219; provider = anthropic
2025-04-04 08:16:53,378 - LiteLLM - INFO - 
LiteLLM completion() model= claude-3-7-sonnet-20250219; provider = anthropic
2025-04-04 08:17:10,755 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
[92m08:17:10 - LiteLLM:INFO[0m: utils.py:1143 - Wrapper: Completed Call, calling success_handler
2025-04-04 08:17:10,758 - LiteLLM - INFO - Wrapper: Completed Call, calling success_handler
[92m08:17:10 - LiteLLM:INFO[0m: cost_calculator.py:576 - selected model name for cost calculation: anthropic/claude-3-7-sonnet-20250219
2025-04-04 08:17:10,759 - LiteLLM - INFO - selected model name for cost calculation: anthropic/claude-3-7-sonnet-20250219


Prediction(
    reasoning="I need to parse this document to extract the rules for classifying and declassifying national security information according to Executive Order 13526. I'll go through the document systematically to identify distinct rules and their corresponding text.\n\nThe document is organized into several sections covering:\n1. General policy and purpose\n2. Classification authorities and procedures\n3. Marking requirements\n4. Declassification procedures\n5. Automatic declassification\n6. Review processes\n7. Management of classified information\n\nFor each rule I identify, I'll extract the exact text that describes the rule, maintaining the original wording as much as possible. I'll organize these as key-value pairs where the key is a concise description of the rule and the value is the full text of the rule from the document.\n\nI'll focus on extracting rules that provide specific guidance or requirements rather than general background information. I'll also ensure I c