In [19]:
import os
import logging
from flask import Flask, request, jsonify
from flask_cors import CORS
from pdf2image import convert_from_bytes
import pytesseract
import openai
from typing import List, Dict
import numpy as np
from PIL import Image
import io
import traceback
from datetime import datetime
from supabase import create_client
from typing import Optional
from dotenv import load_dotenv
import docx  # For DOCX files
import mimetypes  # For file type detection
import json
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw

In [20]:
# Load environment variables
load_dotenv()

# pytesseract configuration
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'ocr_server_{datetime.now().strftime("%Y%m%d")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [21]:
def process_pdf(pdf_bytes: bytes) -> List[Dict]:
    """Process PDF and return structured results with bounding boxes"""
    logger.info("Starting PDF processing with paragraph-based analysis and bounding boxes")
    
    results = []  # Initialize results list
    
    try:
        # Convert PDF to images
        logger.debug("Converting PDF to images")
        images = convert_from_bytes(pdf_bytes)
        logger.info(f"Converted PDF to {len(images)} images")
        
        for i, img in enumerate(images, 1):
            logger.info(f"Processing image {i} of {len(images)}")
            
            # Get image dimensions
            width, height = img.size
            
            # Convert PIL image to bytes for OCR
            img_byte_arr = io.BytesIO()
            img.save(img_byte_arr, format='PNG')
            img_byte_arr = img_byte_arr.getvalue()
            
            # Perform OCR with bounding box information
            logger.debug(f"Performing OCR with bounding boxes on image {i}")
            
            # Get detailed data including bounding boxes
            ocr_data = pytesseract.image_to_data(
                Image.open(io.BytesIO(img_byte_arr)), 
                output_type=pytesseract.Output.DICT,
                config=r'--psm 3'
            )
            
            # Process OCR data
            n_boxes = len(ocr_data['text'])
            for j in range(n_boxes):
                # Skip empty text
                if not ocr_data['text'][j].strip():
                    continue
                
                # Get confidence score
                conf = float(ocr_data['conf'][j])
                if conf < 0:  # Skip low confidence results
                    continue
                
                # Get coordinates and normalize them
                x1 = ocr_data['left'][j] / width
                y1 = ocr_data['top'][j] / height
                x2 = (ocr_data['left'][j] + ocr_data['width'][j]) / width
                y2 = (ocr_data['top'][j] + ocr_data['height'][j]) / height
                
                # Create result entry
                result = {
                    'text': ocr_data['text'][j],
                    'bbox': {
                        'x1': x1,
                        'y1': y1,
                        'x2': x2,
                        'y2': y2,
                        'page': i - 1  # 0-based page numbering
                    }
                }
                results.append(result)
                
    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        traceback.print_exc()
        raise
    
    return results

In [22]:
def visualize_document_with_boxes(pdf_bytes: bytes, results: List[Dict]):
    """
    Draw bounding boxes on the PDF pages and display them.
    
    Args:
        pdf_bytes: The original PDF file in bytes
        results: List of dictionaries containing bounding box information
    """
    
    # Convert PDF to images
    images = convert_from_bytes(pdf_bytes)
    
    # Group results by page
    results_by_page = {}
    for item in results:
        page_num = item['bbox']['page']
        if page_num not in results_by_page:
            results_by_page[page_num] = []
        results_by_page[page_num].append(item)
    
    # Process each page
    for page_num, img in enumerate(images):
        if page_num not in results_by_page:
            continue
            
        # Create a drawing object
        draw = ImageDraw.Draw(img)
        width, height = img.size
        
        # Draw boxes for each result on this page
        for result in results_by_page[page_num]:
            bbox = result['bbox']
            
            # Convert normalized coordinates to pixel coordinates
            x1 = int(bbox['x1'] * width)
            y1 = int(bbox['y1'] * height)
            x2 = int(bbox['x2'] * width)
            y2 = int(bbox['y2'] * height)
            
            # Draw rectangle with semi-transparent red color
            draw.rectangle([x1, y1, x2, y2], outline='red', width=2)
        
        # Display the image
        plt.figure(figsize=(15, 20))
        plt.imshow(img)
        plt.axis('off')
        plt.title(f'Page {page_num}')
        plt.show()

In [23]:
def pdf_path_to_bytes(pdf_path):
    """
    Convert a PDF file at the specified path to bytes.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        bytes: The content of the PDF file as bytes
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_bytes = file.read()
        return pdf_bytes
    except FileNotFoundError:
        print(f"Error: The file at {pdf_path} was not found.")
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

In [24]:
pdf_path = "../test_data/test.pdf"

In [25]:

test_pdf = pdf_path_to_bytes(pdf_path)

In [None]:
results = process_pdf(test_pdf)

2025-04-03 10:46:49,924 - __main__ - INFO - Starting PDF processing with paragraph-based analysis and bounding boxes
2025-04-03 10:46:50,345 - __main__ - INFO - Converted PDF to 3 images
2025-04-03 10:46:50,346 - __main__ - INFO - Processing image 1 of 3


In [None]:
results

[{'text': 'Declassified',
  'bbox': {'x1': 0.13117647058823528,
   'y1': 0.022272727272727274,
   'x2': 0.22647058823529412,
   'y2': 0.03272727272727273,
   'page': 0}},
 {'text': 'and',
  'bbox': {'x1': 0.23352941176470587,
   'y1': 0.022272727272727274,
   'x2': 0.26235294117647057,
   'y2': 0.03272727272727273,
   'page': 0}},
 {'text': 'Approved',
  'bbox': {'x1': 0.26823529411764707,
   'y1': 0.022272727272727274,
   'x2': 0.3452941176470588,
   'y2': 0.035454545454545454,
   'page': 0}},
 {'text': 'For',
  'bbox': {'x1': 0.35294117647058826,
   'y1': 0.022272727272727274,
   'x2': 0.3788235294117647,
   'y2': 0.03272727272727273,
   'page': 0}},
 {'text': 'Release',
  'bbox': {'x1': 0.3847058823529412,
   'y1': 0.022272727272727274,
   'x2': 0.44882352941176473,
   'y2': 0.03272727272727273,
   'page': 0}},
 {'text': '2012/05/23',
  'bbox': {'x1': 0.45529411764705885,
   'y1': 0.022272727272727274,
   'x2': 0.5452941176470588,
   'y2': 0.03272727272727273,
   'page': 0}},
 {'tex

In [None]:
visualize_document_with_boxes(test_pdf, results)

KeyError: 'page'