In [16]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [17]:
from crewai import Agent, Task, Crew, LLM
from crewai.tools import tool

In [18]:
@tool("read_file")
def read_file(file_path: str) -> str:
    """Reads and returns the contents of a text file at the given file path. Useful for reading OCR output files and other text documents."""
    # NOTE(review): crewai's @tool presumably surfaces the docstring above to
    # the agent as the tool description, so its wording is kept verbatim.
    #
    # Contract: returns the whole file decoded as UTF-8. Any failure (missing
    # file, permission problem, decode error, ...) is reported as an
    # "Error reading file: ..." string instead of being raised, so the caller
    # always gets text back.
    try:
        with open(f"{file_path}", mode='r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as exc:
        return f"Error reading file: {str(exc)}"
    return contents

# Agent that judges whether the OCR output of a document needs a correction
# pass. It works alone (delegation disabled) and uses gpt-4o at temperature 0.
analyzer = Agent(
    role="Quality Analyzer",
    goal="Evaluate the OCR quality of the pdf if correction is needed ",
    # The backstory is one long prompt string assembled from adjacent literals;
    # each literal ends with a space so the sentences stay separated.
    backstory=(
        "You are a Senior Quality Assurance Engineer with 10 years of experience in the field. "
        "You have a deep understanding of the OCR process and the quality of the OCR output. "
        "You are given a pdf file and you need to evaluate the OCR quality of the pdf. "
        "The text is about U.S. Executive Orders. "
        "The OCRs are inaccurate usually due to either the quality of the pdf or multiple columns and executive orders in the same file."
    ),
    llm=LLM(model="gpt-4o", temperature=0),
    allow_delegation=False,
    verbose=True,
)

# Task for the `analyzer` agent: read two OCR text versions of one document
# with the `read_file` tool and report whether correction is needed.
# NOTE(review): the `{file_path}` placeholders are presumably filled in from
# the inputs supplied when the crew is started -- confirm against the caller.
analyze = Task(
    agent=analyzer,
    tools=[read_file],
    description=(
        "Analyze the following OCR versions of the document: "
        "1. Read {file_path}.pdf/{file_path}_v2.txt using read_file tool "
        "2. Read {file_path}.pdf/{file_path}.txt using read_file tool "
        "3. Compare versions and determine if OCR correction is needed "
        "4. Only proceed with Correction if significant differences are found."
    ),
    # The indentation inside this literal is part of the prompt text.
    expected_output="""Return a JSON with the following structure:
    {
        "needs_correction": boolean,
        "confidence_score": float (0-1),
        "reason": string (brief explanation if correction needed, or "OCR quality acceptable" if not)
    }""",
)


In [None]:
import easyocr
import PyPDF2
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import numpy as np

@tool("initialize_ocr_reader")
def initialize_ocr_reader(languages=None):
    """Initialize EasyOCR reader with specified languages"""
    # NOTE(review): crewai's @tool presumably surfaces the docstring above to
    # the agent as the tool description, so its wording is kept verbatim.
    #
    # Parameters:
    #   languages -- list of EasyOCR language codes; None (the default) means
    #                English only, i.e. ['en'].
    # Returns:
    #   An easyocr.Reader on success, or an "Error initializing EasyOCR: ..."
    #   string when construction fails (callers must check the type).
    if languages is None:
        # A fresh list per call: a list literal used as the default value would
        # be created once and shared by every call of this function.
        languages = ['en']
    try:
        return easyocr.Reader(languages, gpu=True)  # Set gpu=False if no GPU available
    except Exception as e:
        return f"Error initializing EasyOCR: {str(e)}"

@tool("extract_text_from_pdf")
def extract_text_from_pdf(pdf_path: str, reader=None) -> dict:
    """Extracts text from PDF using multiple methods"""
    # NOTE(review): crewai's @tool presumably surfaces the docstring above to
    # the agent as the tool description, so its wording is kept verbatim.
    #
    # Strategy (first method that yields more than 100 non-blank characters
    # wins):
    #   1. PyPDF2   -- reported with a fixed confidence of 0.95
    #   2. pdfminer -- reported with a fixed confidence of 0.90
    #   3. EasyOCR on rendered page images -- confidence is the mean of the
    #      per-fragment scores (0 when nothing was recognised)
    #
    # Parameters:
    #   pdf_path -- path of the PDF to process.
    #   reader   -- optional, already constructed EasyOCR reader; one is
    #               created on demand when omitted.
    # Returns:
    #   dict with keys "text", "method" and "confidence". Any exception is
    #   reported as {"text": "Error extracting text: ...", "method": "error",
    #   "confidence": 0} rather than raised.
    try:
        # 1) Text extraction with PyPDF2.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Defensive `or ""`: a page that yields a falsy result contributes
            # only its newline instead of breaking the concatenation.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

        if len(text.strip()) > 100:
            return {
                "text": text,
                "method": "PyPDF2",
                "confidence": 0.95
            }

        # 2) Second opinion from pdfminer.
        text = extract_text(pdf_path)
        if len(text.strip()) > 100:
            return {
                "text": text,
                "method": "pdfminer",
                "confidence": 0.90
            }

        # 3) OCR fallback.
        if reader is None:
            # Build the reader directly. The module-level
            # `initialize_ocr_reader` is wrapped by @tool and reports failures
            # as a string, so its result is not guaranteed to offer
            # `readtext()`. A failure here is reported by the handler below.
            reader = easyocr.Reader(['en'], gpu=True)

        fragments = []
        scores = []
        for image in convert_from_path(pdf_path):
            # Each result is a (bounding box, text, confidence) triple.
            for _bbox, fragment, score in reader.readtext(np.array(image)):
                fragments.append(fragment)
                scores.append(score)

        # Every fragment is followed by a single space, as before.
        full_text = "".join(fragment + " " for fragment in fragments)
        # float() keeps the result a plain Python number in case the
        # individual scores are NumPy scalars.
        avg_confidence = float(sum(scores) / len(scores)) if scores else 0

        return {
            "text": full_text,
            "method": "EasyOCR",
            "confidence": avg_confidence
        }

    except Exception as e:
        return {
            "text": f"Error extracting text: {str(e)}",
            "method": "error",
            "confidence": 0
        }

@tool("evaluate_extraction_quality")
def evaluate_extraction_quality(extraction_result: dict) -> dict:
    """Evaluates the quality of extracted text"""
    # NOTE(review): crewai's @tool presumably surfaces the docstring above to
    # the agent as the tool description, so its wording is kept verbatim.
    #
    # Input: a dict with the keys "text", "method" and "confidence".
    # Output: a dict holding the combined quality score, the word count, the
    # method and confidence that were passed in, a `needs_verification` flag
    # (score below 0.7) and two supporting metrics. Any exception is reported
    # as {"error": "<message>"} instead of being raised.
    try:
        extracted = extraction_result["text"]
        source_method = extraction_result["method"]
        reported_confidence = extraction_result["confidence"]

        tokens = extracted.split()
        token_count = len(tokens)
        if token_count > 0:
            mean_token_length = sum(map(len, tokens)) / token_count
        else:
            mean_token_length = 0

        # Per-method weight; methods not listed here fall back to 0.5.
        method_weights = {
            "PyPDF2": 1.0,
            "pdfminer": 0.95,
            "EasyOCR": 0.90,
        }
        method_weight = method_weights.get(source_method, 0.5)

        # Saturates at 1.0 once the text reaches 1000 words.
        density = min(1.0, token_count / 1000)

        overall_score = method_weight * reported_confidence * density

        report = {
            "quality_score": overall_score,
            "word_count": token_count,
            "method_used": source_method,
            "confidence": reported_confidence,
            "needs_verification": overall_score < 0.7,
            "metrics": {
                "avg_word_length": mean_token_length,
                "content_density": density,
            },
        }
        return report
    except Exception as failure:
        return {"error": str(failure)}

# Agent responsible for extracting text from PDFs. Unlike `analyzer`, it is
# allowed to delegate, and it runs on gpt-4 at temperature 0.
ocr_processor = Agent(
    role="OCR Processor",
    goal="Extract high-quality text from PDFs using optimal methods",
    # One prompt string assembled from adjacent literals; the trailing spaces
    # keep the pieces separated.
    backstory=(
        "You are an expert in PDF text extraction and OCR processing. "
        "You specialize in using EasyOCR for complex documents while "
        "intelligently choosing the best extraction method for each PDF."
    ),
    llm=LLM(model="gpt-4", temperature=0),
    allow_delegation=True,
    verbose=True,
)

# Task for the `ocr_processor` agent: extract text from a PDF with the three
# OCR/extraction tools and report the result with quality metrics.
# NOTE(review): unlike `analyze`, this description contains no input
# placeholder such as `{file_path}` -- confirm how the PDF path reaches the
# agent.
process_document = Task(
    agent=ocr_processor,
    tools=[extract_text_from_pdf, evaluate_extraction_quality, initialize_ocr_reader],
    description=(
        "Process the PDF document through the following steps:\n"
        "1. Initialize OCR reader if needed\n"
        "2. Extract text using the most appropriate method\n"
        "3. Evaluate the quality of extracted text\n"
        "4. Return the best quality text with confidence metrics"
    ),
    # The indentation inside this literal is part of the prompt text.
    expected_output="""Return a JSON with the following structure:
    {
        "extracted_text": string,
        "method_used": string,
        "quality_score": float,
        "confidence": float,
        "processing_notes": string
    }""",
)

In [19]:
# pdf_files = [f.replace('.pdf', '') for f in os.listdir() if f.endswith('.pdf')]