In [34]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into os.environ. With override=True,
# values from the .env file replace variables already set in the environment.
load_dotenv(override=True)

True

In [35]:
import openai

# Validate the key that is actually configured for this session: the
# OPENAI_API_KEY environment variable (loaded from .env by load_dotenv above).
# A hard-coded placeholder would always be rejected and would tell us nothing
# about the real key, and real keys must never be pasted into the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key

if not api_key:
    # Nothing to validate; say so explicitly instead of sending a request
    # that cannot succeed.
    print("OPENAI_API_KEY is not set; add it to your .env file or environment")
else:
    try:
        # Listing models is a cheap authenticated call.
        response = openai.models.list()
        print("API key is valid!")
        print("Available models:", [model.id for model in response])
    except openai.AuthenticationError:
        print("Invalid API key")
    except Exception as e:
        # Network problems, rate limits, etc. are reported but not raised.
        print(f"An error occurred: {str(e)}")

Invalid API key


In [36]:
from crewai import Agent, Task, Crew, LLM
from crewai.tools import tool

import os
# Turn off OpenTelemetry-based telemetry for this kernel.
# OTEL_SDK_DISABLED is the switch defined by the OpenTelemetry SDK
# configuration spec (and the one CrewAI documents for opting out).
# The original OTEL_PYTHON_DISABLED assignment is kept so that anything
# relying on it keeps working.
os.environ["OTEL_PYTHON_DISABLED"] = "true"
os.environ["OTEL_SDK_DISABLED"] = "true"

In [37]:
@tool("read_file")
def read_file(file_path: str) -> str:
    """Reads and returns the contents of a text file at the given file path."""
    # The docstring above is left untouched on purpose: the @tool decorator
    # uses it as the tool description presented to the agent.
    #
    # What the tool actually does: it maps the path of a PDF to the OCR
    # transcript stored next to it (".../name.pdf" -> ".../name.txt") and
    # returns that transcript. Any failure (missing file, decoding error, ...)
    # is returned as an "Error reading file: ..." string instead of raised,
    # so the calling agent always receives text.
    try:
        dir_name, base_name = os.path.split(file_path)

        # Strip only a trailing ".pdf" extension (any letter case).
        # str.replace('.pdf', '') would also delete ".pdf" occurring in the
        # middle of a name (e.g. "a.pdf.scan.pdf" -> "a.scan").
        stem, extension = os.path.splitext(base_name)
        if extension.lower() == '.pdf':
            base_name = stem

        # The txt file is in the same directory as the PDF
        txt_path = os.path.join(dir_name, f"{base_name}.txt")

        with open(txt_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        return f"Error reading file: {str(e)}"

# Agent that decides whether an existing OCR transcript needs correction.
# role/goal/backstory are prompt text passed to the Agent; the backstory is
# built from adjacent string literals, so the trailing space inside each
# literal is what separates the sentences.
# This agent runs on gpt-4o at temperature 0 and has delegation disabled.
analyzer = Agent(
    role="Quality Analyzer",
    goal="Evaluate the OCR quality of the pdf if correction is needed ",
    backstory="You are a Senior Quality Assurance Engineer with 10 years of experience in the field. "
              "You have a deep understanding of the OCR process and the quality of the OCR output. "
              "You are given a pdf file and you need to evaluate the OCR quality of the pdf. "
              "The text is about U.S. Executive Orders. "
              "The OCRs are inaccurate usually due to either the quality of the pdf or multiple columns and executive orders in the same file.",
    allow_delegation=False,
    llm=LLM(model="gpt-4o", temperature=0),
    verbose=True
)

# Task for the analyzer agent: check for an existing OCR transcript and rate it.
# {file_path} is a template placeholder; the kickoff inputs supply the full PDF path.
# The read_file tool receives that PDF path and opens the .txt file of the same
# name in the same directory.
# expected_output describes the JSON verdict the agent is asked to return.
analyze = Task(
    description=
    "Analyze the document processing needs: "
    "1. Check if OCR file exists at {file_path} using read_file tool "
    "2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed "
    "3. If file exists, evaluate OCR quality "
    "4. Only proceed with Correction if quality issues are found.",
    expected_output="""Return a JSON with the following structure:
    {
        "filename": string,
        "needs_correction": boolean,
        "confidence_score": float (0-1),
        "reason": string (brief explanation if correction needed, "OCR files missing" if no OCR files found, or "OCR quality acceptable" if not needed)
    }""",
    tools=[read_file],
    agent=analyzer
)

In [38]:
import easyocr
import PyPDF2
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import numpy as np

@tool("initialize_ocr_reader")
def initialize_ocr_reader(languages=None):
    """Initialize EasyOCR reader with specified languages"""
    # languages: iterable of EasyOCR language codes, a single code given as a
    #   string, or None for the default (English only).
    # Returns the easyocr.Reader on success, or an
    #   "Error initializing EasyOCR: ..." string on failure (nothing is raised).
    try:
        if languages is None:
            # A None sentinel replaces the former mutable list default.
            languages = ['en']
        elif isinstance(languages, str):
            # A bare "en" would otherwise be iterated character by character.
            languages = [languages]
        else:
            languages = list(languages)
        return easyocr.Reader(languages, gpu=True)  # Set gpu=False if no GPU available
    except Exception as e:
        return f"Error initializing EasyOCR: {str(e)}"

@tool("extract_text_from_pdf")
def extract_text_from_pdf(pdf_path: str, reader=None) -> dict:
    """Extracts text from PDF using multiple methods"""
    # Tries three extractors in order and returns the first usable result:
    #   1. PyPDF2   - embedded text layer        (fixed confidence 0.95)
    #   2. pdfminer - embedded text layer        (fixed confidence 0.90)
    #   3. EasyOCR  - OCR of the rendered pages  (mean per-fragment confidence)
    # A text-layer result counts as usable when it has more than 100
    # non-whitespace-trimmed characters.
    #
    # pdf_path: path of the PDF to read.
    # reader:   optional, already constructed EasyOCR reader; one is created
    #           on demand when the OCR fallback is reached without it.
    # Returns a dict with keys "text", "method" and "confidence". Every
    # failure is reported as method "error" with confidence 0; nothing is raised.
    try:
        # The PDF file should already be at the correct path
        if not os.path.exists(pdf_path):
            return {
                "text": f"Error: File not found at {pdf_path}",
                "method": "error",
                "confidence": 0
            }

        # 1) PyPDF2. A parser failure here must not stop the cascade, so it is
        #    treated the same as "no text found".
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() can return None for pages without a text layer.
                pypdf_text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception:
            pypdf_text = ""

        if len(pypdf_text.strip()) > 100:
            return {
                "text": pypdf_text,
                "method": "PyPDF2",
                "confidence": 0.95
            }

        # 2) pdfminer, with the same "failure means try the next method" rule.
        try:
            miner_text = extract_text(pdf_path) or ""
        except Exception:
            miner_text = ""

        if len(miner_text.strip()) > 100:
            return {
                "text": miner_text,
                "method": "pdfminer",
                "confidence": 0.90
            }

        # 3) OCR fallback. The reader is built directly here: the module-level
        #    name initialize_ocr_reader is bound to whatever @tool returns, not
        #    to the plain function, and on failure it yields an error string
        #    rather than a reader. gpu=True matches that tool's setting.
        if reader is None:
            reader = easyocr.Reader(['en'], gpu=True)

        images = convert_from_path(pdf_path)
        fragments = []
        total_confidence = 0.0

        for image in images:
            image_np = np.array(image)
            for _bbox, fragment, confidence in reader.readtext(image_np):
                fragments.append(fragment)
                # float() keeps the result a plain Python number.
                total_confidence += float(confidence)

        # Every fragment is followed by a single space, as before.
        full_text = "".join(fragment + " " for fragment in fragments)
        avg_confidence = total_confidence / len(fragments) if fragments else 0

        return {
            "text": full_text,
            "method": "EasyOCR",
            "confidence": avg_confidence
        }

    except Exception as e:
        return {
            "text": f"Error extracting text: {str(e)}",
            "method": "error",
            "confidence": 0
        }

@tool("evaluate_extraction_quality")
def evaluate_extraction_quality(extraction_result: dict) -> dict:
    """Evaluates the quality of extracted text"""
    # extraction_result: dict with "text", "method" and "confidence" keys.
    # Returns a dict of quality metrics, or {"error": <message>} when the
    # input cannot be processed.

    # Weight applied per extraction method; anything else is weighted 0.5.
    reliability_by_method = {
        "PyPDF2": 1.0,
        "pdfminer": 0.95,
        "EasyOCR": 0.90,
    }

    try:
        extracted = extraction_result["text"]
        method_name = extraction_result["method"]
        reported_confidence = extraction_result["confidence"]

        tokens = extracted.split()
        token_count = len(tokens)
        if token_count > 0:
            mean_token_length = sum(len(token) for token in tokens) / token_count
        else:
            mean_token_length = 0

        # Score = method weight x reported confidence x content density,
        # where density saturates at 1.0 once 1000 words are present.
        method_weight = reliability_by_method.get(method_name, 0.5)
        density = min(1.0, token_count / 1000)
        score = method_weight * reported_confidence * density

        report = {
            "quality_score": score,
            "word_count": token_count,
            "method_used": method_name,
            "confidence": reported_confidence,
            "needs_verification": score < 0.7,
            "metrics": {
                "avg_word_length": mean_token_length,
                "content_density": density
            }
        }
        return report
    except Exception as e:
        return {"error": str(e)}

# Agent responsible for extracting text from the PDF.
# Unlike the analyzer it is configured with gpt-4 and allow_delegation=True.
processor = Agent(
    role="OCR Processor",
    goal="Extract high-quality text from PDFs using optimal methods",
    backstory="You are an expert in PDF text extraction and OCR processing. "
              "You specialize in using EasyOCR for complex documents while "
              "intelligently choosing the best extraction method for each PDF.",
    allow_delegation=True,
    llm=LLM(model="gpt-4", temperature=0),
    verbose=True,
)

# Task for the processor agent: run the extraction tools and report the result.
# {filename} is a template placeholder filled from the kickoff inputs; it is
# used here to spell out an example of the nested <name>.pdf/<name>.pdf path.
# NOTE(review): the text refers to "the complete file_path provided in the
# inputs" but contains no {file_path} placeholder - confirm the agent
# receives the path some other way.
process_document = Task(
    description=
    "Process the PDF document through the following steps:\n"
    "1. Initialize OCR reader if needed\n"
    "2. Extract text using the most appropriate method\n"
    "3. Evaluate the quality of extracted text\n"
    "4. Return the best quality text with confidence metrics\n"
    "IMPORTANT: Always use the complete file_path provided in the inputs. Do not use just the filename.\n"
    "e.g. /Volumes/One Touch/OCR/PDFs/{filename}.pdf/{filename}.pdf",
    expected_output="""Return a JSON with the following structure:
    {
        "extracted_text": string,
        "filename": string,
        "method_used": string,
        "quality_score": float,
        "confidence": float,
        "processing_notes": string
    }""",
    tools=[extract_text_from_pdf, evaluate_extraction_quality, initialize_ocr_reader],
    agent=processor
)

In [39]:
# Agent shared by the three executive-order tasks defined below
# (eo_parser, eo_extractor, eo_writer).
# The backstory describes the filename convention with the example
# '1789-20-1.pdf' -> year 1789, order number 1.
eo_identifier = Agent(
    role="Executive Order Identifier", 
    goal="Accurately identify and extract specific executive orders from documents containing multiple orders",
    backstory="You are a specialist in document analysis, focusing on executive orders. "
              "You excel at identifying and extracting specific executive orders from documents "
              "that may contain multiple orders in various formats and layouts. "
              "You can parse filenames like '1789-20-1.pdf' to extract the year (1789) and order number (1) "  # Updated example
              "and then accurately isolate just that specific order's complete text, filtering out "
              "other orders or irrelevant content from the same document.",
    allow_delegation=False,
    llm=LLM(model="gpt-4", temperature=0),
    verbose=True,
)

# Task: derive the year and the order number from the PDF's file name.
# {file_path} is a template placeholder filled from the kickoff inputs, so
# the agent is told which file this run is about.
# The worked example uses a fixed file name ('1789-20-1.pdf', the same one the
# agent's backstory uses). Interpolating the current file name into the
# example while always stating "year=1789, order=1" would give a wrong
# example for every other file.
eo_parser = Task(
    description="Extract executive order number and year from filename\n"  # Added year
    "IMPORTANT: Always use the complete file_path provided in the inputs. Do not use just the filename.\n"
    "The file to process is: {file_path}\n"
    "Example: From '/Volumes/One Touch/OCR/PDFs/1789-20-1.pdf/1789-20-1.pdf', extract year=1789, order=1",
    expected_output="""Return a JSON with the following structure:
    {
        "year": string,           # Added year field
        "order_number": string,
        "confidence": float,
        "notes": string
    }""",
    agent=eo_identifier
)

# Task: isolate the text of the one executive order the file is named after.
# No tools are attached to this task.
# NOTE(review): the description has no template placeholders - the example
# path is hard-coded to 1789-20-1 - so the task text is identical for every
# file. Confirm that this is intended.
eo_extractor = Task(
            description="Locate and extract specific executive order text\n"
            "IMPORTANT: Always use the complete file_path provided in the inputs. Do not use just the filename.\n"
            "e.g. /Volumes/One Touch/OCR/PDFs/1789-20-1.pdf/1789-20-1.pdf",
            expected_output="""Return a JSON with the following structure:
            {
                "executive_order_text": string,
                "filename": string,
                "order_number": string, 
                "confidence": float,
                "is_complete": boolean,
                "notes": string
            }""",
    agent=eo_identifier
)

@tool("write_file")
def write_file(file_path: str, content: str) -> str:  # Added return type
    """Writes the content to a file at the given file path.
    
    Args:
        file_path (str): The complete path where the file should be written
        content (str): The content to write to the file
        
    Returns:
        str: Success message or error details
    """
    try:
        # Open in text write mode ('w'): the file is created if missing and
        # any previous content is replaced. The mode argument was previously a
        # stray backslash, which is a syntax error.
        with open(file_path, 'w', encoding='utf-8') as f:  # Added encoding
            f.write(content)
        return f"Successfully wrote file to {file_path}"
    except Exception as e:
        # Report the failure as text so the calling agent can relay it.
        return f"Error writing file: {str(e)}"

# Task: save the extracted order next to the source PDF as <filename>_llm.txt
# using the write_file tool.
# {filename} is a template placeholder filled from the kickoff inputs.
# NOTE(review): {path} and {error_message} in expected_output also look like
# template placeholders, but the kickoff inputs only provide file_path and
# filename - confirm how the installed CrewAI version treats unknown
# placeholders.
eo_writer = Task(
    description="""Write the extracted executive order text to a file in the same directory as the input PDF.
    
    IMPORTANT: 
    1. Use the same directory as the input PDF file
    2. The output filename should be {filename}_llm.txt where filename is the base name of the pdf file
    3. Use the complete file path, not just the filename
    4. Example: if input PDF is '/Volumes/One Touch/OCR/PDFs/{filename}.pdf/{filename}.pdf', 
       the output should be '/Volumes/One Touch/OCR/PDFs/{filename}.pdf/{filename}_llm.txt'""",
    expected_output="""Return a string indicating success or failure:
        "Successfully wrote file to {path}" or
        "Error writing file: {error_message}" """,  # Better output specification
    tools=[write_file],
    agent=eo_identifier
)

In [40]:
# Root folder holding one "<name>.pdf" entry per document, plus a "processed"
# folder that is created on demand.
pdf_dir = '/Volumes/One Touch/OCR/PDFs'
output_dir = os.path.join(pdf_dir, 'processed')
# exist_ok=True makes this safe to re-run and avoids a check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# First 20 documents, in a stable order.
# - Hidden entries (e.g. macOS "._name.pdf" companion files) are dropped
#   BEFORE slicing; filtering them afterwards lets them use up slots, so
#   fewer than 20 real documents get processed.
# - os.listdir returns entries in arbitrary order, so the list is sorted to
#   make the selected batch reproducible.
pdf_files = sorted(
    entry
    for entry in os.listdir(pdf_dir)
    if entry.endswith('.pdf') and not entry.startswith('.')
)[:20]

In [41]:
# Run the full agent pipeline once per entry in pdf_files.
# Problems with a single document are printed and the loop moves on to the
# next one; nothing is raised out of this cell.
for pdf_file in pdf_files:
    # Skip hidden entries (names starting with a dot).
    if pdf_file.startswith('.'):
        continue
        
    print(f"\nProcessing {pdf_file}...")
    
    # Construct the paths correctly
    # Layout used here: <pdf_dir>/<name>.pdf/ is a directory that contains
    # the actual file <name>.pdf.
    base_name = os.path.splitext(pdf_file)[0]
    pdf_dir_path = os.path.join(pdf_dir, f"{base_name}.pdf")  # Directory path
    pdf_path = os.path.join(pdf_dir_path, f"{base_name}.pdf")  # Full PDF path
    
    # Verify paths before processing
    if not os.path.exists(pdf_dir_path):
        print(f"Error: Directory not found at {pdf_dir_path}")
        continue
    if not os.path.exists(pdf_path):
        print(f"Error: File not found at {pdf_path}")
        continue
    
    # A new Crew is assembled for every document from the agents and tasks
    # defined in the earlier cells; memory=True turns on the crew's memory.
    crew = Crew(
        agents=[analyzer, processor, eo_identifier],
        tasks=[analyze, process_document, eo_parser, eo_extractor, eo_writer],
        verbose=True,
        memory=True
    )
    
    try:
        # The input keys match the {file_path} and {filename} placeholders
        # used in the task texts. The returned result is not used further in
        # this cell.
        result = crew.kickoff(
            inputs={
                "file_path": pdf_path,
                "filename": base_name
            }
        )
        print(f"Completed processing {pdf_file}")
    except Exception as e:
        # Report the failure and keep going with the remaining documents.
        print(f"Error processing {pdf_file}: {str(e)}")
        continue




Processing 1789-20-1.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-1.pdf/1789-20-1.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-1.pdf/1789-20-1.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-1.pdf/1789-20-1.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-20-1.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-20-2.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-2.pdf/1789-20-2.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-20-2.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-20-3.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-3.pdf/1789-20-3.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-20-3.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-20-4.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-20-4.pdf/1789-20-4.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-20-4.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-21-1.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-21-1.pdf/1789-21-1.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-21-1.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-21-2.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-21-2.pdf/1789-21-2.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-21-2.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-21-3.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-21-3.pdf/1789-21-3.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-21-3.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-21-4.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-21-4.pdf/1789-21-4.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-21-4.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-21-5.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-21-5.pdf/1789-21-5.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-21-5.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Processing 1789-29-1.pdf...


ERROR:root:Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
ERROR:root:LiteLLM call failed: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


[1m[95m# Agent:[00m [1m[92mQuality Analyzer[00m
[95m## Task:[00m [92mAnalyze the document processing needs: 1. Check if OCR file exists at /Volumes/One Touch/OCR/PDFs/1789-29-1.pdf/1789-29-1.pdf using read_file tool 2. If OCR file doesn't exist, return JSON indicating immediate OCR processing needed 3. If file exists, evaluate OCR quality 4. Only proceed with Correction if quality issues are found.[00m


LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error processing 1789-29-1.pdf: litellm.AuthenticationError: AuthenticationError: OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: aVwvkKj2********************Fmw6. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


In [42]:
# # Cell 1: Imports
# from crewai import Agent, Task, Crew, LLM
# from crewai.tools import tool
# from langchain_community.embeddings import OpenAIEmbeddings
# from langchain_community.vectorstores import Chroma
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.document_loaders import TextLoader
# import glob
# import os
# import time  

# print("Imports completed successfully")

# # Global variable
# _VECTORSTORE = None

# # Cell 2: Agent and tools
# print("\nDefining agent and tools...")

# rag_researcher = Agent(
#     role="Research Assistant",
#     goal="Search and analyze executive orders to provide accurate information",
#     backstory="""You are an expert researcher specializing in executive orders. 
#               You have access to a comprehensive database of processed executive orders 
#               and can provide detailed, accurate answers with source references.
#               You MUST use the exact directory_path provided in the inputs.""",
#     allow_delegation=False,
#     llm=LLM(model="gpt-4", temperature=0),
#     verbose=True
# )

# @tool("load_llm_files")
# def load_llm_files(directory_path: str) -> list:
#     """Load all _llm.txt files from the directory structure"""
#     # Use fixed path directly
#     fixed_path = "/Volumes/One Touch/OCR/PDFs/"
    
#     print(f"\nStarting to load files from fixed path: {fixed_path}")
#     print(f"Directory exists: {os.path.exists(fixed_path)}")
#     start_time = time.time()
    
#     try:
#         llm_files = glob.glob(os.path.join(fixed_path, "**/*_llm.txt"), recursive=True)
#         print(f"Found {len(llm_files)} _llm.txt files")
        
#         documents = []
#         for i, file_path in enumerate(llm_files, 1):
#             try:
#                 print(f"Processing file {i}/{len(llm_files)}: {os.path.basename(file_path)}")
#                 with open(file_path, 'r', encoding='utf-8') as f:
#                     content = f.read()
#                     base_name = os.path.basename(file_path).replace('_llm.txt', '')
#                     documents.append({
#                         "content": content,
#                         "metadata": {
#                             "source": base_name,
#                             "file_path": file_path
#                         }
#                     })
#                 print(f"Successfully loaded: {base_name}")
#             except Exception as e:
#                 print(f"Error loading {file_path}: {e}")
#                 continue
        
#         elapsed_time = time.time() - start_time
#         print(f"\nLoading completed in {elapsed_time:.2f} seconds")
#         print(f"Total documents loaded: {len(documents)}")
#         return documents
        
#     except Exception as e:
#         print(f"Error in load_llm_files: {str(e)}")
#         return []  # Return empty list instead of error string for consistency

# @tool("create_vectorstore")
# def create_vectorstore(documents: list) -> str:
#     """Create a vector store from loaded documents"""
#     print("\nStarting vector store creation...")
#     start_time = time.time()
    
#     try:
#         global _VECTORSTORE
#         print("Initializing text splitter...")
        
#         text_splitter = RecursiveCharacterTextSplitter(
#             chunk_size=1000,
#             chunk_overlap=200
#         )
        
#         print("Processing documents...")
#         processed_docs = []
#         for i, doc in enumerate(documents, 1):
#             print(f"Splitting document {i}/{len(documents)}: {doc['metadata']['source']}")
#             chunks = text_splitter.create_documents(
#                 texts=[doc["content"]],
#                 metadatas=[doc["metadata"]]  # one metadata dict per entry in texts (a single text here)
#             )
#             processed_docs.extend(chunks)
        
#         print(f"\nCreating embeddings for {len(processed_docs)} chunks...")
#         _VECTORSTORE = Chroma.from_documents(
#             documents=processed_docs,
#             embedding=OpenAIEmbeddings(),
#             persist_directory="./chroma_db"
#         )
        
#         elapsed_time = time.time() - start_time
#         result = f"Successfully created vector store with {len(processed_docs)} chunks in {elapsed_time:.2f} seconds"
#         print(result)
#         return result
        
#     except Exception as e:
#         print(f"Error in create_vectorstore: {str(e)}")
#         return f"Error creating vectorstore: {str(e)}"

# @tool("search_documents")
# def search_documents(query: str) -> dict:
#     """Search the vector store for relevant documents"""
#     print(f"\nStarting search for query: {query}")
#     start_time = time.time()
    
#     try:
#         global _VECTORSTORE
#         if _VECTORSTORE is None:
#             print("Vector store not initialized")
#             return {
#                 "error": "Vector store not initialized",
#                 "documents": [],
#                 "sources": []
#             }
        
#         print("Performing similarity search...")
#         docs = _VECTORSTORE.similarity_search(query, k=3)
        
#         result = {
#             "documents": [doc.page_content for doc in docs],
#             "sources": [doc.metadata["source"] for doc in docs],
#             "file_paths": [doc.metadata["file_path"] for doc in docs]
#         }
        
#         elapsed_time = time.time() - start_time
#         print(f"Search completed in {elapsed_time:.2f} seconds")
#         print(f"Found {len(docs)} relevant documents")
#         return result
        
#     except Exception as e:
#         print(f"Error in search_documents: {str(e)}")
#         return {
#             "error": f"Error searching documents: {str(e)}",
#             "documents": [],
#             "sources": []
#         }

# # Cell 3: Create Tasks
# print("\nCreating tasks...")

# load_files_task = Task(
#     description="""Load all _llm.txt files from the directory.
#     You MUST use the exact directory_path from the inputs parameter.
#     Steps:
#     1. Get the directory_path from the inputs parameter
#     2. Call load_llm_files with that exact directory_path
#     3. Return the loaded documents
    
#     DO NOT use placeholder paths like '/path/to/directory'.
#     Use ONLY the directory_path from the inputs.""",
#     expected_output="List of documents with their content and metadata",
#     tools=[load_llm_files],
#     agent=rag_researcher,
# )

# create_db_task = Task(
#     description="""Create vector database from loaded documents.
#     1. Use the create_vectorstore tool
#     2. Process the documents into chunks
#     3. Create embeddings and store in vector database""",
#     expected_output="Success message with number of processed chunks",
#     tools=[create_vectorstore],
#     agent=rag_researcher
# )

# search_task = Task(
#     description="""Search the vector database to answer questions about executive orders.
#     1. Use the search_documents tool
#     2. Analyze the retrieved documents
#     3. Provide a detailed answer with source references""",
#     expected_output="""JSON with:
#     - answer: detailed response
#     - sources: list of source documents
#     - confidence: score between 0-1
#     - notes: any additional context""",
#     tools=[search_documents],
#     agent=rag_researcher
# )

# # Cell 4: Create Crew
# print("\nCreating research crew...")
# research_crew = Crew(
#     agents=[rag_researcher],
#     tasks=[load_files_task, create_db_task, search_task],
#     verbose=True
# )

# # Cell 5: Function to handle queries
# def query_executive_orders(query: str, directory_path: str):
#     """Query the executive orders database"""
#     print(f"\n{'='*50}")
#     print(f"Processing query: {query}")
#     print(f"Using directory: {directory_path}")
#     print(f"{'='*50}")
    
#     try:
#         if not os.path.exists(directory_path):
#             print(f"Error: Directory does not exist: {directory_path}")
#             return f"Error: Directory not found: {directory_path}"
            
#         crew = Crew(
#             agents=[rag_researcher],
#             tasks=[load_files_task, create_db_task, search_task],
#             verbose=True,
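#             process_inputs={"directory_path": directory_path}  # NOTE(review): does not appear to be a documented Crew argument; inputs are already passed via kickoff(inputs=...) below — confirm and remove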
#         )
        
#         result = crew.kickoff(
#             inputs={
#                 "query": query,
#                 "directory_path": directory_path
#             }
#         )
#         return result
#     except Exception as e:
#         print(f"Error: {str(e)}")
#         return f"Error processing query: {str(e)}"

# # Cell 6: Test the system
# print("\nTesting the RAG system...")
# test_query = "What executive orders were issued in 1789?"
# response = query_executive_orders(test_query, pdf_dir)
# print("\nTest Response:", response)

# # Cell 7: Interactive querying (optional; loops on input() until 'quit' is entered)
# while True:
#     query = input("\nEnter your question (or 'quit' to exit): ")
#     if query.lower() == 'quit':
#         break
#     response = query_executive_orders(query, pdf_dir)
#     print("\nResponse:", response)