In [2]:
!pip install pdfplumber pytesseract pdf2image Pillow

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
     -------------------------------------- 60.2/60.2 kB 793.5 kB/s eta 0:00:00
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting pypdfium2>=4.18.0
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl (3.0 MB)
     ---------------------------------------- 3.0/3.0 MB 3.4 MB/s eta 0:00:00
Collecting pdfminer.six==20250327
  Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
     ---------------------------------------- 5.6/5.6 MB 6.8 MB/s eta 0:00:00
Collecting cryptography>=36.0.0
  Downloading cryptography-45.0.3-cp37-abi3-win_amd64.whl (3.4 MB)
     ---------------------------------------- 3.4/3.4 MB 7.0 MB/s eta 0:00:00
Collecting cffi>=1.14
  Using cached cffi-1.17.1-cp310-cp310-win_amd64.whl (181 kB)
Collecting pycparser
  Using cached pycparser-2.22-py3-none-any.whl (117 kB)
Installing collected packages: pypdfium2, pycparser, pdf2image, cffi, cryptography


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import json
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

In [4]:
# --- PDF text extraction and OCR ---
def extract_text_and_tables(pdf_path):
    """
    Collect text, tables and (as a fallback) OCR output for every page of a PDF.

    Args:
        pdf_path: Location of the PDF, handed straight to pdfplumber.open().

    Returns:
        A list with one dict per page, in page order, holding:
            "page_number": 1-based page index,
            "text":        result of page.extract_text(), or "" when falsy,
            "tables":      tables from page.extract_tables(), each reduced to
                           the rows that contain at least one non-None cell;
                           tables with no such rows are left out,
            "ocr_text":    stripped pytesseract output, produced only when the
                           page yielded neither text nor tables; otherwise "".
    """
    pages_out = []

    with pdfplumber.open(pdf_path) as document:
        for page_no, pdf_page in enumerate(document.pages, start=1):
            plain_text = pdf_page.extract_text() or ""

            # Drop rows made up solely of None cells, then drop tables that
            # end up with no rows at all.
            kept_tables = []
            for raw_table in pdf_page.extract_tables():
                useful_rows = [
                    row for row in raw_table
                    if any(cell is not None for cell in row)
                ]
                if useful_rows:
                    kept_tables.append(useful_rows)

            # OCR is the last resort: only rendered when the page produced
            # neither extractable text nor any table.
            recognized = ""
            if not (plain_text.strip() or kept_tables):
                rendered = pdf_page.to_image(resolution=300).original
                recognized = pytesseract.image_to_string(rendered).strip()

            pages_out.append({
                "page_number": page_no,
                "text": plain_text,
                "tables": kept_tables,
                "ocr_text": recognized,
            })

    return pages_out

In [5]:
# --- Convert JSON data into human-readable text ---
def json_to_llm_text(data: object, indent: int = 0) -> str:
    """
    Render a JSON-like value as indented, human-readable text for an LLM prompt.

    Args:
        data: A dict, a list, or any other value. Dicts and lists are walked
            recursively; anything else is rendered with its str() form.
        indent: Current nesting depth; every level adds two spaces.

    Returns:
        The rendered lines joined with newlines.
        - Dict entries become "Key: value", or "Key:" followed by the nested
          block one level deeper when the value is a dict or list.
        - List elements become "- Item N:" (N is 1-based) followed by the
          element one level deeper.

    Notes:
        - Keys are passed through str.capitalize(), so the first character is
          upper-cased and the remaining characters are lower-cased.
        - An empty dict or list renders as an empty string, so a nested empty
          container leaves a blank line under its heading.
    """
    output_lines = []
    indent_str = '  ' * indent

    if isinstance(data, dict):
        for key, value in data.items():
            # Keys of in-memory dicts are not necessarily strings (ints, None,
            # ...); convert first so .capitalize() cannot raise AttributeError.
            label = str(key).capitalize()
            if isinstance(value, (dict, list)):
                output_lines.append(f"{indent_str}{label}:")
                output_lines.append(json_to_llm_text(value, indent + 1))
            else:
                output_lines.append(f"{indent_str}{label}: {value}")
    elif isinstance(data, list):
        for idx, item in enumerate(data, start=1):
            output_lines.append(f"{indent_str}- Item {idx}:")
            output_lines.append(json_to_llm_text(item, indent + 1))
    else:
        output_lines.append(f"{indent_str}{data}")

    return "\n".join(output_lines)


In [6]:
# --- File type detection and processing ---
def process_file(file_path):
    """
    Load a .txt, .json or .pdf file and return its content as prompt-ready text.

    Args:
        file_path: Path to the input file, as a str or os.PathLike object.
            The format is chosen from the file extension, case-insensitively.

    Returns:
        - .txt:  the file's text, read as UTF-8, unchanged.
        - .json: the parsed document rendered by json_to_llm_text().
        - .pdf:  the per-page data from extract_text_and_tables(), rendered by
                 json_to_llm_text().

    Raises:
        ValueError: If the extension is not .txt, .json or .pdf (this includes
            files that have no extension at all).
    """
    # splitext only inspects the final path component, so dots in directory
    # names are ignored and a bare name such as "txt" is not mistaken for an
    # extension. os.fspath lets pathlib.Path objects through as well.
    extension = os.path.splitext(os.fspath(file_path))[1].lower()

    if extension == ".txt":
        # Read and return plain text from a .txt file
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    if extension == ".json":
        # Read and parse JSON data from a .json file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return json_to_llm_text(data)

    if extension == ".pdf":
        # Process the PDF file (extract text, tables, and OCR)
        extracted_data = extract_text_and_tables(file_path)
        return json_to_llm_text(extracted_data)

    raise ValueError("Unsupported file format. Please provide a .txt, .json, or .pdf file.")


In [13]:
# --- Main function to handle user input ---
def main(file_path=None):
    """
    Process one file and print the extracted, prompt-ready text.

    Args:
        file_path: Path of the .txt, .json or .pdf file to process. When
            omitted, the notebook's bundled sample JSON path is used.

    Returns:
        None. The result, or a message describing why nothing was extracted,
        is printed to stdout. A ValueError raised by process_file() is caught
        and reported as "Error: ..."; other exceptions propagate.
    """
    if file_path is None:
        # Sample input used when the cell is run without an explicit path.
        file_path = r"E:\langflow_directory\gitRepos\AgenticAI\sampleFiles\json\sample2.json"

    # isfile rather than exists: a directory at this path must be rejected
    # here instead of failing later inside open().
    if not os.path.isfile(file_path):
        print("File does not exist. Please check the file path and try again.")
        return

    try:
        result = process_file(file_path)
        print("\nExtracted Information:")
        print(result)
    except ValueError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()



Extracted Information:
- Item 1:
  Invoice_number: INV12345
  Amount: 2000.0
  Date: 2025-05-15
  Vendor: TechSolutions Inc.
  Items:
    - Item 1:
      Description: Software License
      Quantity: 1
      Price: 2000.0
  Intent: Invoice
- Item 2:
  Invoice_number: INV67890
  Amount: 12000.0
  Date: 2025-06-01
  Vendor: Office Supplies Co.
  Items:
    - Item 1:
      Description: Office Chairs
      Quantity: 50
      Price: 200.0
    - Item 2:
      Description: Desks
      Quantity: 20
      Price: 400.0
  Intent: Invoice
