# 🏢 Invoice Processing with LangGraph + LangChain
## Automated Invoice Classification using LangChain Document Processing

### 🎯 Use Case: Accounts Payable Automation
Automatically classify incoming invoices to streamline payment processing:
- ✅ **Valid**: Complete invoice ready for payment
- ❌ **Invalid**: Rejected invoice requiring vendor correction

### 🛠️ Technology Stack
- **LangGraph**: Workflow orchestration
- **LangChain**: Document loading and text splitting
- **OpenAI**: Embeddings and GPT-4 classification
- **Pinecone**: Vector storage for RAG

### 📋 Business Rules
**Valid Invoice Requirements:**
- All invoices must include vendor name, invoice number, date, and amount
- Missing any required field = Invalid classification

### 🔄 Workflow: PDF -> LLM -> Markdown -> Vector DB

In [None]:
!pip install langchain-openai PyMuPDF

In [None]:
import os
import base64
from pathlib import Path
import getpass
import fitz  # PyMuPDF
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Configuration: input/output folders and the OpenAI credential
DATA_DIR = "data"
OUTPUT_DIR = "markdown_output"

# Prefer the environment variable; prompt interactively only when it is
# unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass.getpass("OpenAI API Key: ")

# Make sure the folder that receives the generated markdown exists
markdown_folder = Path(OUTPUT_DIR)
markdown_folder.mkdir(exist_ok=True)

print("‚úÖ Setup complete!")

In [None]:
# Chat model shared by the conversion helpers; temperature 0 requests the
# least random output the model offers.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, api_key=OPENAI_API_KEY)

def pdf_to_base64(pdf_path: str) -> str:
    """Render the first page of a PDF as a base64-encoded PNG.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The PNG rendering of page 0, base64-encoded and decoded to ``str``.

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or page 0
        cannot be loaded/rendered; the document is closed before the
        exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(0)
        # Matrix(2, 2) scales the render by 2 on each axis for a sharper image.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img_data = pix.tobytes("png")
    finally:
        # Always release the file handle, even if loading or rendering fails.
        doc.close()
    return base64.b64encode(img_data).decode()

def pdf_to_markdown(pdf_path: str) -> str:
    """Ask the LLM to transcribe a PDF into markdown.

    The PDF is turned into an image by ``pdf_to_base64`` (which renders the
    first page only) and sent to ``llm`` together with a text instruction.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        The model's reply with leading/trailing whitespace stripped.
    """
    encoded_page = pdf_to_base64(pdf_path)

    # Text part: what the model should do with the attached image.
    instruction_part = {
        "type": "text",
        "text": "Convert this document to clean markdown format. Preserve structure, headings, and content. Return only the markdown.",
    }
    # Image part: the rendered page embedded inline as a data URL.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_page}"},
    }

    request = HumanMessage(content=[instruction_part, image_part])
    reply = llm.invoke([request])
    return reply.content.strip()

# Status message printed once the helper definitions above have executed
print("‚úÖ Functions ready!")

In [None]:
# Process all PDFs
# Convert every *.pdf in DATA_DIR to markdown and write one .md file per PDF
# into OUTPUT_DIR. A failure on one file is reported and does not stop the
# remaining files from being processed.
data_dir = Path(DATA_DIR)
# Sorted so the processing order is stable between runs (glob order is arbitrary).
pdf_files = sorted(data_dir.glob("*.pdf"))
# The output folder is the same for every file, so build it once.
output_root = Path(OUTPUT_DIR)
# Names of PDFs whose conversion raised, reported in the final summary.
failed_files = []

print(f"üìÅ Found {len(pdf_files)} PDF files")

for pdf_file in pdf_files:
    print(f"\nüìÑ Processing: {pdf_file.name}")

    try:
        # Convert to markdown
        markdown_content = pdf_to_markdown(str(pdf_file))

        # Save as .md file named after the PDF
        output_file = output_root / f"{pdf_file.stem}.md"
        output_file.write_text(markdown_content, encoding='utf-8')

        print(f"‚úÖ Saved: {output_file}")
        print(f"üìä Size: {len(markdown_content)} characters")

    except Exception as e:
        # Deliberately broad: this is a best-effort batch, so any failure is
        # recorded and the loop moves on to the next file.
        failed_files.append(pdf_file.name)
        print(f"‚ùå Error: {e}")

# Surface failures explicitly so the completion message is not mistaken for
# "every file converted".
if failed_files:
    print(f"\nFailed to convert {len(failed_files)} of {len(pdf_files)} files: {', '.join(failed_files)}")

print(f"\n‚úÖ Complete! Check {OUTPUT_DIR}/ for markdown files")

In [None]:
# Load markdown files to vector database
!pip install langchain-pinecone pinecone

from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.schema import Document
from pinecone import Pinecone, ServerlessSpec

# Initialize components
# Sets up the embedding model, the Pinecone client, the index (created on
# first use) and the LangChain vector store wrapper around that index.
import time

embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model="text-embedding-3-small")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass.getpass("Pinecone API Key: ")
INDEX_NAME = "pdf-markdown-vectors"

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if needed
existing = [idx["name"] for idx in pc.list_indexes()]
if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        # Must match the vector length produced by the embedding model above
        # (1536 is the default output size of text-embedding-3-small).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    # Index creation is asynchronous: poll until Pinecone reports the index
    # ready instead of sleeping a fixed time, which can be too short (later
    # calls fail) or needlessly long. Give up after two minutes.
    ready_deadline = time.monotonic() + 120
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        if time.monotonic() >= ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{INDEX_NAME}' was not ready after 120 seconds"
            )
        time.sleep(1)

index = pc.Index(INDEX_NAME)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Load markdown files
# Reads every *.md file in OUTPUT_DIR, wraps each one in a Document carrying
# its source file names as metadata, stores them in the vector store and runs
# a small similarity search as a smoke test.
import uuid

output_dir = Path(OUTPUT_DIR)
# Sorted so documents are loaded in a stable order between runs.
md_files = sorted(output_dir.glob("*.md"))

documents = []
# One ID per document, parallel to `documents`.
document_ids = []
for md_file in md_files:
    content = md_file.read_text(encoding='utf-8')

    # A blank file has nothing to embed; skip it rather than send empty text
    # to the embedding model.
    if not content.strip():
        print(f"Skipping empty markdown file: {md_file.name}")
        continue

    documents.append(
        Document(
            page_content=content,
            metadata={
                'pdf_filename': f"{md_file.stem}.pdf",
                'markdown_filename': md_file.name,
                'doc_type': 'pdf_markdown'
            }
        )
    )
    # Deterministic ID derived from the file name: re-running this cell
    # overwrites the existing vector for a file instead of adding a duplicate
    # under a fresh random ID.
    document_ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, md_file.name)))

# Store in vector database
if documents:
    vector_ids = vector_store.add_documents(documents, ids=document_ids)
    print(f"‚úÖ Stored {len(vector_ids)} documents in Pinecone")

    # Test search
    results = vector_store.similarity_search("invoice amount", k=2)
    for i, doc in enumerate(results, 1):
        pdf_name = doc.metadata.get('pdf_filename', 'Unknown')
        print(f"{i}. {pdf_name}: {doc.page_content[:100]}...")
else:
    print("‚ùå No documents to store")