# Solution
#### Task
- Extract text from multiple document formats
- Recognize document structure and organization
- Extract and enrich documents with metadata
- Create a searchable index of processed documents

Your task is to implement key functions in the provided boilerplate code to create a robust document processing system.

In [2]:
from llama_index.core import Document
from llama_index.core.node_parser import SimpleNodeParser, MarkdownNodeParser, HTMLNodeParser
from llama_index.readers.file import PDFReader, DocxReader 
from llama_index.core.schema import MetadataMode
import os
from datetime import datetime
import re
from typing import List, Dict, Any, Optional, Tuple

class DocumentProcessor:
    """A complete document processing pipeline"""
    
    def __init__(self, storage_dir: str = "./processed_docs"):
        """Initialize the document processor"""
        self.storage_dir = storage_dir
        self.documents = []
        self.nodes = []
        self.document_map = []
        
        # Create storage directory if it doesn't exist
        if not os.path.exists(storage_dir):
            os.makedirs(storage_dir)
    
    def load_document(self, file_path: str) -> Optional[Document]:
        """
        Load a document from a file path using the appropriate reader
        """
        try:
            # Get file extension (lowercase)
            _, ext = os.path.splitext(file_path)
            ext = ext.lower()
            
            # Initialize content variable
            content = None
            
            # Choose appropriate reader based on file extension
            if ext == '.pdf':
                reader = PDFReader()
                docs = reader.load_data(file=file_path)
                if docs:
                    content = docs[0].text
            elif ext in ['.docx', '.doc']:
                reader = DocxReader()
                docs = reader.load_data(file=file_path)
                if docs:
                    content = docs[0].text
            elif ext in ['.txt', '.md', '.markdown']:
                # Simple text file reading
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            elif ext in ['.html', '.htm']:
                # Simple HTML file reading
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            else:
                print(f"Unsupported file type: {ext}")
                return None
                
            if content:
                # Create document with basic file metadata
                doc = Document(
                    text=content,
                    metadata={
                        "file_path": file_path,
                        "file_name": os.path.basename(file_path),
                        "file_type": ext[1:]  # Remove the dot
                    }
                )
                return doc
            else:
                print(f"No content extracted from {file_path}")
                return None
                
        except Exception as e:
            print(f"Error loading document {file_path}: {str(e)}")
            return None
    
    def extract_metadata(self, doc: Document) -> Dict[str, Any]:
        """
        Extract metadata from document content and file information
        """
        metadata = {}
        
        # Copy existing metadata
        if doc.metadata:
            metadata.update(doc.metadata)
        
        # Add basic information
        metadata["file_size"] = len(doc.text)
        metadata["extracted_date"] = datetime.now().strftime("%Y-%m-%d")
        metadata["num_characters"] = len(doc.text)
        metadata["num_words"] = len(doc.text.split())
        
        # Extract potential date/year using regex
        # Look for common date formats: YYYY-MM-DD, MM/DD/YYYY, etc.
        date_patterns = [
            r'\b(20\d{2}[/\-]\d{1,2}[/\-]\d{1,2})\b',  # 2023-01-01
            r'\b(\d{1,2}[/\-]\d{1,2}[/\-]20\d{2})\b',  # 01-01-2023
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+20\d{2}\b'  # January 1, 2023
        ]
        
        for pattern in date_patterns:
            match = re.search(pattern, doc.text)
            if match:
                metadata["date_found"] = match.group(1)
                break
        
        # Extract year
        year_pattern = r'\b(20\d{2})\b'
        year_matches = re.findall(year_pattern, doc.text)
        if year_matches:
            # Use the most frequent year
            year_counts = {}
            for year in year_matches:
                year_counts[year] = year_counts.get(year, 0) + 1
            
            most_common_year = max(year_counts.items(), key=lambda x: x[1])[0]
            metadata["year"] = most_common_year
        
        # Try to identify document type
        doc_type_patterns = {
            "report": r'\breport\b',
            "invoice": r'\binvoice\b',
            "proposal": r'\bproposal\b',
            "contract": r'\bcontract\b|agreement\b',
            "manual": r'\bmanual\b|\bguide\b|\binstruction',
            "article": r'\barticle\b',
            "analysis": r'\banalysis\b'
        }
        
        for doc_type, pattern in doc_type_patterns.items():
            if re.search(pattern, doc.text.lower()):
                metadata["document_type"] = doc_type
                break
        
        # Try to extract title from first line or filename
        lines = doc.text.strip().split('\n')
        if lines and len(lines[0]) < 200:  # Assume first line might be title if not too long
            metadata["title"] = lines[0].strip()
        else:
            # Use filename without extension as fallback title
            filename = metadata.get("file_name", "")
            if filename:
                base_name = os.path.splitext(filename)[0]
                metadata["title"] = base_name.replace("_", " ").replace("-", " ").title()
        
        return metadata
    
    def parse_document(self, doc: Document) -> List[Document]:
        """Split a document into nodes and register both on the processor.

        The document's metadata is replaced with the result of
        ``extract_metadata``. A node parser is then picked from the
        ``file_type`` metadata (markdown, HTML, or a 1024/100 chunking parser
        for everything else). Every node is tagged with ``doc_id`` (the
        document's position in ``self.documents``) and ``node_id`` (its index
        within this document), plus a ``heading`` taken from its first line
        when that line is shorter than 150 characters and no heading is
        already set.

        Args:
            doc: The document to parse; its ``metadata`` is overwritten.

        Returns:
            The nodes produced for this document (also appended to
            ``self.nodes``).
        """
        enriched = self.extract_metadata(doc)
        doc.metadata = enriched

        # Pick the splitter that understands this file type's structure.
        kind = enriched.get("file_type", "").lower()
        if kind in ("md", "markdown"):
            splitter = MarkdownNodeParser()
        elif kind in ("html", "htm"):
            splitter = HTMLNodeParser()
        else:
            splitter = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=100)

        chunks = splitter.get_nodes_from_documents([doc])

        # The id is taken before the append below, so it equals the index the
        # document will occupy in self.documents.
        owner_id = len(self.documents)
        for position, chunk in enumerate(chunks):
            chunk.metadata["doc_id"] = owner_id
            chunk.metadata["node_id"] = position

            # Simple heuristic: a short first line is treated as the heading.
            opening_line = chunk.text.strip().split('\n')[0]
            if len(opening_line) < 150 and "heading" not in chunk.metadata:
                chunk.metadata["heading"] = opening_line.strip()

        self.documents.append(doc)
        self.nodes.extend(chunks)

        return chunks
    
    def create_document_map(self, nodes: List[Document]) -> List[Dict[str, Any]]:
        """
        Create a document map for navigation
        """
        document_map = []
        
        for i, node in enumerate(nodes):
            # Get heading info
            heading = node.metadata.get("heading", "")
            level = node.metadata.get("heading_level", 0)
            
            # If no heading info is found, try to create one
            if not heading:
                # Extract first line as potential heading
                lines = node.text.strip().split('\n')
                if lines and len(lines[0]) < 150:
                    heading = lines[0].strip()
                    # Try to guess the level based on appearance
                    if lines[0].startswith('#'):
                        level = lines[0].count('#', 0, lines[0].find(' '))
                    elif lines[0].startswith('=='):
                        level = 1
                    elif lines[0].startswith('--'):
                        level = 2
                    else:
                        level = 1  # Default level
                else:
                    # Create a generic section name
                    heading = f"Section {i+1}"
                    level = 1
            
            # Create indentation for hierarchical display
            indent = "  " * (level - 1) if level > 0 else ""
            display = f"{indent}{heading}"
            
            # Find relevant metadata for the map
            map_item = {
                "heading": heading,
                "level": level,
                "index": i,
                "doc_id": node.metadata.get("doc_id", 0),
                "display": display,
                "text_preview": node.text[:100] + "..." if len(node.text) > 100 else node.text
            }
            
            document_map.append(map_item)
        
        # Update the global document map
        self.document_map.extend(document_map)
        
        return document_map
    
    def process_directory(self, directory_path: str) -> int:
        """
        Process all documents in a directory
        """
        processed_count = 0
        failed_count = 0
        
        # Check if directory exists
        if not os.path.isdir(directory_path):
            print(f"Directory not found: {directory_path}")
            return 0
        
        # List of supported extensions
        supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md', '.markdown', '.html', '.htm']
        
        # Process each file
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            
            # Skip directories
            if os.path.isdir(file_path):
                continue
                
            # Check if file type is supported
            _, ext = os.path.splitext(filename)
            if ext.lower() not in supported_extensions:
                continue
                
            print(f"Processing: {filename}")
            
            # Load document
            doc = self.load_document(file_path)
            if doc:
                # Process document
                nodes = self.parse_document(doc)
                self.create_document_map(nodes)
                processed_count += 1
                print(f"  Success: {len(nodes)} nodes created")
            else:
                failed_count += 1
                print(f"  Failed to process {filename}")
        
        print(f"Directory processing completed. Processed: {processed_count}, Failed: {failed_count}")
        return processed_count
    
    def search(self, query: str, filters: Optional[Dict[str, str]] = None) -> List[Dict[str, Any]]:
        """
        Search processed documents
        """
        results = []
        
        if not query and not filters:
            return results
            
        query = query.lower()
        
        # Process each node
        for i, node in enumerate(self.nodes):
            match = False
            
            # Check query text match
            if query and query in node.text.lower():
                match = True
                
            # Check metadata filters
            if filters and match:
                for key, value in filters.items():
                    if key not in node.metadata or str(node.metadata[key]) != str(value):
                        match = False
                        break
            elif filters and not query:
                # If only filters are provided (no query text)
                match = True
                for key, value in filters.items():
                    if key not in node.metadata or str(node.metadata[key]) != str(value):
                        match = False
                        break
            
            if match:
                # Find the context of the match
                if query:
                    index = node.text.lower().find(query)
                    start = max(0, index - 40)
                    end = min(len(node.text), index + len(query) + 40)
                    context = node.text[start:end]
                    
                    # Highlight the match
                    if start > 0:
                        context = "..." + context
                    if end < len(node.text):
                        context = context + "..."
                else:
                    # No query text, use beginning of node
                    context = node.text[:100] + "..." if len(node.text) > 100 else node.text
                
                # Create result item
                result = {
                    "node_id": i,
                    "doc_id": node.metadata.get("doc_id", 0),
                    "heading": node.metadata.get("heading", "Section"),
                    "context": context,
                    "metadata": node.metadata
                }
                
                results.append(result)
        
        return results
    
    def get_document_structure(self, doc_id: int) -> Dict[str, Any]:
        """
        Get the structure of a specific document
        """
        if doc_id >= len(self.documents):
            return {"error": "Document ID not found"}
            
        # Find all map items for this document
        doc_sections = [item for item in self.document_map if item.get("doc_id") == doc_id]
        
        if not doc_sections:
            return {"error": "No sections found for document"}
            
        # Get document metadata
        doc_metadata = self.documents[doc_id].metadata.copy()
        
        # Create structure
        structure = {
            "title": doc_metadata.get("title", f"Document {doc_id}"),
            "metadata": doc_metadata,
            "sections": []
        }
        
        # Organize sections
        for section in doc_sections:
            section_info = {
                "heading": section.get("heading", ""),
                "level": section.get("level", 1),
                "display": section.get("display", ""),
                "node_id": section.get("index")
            }
            structure["sections"].append(section_info)
            
        return structure
        
    def save_processed_documents(self) -> bool:
        """
        Save processed documents to storage directory
        
        This function is implemented for you
        """
        try:
            # Simple file-based storage
            with open(f"{self.storage_dir}/document_map.txt", "w") as f:
                for item in self.document_map:
                    f.write(f"{item['display']} - ID: {item['index']}\n")
            
            print(f"Saved document map with {len(self.document_map)} entries")
            return True
        except Exception as e:
            print(f"Error saving documents: {str(e)}")
            return False

# Example usage: exercises the pipeline end to end. Each step prints its own
# diagnostics, so missing inputs are reported rather than raised.
if __name__ == "__main__":
    # Uses the default storage directory ("./processed_docs"), creating it if needed.
    processor = DocumentProcessor()
    
    # Process a single document; load_document returns None on any failure,
    # in which case this whole step is skipped.
    doc = processor.load_document("example.pdf")
    if doc:
        # NOTE: parse_document calls extract_metadata itself, so this call is
        # only here to expose the metadata dict on its own.
        metadata = processor.extract_metadata(doc)
        nodes = processor.parse_document(doc)
        doc_map = processor.create_document_map(nodes)
        
        print(f"Processed document with {len(nodes)} nodes")
        print(f"Document structure has {len(doc_map)} sections")
        
    # Process a directory (non-recursive); returns the number of documents
    # that were processed successfully.
    num_processed = processor.process_directory("./documents")
    print(f"Processed {num_processed} documents from directory")
    
    # Search example: text match combined with a metadata filter. Filter
    # values are compared as strings against the node metadata.
    results = processor.search("budget", filters={"year": "2023"})
    print(f"Found {len(results)} results for 'budget' in 2023 documents")

Error loading document example.pdf: RetryError[<Future at 0x7b307b8d86e0 state=finished raised FileNotFoundError>]
Directory not found: ./documents
Processed 0 documents from directory
Found 0 results for 'budget' in 2023 documents
