# Challenge
#### Task
- Extract text from multiple document formats
- Recognize document structure and organization
- Extract and enrich documents with metadata
- Create a searchable index of processed documents

Your task is to implement key functions in the provided boilerplate code to create a robust document processing system.

# Hints 
- Start with the load_document function to get documents into the system
- For extract_metadata, focus on simple pattern matching for dates, titles, etc.
- In parse_document, use the appropriate parser based on document type
- The create_document_map function should create a hierarchical view of document sections
- For the search function, implement simple text matching with metadata filtering

In [2]:
from llama_index.core import Document
from llama_index.core.node_parser import SimpleNodeParser, MarkdownNodeParser, HTMLNodeParser
from llama_index.readers.file import PDFReader, DocxReader
from llama_index.core.schema import MetadataMode
import os
from datetime import datetime
import re
from typing import List, Dict, Any, Optional, Tuple

class DocumentProcessor:
    """A complete document processing pipeline"""
    
    def __init__(self, storage_dir: str = "./processed_docs"):
        """Initialize the document processor"""
        self.storage_dir = storage_dir
        self.documents = []
        self.nodes = []
        self.document_map = []
        
        # Create storage directory if it doesn't exist
        if not os.path.exists(storage_dir):
            os.makedirs(storage_dir)
    
    def load_document(self, file_path: str) -> Optional[Document]:
        """
        Load a document from a file path using the appropriate reader
        
        TODO: Implement this function to:
        1. Determine the file type based on extension
        2. Use the appropriate reader (PDFReader, DocxReader, etc.)
        3. Load and return the document with basic metadata
        4. Return None if the file type is unsupported or there's an error
        """
        # YOUR CODE HERE
        pass
    
    def extract_metadata(self, doc: Document) -> Dict[str, Any]:
        """
        Extract metadata from document content and file information
        
        TODO: Implement this function to:
        1. Extract basic file metadata (filename, path, type, etc.)
        2. Try to identify document date/year from content
        3. Attempt to determine document type/category
        4. Extract any other useful metadata (authors, titles, etc.)
        """
        # YOUR CODE HERE
        pass
    
    def parse_document(self, doc: Document) -> List[Document]:
        """
        Parse a document into nodes based on structure
        
        TODO: Implement this function to:
        1. Determine the appropriate parser based on document type
        2. Parse the document into nodes that respect its structure
        3. Ensure metadata is properly propagated to nodes
        4. Return the list of nodes
        """
        # YOUR CODE HERE
        pass
    
    def create_document_map(self, nodes: List[Document]) -> List[Dict[str, Any]]:
        """
        Create a document map for navigation
        
        TODO: Implement this function to:
        1. Extract the hierarchical structure from nodes
        2. Create a map that shows document organization
        3. Include section headings and their relationships
        4. Store references to the actual content
        """
        # YOUR CODE HERE
        pass
    
    def process_directory(self, directory_path: str) -> int:
        """
        Process all documents in a directory
        
        TODO: Implement this function to:
        1. Find all supported documents in the directory
        2. Process each document through the pipeline
        3. Track success/failure statistics
        4. Return the number of successfully processed documents
        """
        # YOUR CODE HERE
        pass
    
    def search(self, query: str, filters: Optional[Dict[str, str]] = None) -> List[Dict[str, Any]]:
        """
        Search processed documents
        
        TODO: Implement this function to:
        1. Search document content for the query string
        2. Apply metadata filters if provided
        3. Return matching document sections with context
        4. Include document metadata in results
        """
        # YOUR CODE HERE
        pass
    
    def get_document_structure(self, doc_id: int) -> Dict[str, Any]:
        """
        Get the structure of a specific document
        
        TODO: Implement this function to:
        1. Return the hierarchical structure of the document
        2. Include section titles and their relationships
        3. Format it for easy display (like a table of contents)
        """
        # YOUR CODE HERE
        pass
        
    def save_processed_documents(self) -> bool:
        """
        Save processed documents to storage directory
        
        This function is implemented for you
        """
        try:
            # Simple file-based storage
            with open(f"{self.storage_dir}/document_map.txt", "w") as f:
                for item in self.document_map:
                    f.write(f"{item['display']} - ID: {item['index']}\n")
            
            print(f"Saved document map with {len(self.document_map)} entries")
            return True
        except Exception as e:
            print(f"Error saving documents: {str(e)}")
            return False

# Example usage
if __name__ == "__main__":
    processor = DocumentProcessor()
    
    # Process a single document
    doc = processor.load_document("example.pdf")
    if doc:
        metadata = processor.extract_metadata(doc)
        nodes = processor.parse_document(doc)
        doc_map = processor.create_document_map(nodes)
        
        print(f"Processed document with {len(nodes)} nodes")
        print(f"Document structure has {len(doc_map)} sections")
        
    # Process a directory
    num_processed = processor.process_directory("./documents")
    print(f"Processed {num_processed} documents from directory")
    
    # Search example
    results = processor.search("budget", filters={"year": "2023"})
    print(f"Found {len(results)} results for 'budget' in 2023 documents")

Processed None documents from directory


TypeError: object of type 'NoneType' has no len()