# In [None]:
# Here's a Python solution using PyMuPDF and LangChain for PDF processing and metadata extraction. This code handles complex document structures and integrates with vector databases:

# ```python
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

def extract_pdf_structure(pdf_path, header_size=20, subheader_size=16):
    """Extract per-page headers, subheaders and body text from a PDF.

    Every text span is classified purely by its font size: spans at or
    above ``header_size`` are headers, spans at or above ``subheader_size``
    are subheaders, and everything else is body text. Each body span is
    recorded together with the most recently seen header and subheader.

    The header/subheader context is carried across page boundaries, so
    body text on a continuation page is still attributed to the section
    that started on an earlier page. A new header clears the current
    subheader.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        header_size: Minimum span font size treated as a header.
        subheader_size: Minimum span font size treated as a subheader.

    Returns:
        A list with one dict per page, in page order. Each dict has the
        keys ``"page_number"`` (1-based), ``"headers"``, ``"subheaders"``,
        ``"paragraphs"`` (a list of dicts with ``"text"``, ``"header"``,
        ``"subheader"`` and ``"page"``) and ``"full_text"`` (the body span
        texts, each followed by a newline).
    """
    document_metadata = []

    # Heading context lives outside the page loop so that a section which
    # spans several pages keeps its header on every page.
    current_header = None
    current_subheader = None

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            headers = []
            subheaders = []
            paragraphs = []

            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry (e.g. images) hold no text.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        font_size = span["size"]
                        if font_size >= header_size:
                            headers.append(text)
                            current_header = text
                            # A new header starts a new section, so the
                            # previous subheader no longer applies.
                            current_subheader = None
                        elif font_size >= subheader_size:
                            subheaders.append(text)
                            current_subheader = text
                        else:
                            paragraphs.append({
                                "text": text,
                                "header": current_header,
                                "subheader": current_subheader,
                                "page": page_number,
                            })

            document_metadata.append({
                "page_number": page_number,
                "headers": headers,
                "subheaders": subheaders,
                "paragraphs": paragraphs,
                # Only body text contributes to full_text; headings are
                # reported separately above.
                "full_text": "".join(p["text"] + "\n" for p in paragraphs),
            })

    return document_metadata

def process_pdf(
    pdf_path,
    index_path="pdf_vector_index",
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=200,
):
    """Extract a PDF's structure and index its body text in a FAISS store.

    The body text returned by ``extract_pdf_structure`` is split with a
    ``RecursiveCharacterTextSplitter``; every resulting chunk keeps the
    page, header and subheader of the text it came from. The chunks are
    embedded with a HuggingFace model, stored in a FAISS index, and the
    index is saved to ``index_path``.

    Args:
        pdf_path: Path of the PDF file to process.
        index_path: Directory the FAISS index is saved to.
        model_name: HuggingFace embedding model name.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by adjacent chunks.

    Returns:
        A ``(structured_data, vector_db)`` tuple: the per-page structure
        from ``extract_pdf_structure`` and the populated FAISS store.

    Raises:
        ValueError: If no body text could be extracted, since an index
            cannot be built from zero texts.
    """
    # Extract document structure
    structured_data = extract_pdf_structure(pdf_path)

    texts = []
    metadatas = []
    for page in structured_data:
        for paragraph in page["paragraphs"]:
            texts.append(paragraph["text"])
            metadatas.append({
                "page": page["page_number"],
                "header": paragraph["header"],
                "subheader": paragraph["subheader"],
            })

    # Fail early with a clear message instead of an opaque error from
    # inside the vector store when there is nothing to embed.
    if not texts:
        raise ValueError(
            f"No body text could be extracted from {pdf_path!r}; "
            "nothing to index."
        )

    # Recursive chunking with metadata preservation: create_documents
    # copies each text's metadata onto every chunk produced from it.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.create_documents(texts, metadatas=metadatas)

    # Create FAISS vector store
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_db = FAISS.from_documents(chunks, embeddings)
    vector_db.save_local(index_path)

    return structured_data, vector_db

# Usage: index the PDF, then list the headings found on each page.
pdf_path = "your_document.pdf"
structured_data, vector_db = process_pdf(pdf_path)

# Generate document structure report
print("Document Structure Overview:")
for page in structured_data:
    # One blank line separates consecutive pages in the report.
    print("\nPage {}:".format(page["page_number"]))
    print("Headers: " + ", ".join(page["headers"]))
    print("Subheaders: " + ", ".join(page["subheaders"]))

# Key components of this solution:

# **Document Structure Extraction**  
# 1. **Header Detection**: Uses font size thresholds (adjust based on your document)
# 2. **Metadata Tracking**: Maintains header/subheader context through document flow
# 3. **Paragraph Association**: Links content to current header hierarchy

# **Output Structure**  
# The code generates:
# 1. List of pages with headers/subheaders
# 2. Vector database with chunked content and metadata
# 3. Per-page report of the detected headers and subheaders

# Example output format for document structure:
# ```
# Page 1:
# Headers: Introduction, Abstract
# Subheaders: Project Overview, Methodology

# Page 2:
# Headers: System Architecture
# Subheaders: Components Diagram, Data Flow
# ```

# **Integration Options**  
# For Chroma DB instead of FAISS:
# ```python
# from langchain.vectorstores import Chroma

# # Replace FAISS code with:
# vector_db = Chroma.from_texts(
#     texts=texts,
#     embedding=embeddings,
#     metadatas=metadatas,
#     persist_directory="chroma_db"
# )
# ```

# **Optimization Tips**:
# 1. Adjust font size thresholds based on your document's typography
# 2. Add logic for bold/italic text detection using `span["flags"]`
# 3. Implement custom chunking strategies for different content types
# 4. Add error handling for malformed PDFs
# 5. Include PDF text cleanup/normalization steps

# This implementation provides a balance between automated structure detection and customizable parameters for handling various document formats.

# ---
# Answer from Perplexity: pplx.ai/share