## Same as V2 + page number extraction

In [9]:
from docx import Document
import json
import re
from xml.dom import minidom
import zipfile
from docx.oxml.shared import qn

def get_paragraph_number(paragraph):
    """Extract numbering from paragraph text if present."""
    match = re.match(r'^(\d+(\.\d+)*)\s', paragraph.text)
    if match:
        return match.group(1)
    return None

def get_text_formatting(paragraph):
    """
    Extract formatted elements from a paragraph.
    Returns a list of formatted text elements with their styles.
    """
    formatted_elements = []
    
    for run in paragraph.runs:
        # Skip empty runs
        if not run.text.strip():
            continue
            
        # Get formatting properties
        formatting = {
            "text": run.text,
            "bold": run.bold if hasattr(run, 'bold') else None,
            "italic": run.italic if hasattr(run, 'italic') else None,
            "underline": run.underline if hasattr(run, 'underline') else None
        }
        
        # Get color if it exists
        try:
            if run.font.color and run.font.color.rgb:
                rgb = run.font.color.rgb
                formatting["color"] = {
                    "r": rgb[0] if rgb else None,
                    "g": rgb[1] if rgb else None,
                    "b": rgb[2] if rgb else None
                }
        except AttributeError:
            pass
            
        # Get font name if it exists
        if hasattr(run.font, 'name') and run.font.name:
            formatting["font"] = run.font.name
            
        # Get font size if it exists
        try:
            if run.font.size:
                formatting["size"] = run.font.size.pt
        except AttributeError:
            pass
        
        # Only add elements that have some formatting
        has_formatting = any(v for k, v in formatting.items() 
                           if k != "text" and v is not None)
        if has_formatting:
            formatted_elements.append(formatting)
    
    return formatted_elements


def extract_page_numbers(docx_path):
    """
    Extract accurate page numbers by analyzing Word's OpenXML structure.
    """
    page_info = {}
    
    with zipfile.ZipFile(docx_path) as zip_ref:
        # Get main document content
        with zip_ref.open('word/document.xml') as xml_file:
            doc = minidom.parseString(xml_file.read())
            
        # Try to get header/footer content for page number fields
        header_footer_refs = {}
        try:
            rels_path = 'word/_rels/document.xml.rels'
            with zip_ref.open(rels_path) as rels_file:
                rels = minidom.parseString(rels_file.read())
                for rel in rels.getElementsByTagName('Relationship'):
                    if 'header' in rel.getAttribute('Target').lower() or 'footer' in rel.getAttribute('Target').lower():
                        header_footer_refs[rel.getAttribute('Id')] = rel.getAttribute('Target')
        except KeyError:
            pass

    # Find last rendered page breaks
    paragraphs = doc.getElementsByTagName('w:p')
    current_page = 1
    
    for para in paragraphs:
        para_id = para.getAttribute('w14:paraId')
        if not para_id:
            continue
            
        # Check for last rendered page break
        for child in para.getElementsByTagName('w:lastRenderedPageBreak'):
            current_page += 1
            
        # Store page number for this paragraph
        page_info[para_id] = current_page
        
        # Check section properties for page setup
        sect_pr = para.getElementsByTagName('w:sectPr')
        if sect_pr:
            # New section might indicate new page
            if sect_pr[0].getElementsByTagName('w:type'):
                current_page += 1
    
    return page_info

def get_paragraph_page(paragraph, page_info):
    """
    Get the page number for a paragraph.
    """
    para_id = paragraph._element.get(qn('w14:paraId'))
    if not para_id:
        para_id = paragraph._element.get('w14:paraId')
    
    return page_info.get(para_id)

def extract_docx_content_with_pages(input_file):
    """
    Extract document content with accurate page numbers.
    """
    doc = Document(input_file)
    document_structure = []
    current_header = None
    
    # Get page number information
    page_info = extract_page_numbers(input_file)
    
    # Process document content
    for paragraph in doc.paragraphs:
        page_number = get_paragraph_page(paragraph, page_info)
        
        if paragraph.style.name.startswith('Heading'):
            level = int(paragraph.style.name.split()[-1])
            number = get_paragraph_number(paragraph)
            formatted_elements = get_text_formatting(paragraph)
            
            current_header = {
                "text": paragraph.text,
                "level": level,
                "style": paragraph.style.name,
                "number": number,
                "formatted_elements": formatted_elements,
                "page": page_number
            }
            document_structure.append({
                "header": current_header,
                "paragraphs": [],
                "formatted_paragraphs": []
            })
        elif paragraph.text.strip():
            formatted_elements = get_text_formatting(paragraph)
            
            if document_structure:
                document_structure[-1]["paragraphs"].append({
                    "text": paragraph.text,
                    "page": page_number
                })
                if formatted_elements:
                    document_structure[-1]["formatted_paragraphs"].append({
                        "text": paragraph.text,
                        "formatted_elements": formatted_elements,
                        "page": page_number
                    })
            else:
                document_structure.append({
                    "header": None,
                    "paragraphs": [{
                        "text": paragraph.text,
                        "page": page_number
                    }],
                    "formatted_paragraphs": [{
                        "text": paragraph.text,
                        "formatted_elements": formatted_elements,
                        "page": page_number
                    }] if formatted_elements else []
                })
    
    return document_structure

def print_outline_with_pages(structure):
    """
    Print document outline with page numbers in a clean format.
    """
    print("\nDocument Outline")
    print("===============\n")
    
    for section in structure:
        if section.get("header"):
            header = section["header"]
            indent = "    " * (header["level"] - 1)
            header_text = header["text"].strip()
            
            if header.get("number"):
                header_text = f"{header['number']} {header_text}"
                
            page_num = f"[p.{header.get('page', '?')}]"
            dots = "." * (60 - len(indent) - len(header_text) - len(page_num))
            
            print(f"{indent}{header_text} {dots} {page_num}")
            
            # Print formatted elements if they exist
            if section["formatted_paragraphs"]:
                for para in section["formatted_paragraphs"]:
                    for elem in para["formatted_elements"]:
                        styles = []
                        if elem.get("bold"): styles.append("B")
                        if elem.get("italic"): styles.append("I")
                        if elem.get("underline"): styles.append("U")
                        
                        if styles:
                            elem_text = f"{elem['text']}[{','.join(styles)}]"
                            elem_page = f"[p.{para.get('page', '?')}]"
                            print(f"{indent}  • {elem_text} {elem_page}")
                            
            print()  # Add spacing between sections


In [10]:
def print_table_of_contents_with_metadata(structure):
    """
    Print table of contents with formatting metadata under each section.
    Includes styled elements found in the paragraphs under each heading.
    """
    print("\nTable of Contents")
    print("================\n")
    
    def format_header(header, indent_level=0):
        """Helper function to format header with proper indentation"""
        indent = "    " * indent_level
        header_text = header["text"].strip()
        if header.get("number"):
            header_text = f"{header['number']} {header_text}"
        
        # Add page number if available
        page_num = header.get("page", "?")
        dots = "." * (60 - len(indent) - len(header_text) - len(str(page_num)) - 4)  # 4 for "[p.]"
        return f"{indent}{header_text} {dots} [p.{page_num}]"
    
    def format_metadata(formatted_paragraphs):
        """Helper function to extract and format metadata from paragraphs"""
        metadata = []
        for para in formatted_paragraphs:
            for elem in para["formatted_elements"]:
                meta_item = {
                    "text": elem["text"].strip(),
                    "styles": [],
                    "page": para.get("page", "?")
                }
                
                if elem.get("bold"):
                    meta_item["styles"].append("bold")
                if elem.get("italic"):
                    meta_item["styles"].append("italic")
                if elem.get("underline"):
                    meta_item["styles"].append("underlined")
                if elem.get("color"):
                    color = elem["color"]
                    meta_item["styles"].append(
                        f"color:rgb({color['r']},{color['g']},{color['b']})"
                    )
                if elem.get("font"):
                    meta_item["styles"].append(f"font:{elem['font']}")
                if elem.get("size"):
                    meta_item["styles"].append(f"size:{elem['size']}")
                
                if meta_item["styles"]:  # Only add items with formatting
                    metadata.append(meta_item)
        return metadata
    
    for section in structure:
        if section.get("header"):
            header = section["header"]
            # Print header with proper indentation
            print(format_header(header, header["level"] - 1))
            
            # Process formatted elements under this header
            metadata = format_metadata(section.get("formatted_paragraphs", []))
            
            # Print metadata if exists
            if metadata:
                print("    └── Formatted elements:")
                for item in metadata:
                    print(f"        • \"{item['text']}\" [p.{item['page']}]")
            print()  # Add blank line for readability

def print_toc_compact(structure):
    """
    Print a more compact version of the table of contents with metadata.
    """
    print("\nTable of Contents (Compact)")
    print("=========================\n")
    
    for section in structure:
        if section.get("header"):
            header = section["header"]
            indent = "  " * (header["level"] - 1)
            
            # Print header with page number
            header_text = header["text"].strip()
            if header.get("number"):
                header_text = f"{header['number']} {header_text}"
            
            # Add page number
            page_num = header.get("page", "?")
            dots = "." * (60 - len(indent) - len(header_text) - len(str(page_num)) - 4)
            print(f"{indent}{header_text} {dots} [p.{page_num}]")
            
            # Print formatted elements inline
            formatted_elements = []
            for para in section.get("formatted_paragraphs", []):
                for elem in para["formatted_elements"]:
                    styles = []
                    if elem.get("bold"): styles.append("B")
                    if elem.get("italic"): styles.append("I")
                    if elem.get("underline"): styles.append("U")
                    if styles:
                        formatted_elements.append(
                            f"{elem['text']}[{','.join(styles)}] [p.{para.get('page', '?')}]"
                        )
            
            if formatted_elements:
                print(f"{indent}  └─ {', '.join(formatted_elements)}")
            print()  # Add blank line for readability

## Try

In [11]:
input_file = "embedded_docx_embedded_1.docx"
structure = extract_docx_content_with_pages(input_file)

# print_outline_with_pages(structure)
print_table_of_contents_with_metadata(structure)


Table of Contents

INTRODUCTION ........................................... [p.3]
    └── Formatted elements:
        • "Thales AVS MIS needs the support of the Thales Digital Factory especially the Professional Service team to explore using" [p.3]
        • "TrustNest" [p.3]
        • "Digital Platform and support AVS team in two POCs" [p.3]
        • "described below" [p.3]
        • "." [p.3]
        • "This commercial proposal will describe the support provided during the “Build” phase. “Run” costs" [p.3]
        • "related to TDP usages" [p.3]
        • "are not included in this commercial proposal and should be estimated" [p.3]
        • "by" [p.3]
        • "You must also estimate your Azure Consumptions due to the usage. Please find enclosed to this commercial proposal, the Billing User Guide." [p.3]
        • "The objectives of the PoC are:" [p.3]
        • "Run POCs on the largest VM sizes to measure performance" [p.3]
        • "gains" [p.3]
        • "Explore optimization 

In [12]:
input_file = "repports/financial_stament_review.docx"
structure = extract_docx_content_with_pages(input_file)

# print_outline_with_pages(structure)
print_table_of_contents_with_metadata(structure)


Table of Contents

There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) -  [p.None]
    └── Formatted elements:
        • "" [p.None]

Balance Sheet ....................................... [p.None]
    └── Formatted elements:
        • "" [p.None]

Income Statement .................................... [p.None]
    └── Formatted elements:
        • "-Cost of Goods Sold" [p.None]
        • "-Operating Expenses" [p.None]
        • "+/- Miscellaneous" [p.None]
        • "-Income Tax Expense" [p.None]

Statement of Changes in Shareholder’s Equity ........ [p.None]
    └── Formatted elements:
        • "Plus Net Income or Minus Net Loss" [p.None]

Statement of Cash Flows ............................. [p.None]
    └── Formatted elements:
        • "cash" [p.None]
        • "cash" [p.None]
        • "" [p.None]
        • "Preparation of Financial Statements:" [p.None]
        • "Example 1:" [p.None]
        • "B