## Extract headlines and special style texts

In [1]:
from docx import Document
import json
import re

def get_paragraph_number(paragraph):
    """Extract numbering from paragraph text if present."""
    match = re.match(r'^(\d+(\.\d+)*)\s', paragraph.text)
    if match:
        return match.group(1)
    return None

def get_text_formatting(paragraph):
    """
    Extract formatted elements from a paragraph.
    Returns a list of formatted text elements with their styles.
    """
    formatted_elements = []
    
    for run in paragraph.runs:
        # Skip empty runs
        if not run.text.strip():
            continue
            
        # Get formatting properties
        formatting = {
            "text": run.text,
            "bold": run.bold if hasattr(run, 'bold') else None,
            "italic": run.italic if hasattr(run, 'italic') else None,
            "underline": run.underline if hasattr(run, 'underline') else None
        }
        
        # Get color if it exists
        try:
            if run.font.color and run.font.color.rgb:
                rgb = run.font.color.rgb
                formatting["color"] = {
                    "r": rgb[0] if rgb else None,
                    "g": rgb[1] if rgb else None,
                    "b": rgb[2] if rgb else None
                }
        except AttributeError:
            pass
            
        # Get font name if it exists
        if hasattr(run.font, 'name') and run.font.name:
            formatting["font"] = run.font.name
            
        # Get font size if it exists
        try:
            if run.font.size:
                formatting["size"] = run.font.size.pt
        except AttributeError:
            pass
        
        # Only add elements that have some formatting
        has_formatting = any(v for k, v in formatting.items() 
                           if k != "text" and v is not None)
        if has_formatting:
            formatted_elements.append(formatting)
    
    return formatted_elements

def extract_docx_content_python_docx(input_file):
    """
    Extract content from a docx file including headers, paragraphs, and text formatting.
    """
    doc = Document(input_file)
    document_structure = []
    current_header = None
    
    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith('Heading'):
            level = int(paragraph.style.name.split()[-1])
            number = get_paragraph_number(paragraph)
            
            # Extract formatting for header text
            formatted_elements = get_text_formatting(paragraph)
            
            current_header = {
                "text": paragraph.text,
                "level": level,
                "style": paragraph.style.name,
                "number": number,
                "formatted_elements": formatted_elements
            }
            document_structure.append({
                "header": current_header,
                "paragraphs": [],
                "formatted_paragraphs": []
            })
        elif paragraph.text.strip():
            # Extract formatting for paragraph text
            formatted_elements = get_text_formatting(paragraph)
            
            if document_structure:
                document_structure[-1]["paragraphs"].append(paragraph.text)
                if formatted_elements:  # Only add if there are formatted elements
                    document_structure[-1]["formatted_paragraphs"].append({
                        "text": paragraph.text,
                        "formatted_elements": formatted_elements
                    })
            else:
                document_structure.append({
                    "header": None,
                    "paragraphs": [paragraph.text],
                    "formatted_paragraphs": [{
                        "text": paragraph.text,
                        "formatted_elements": formatted_elements
                    }] if formatted_elements else []
                })
    
    # Extract basic metadata
    metadata = {
        "author": doc.core_properties.author,
        "created": doc.core_properties.created.isoformat() if doc.core_properties.created else None,
        "modified": doc.core_properties.modified.isoformat() if doc.core_properties.modified else None,
        "title": doc.core_properties.title,
    }
    
    return document_structure, metadata

def print_formatted_structure_infos(structure):
    """Print document structure with formatting information."""
    for section in structure:
        if section["header"]:
            print("\nHeader:", section["header"]["text"])
            print("Level:", section["header"]["level"])
            print("Style:", section["header"]["style"])
            if section["header"]["formatted_elements"]:
                print("Header Formatting:")
                for elem in section["header"]["formatted_elements"]:
                    print(f"  - '{elem['text']}': ", end="")
                    formats = []
                    if elem.get("bold"): formats.append("bold")
                    if elem.get("italic"): formats.append("italic")
                    if elem.get("underline"): formats.append("underlined")
                    if elem.get("color"): 
                        formats.append(f"color: rgb({elem['color']['r']},{elem['color']['g']},{elem['color']['b']})")
                    print(", ".join(formats))
        
        if section["formatted_paragraphs"]:
            print("\nFormatted Paragraphs:")
            for para in section["formatted_paragraphs"]:
                for elem in para["formatted_elements"]:
                    print(f"  - '{elem['text']}': ", end="")
                    formats = []
                    if elem.get("bold"): formats.append("bold")
                    if elem.get("italic"): formats.append("italic")
                    if elem.get("underline"): formats.append("underlined")
                    if elem.get("color"): 
                        formats.append(f"color: rgb({elem['color']['r']},{elem['color']['g']},{elem['color']['b']})")
                    print(", ".join(formats))

In [2]:
def print_table_of_contents_with_metadata(structure):
    """
    Print table of contents with formatting metadata under each section.
    Includes styled elements found in the paragraphs under each heading.
    """
    print("\nTable of Contents")
    print("================\n")
    
    def format_header(header, indent_level=0):
        """Helper function to format header with proper indentation"""
        indent = "    " * indent_level
        header_text = header["text"].strip()
        if header.get("number"):
            header_text = f"{header['number']} {header_text}"
        return f"{indent}{header_text}"
    
    def format_metadata(formatted_paragraphs):
        """Helper function to extract and format metadata from paragraphs"""
        metadata = []
        for para in formatted_paragraphs:
            for elem in para["formatted_elements"]:
                meta_item = {
                    "text": elem["text"].strip(),
                    "styles": []
                }
                
                if elem.get("bold"):
                    meta_item["styles"].append("bold")
                if elem.get("italic"):
                    meta_item["styles"].append("italic")
                if elem.get("underline"):
                    meta_item["styles"].append("underlined")
                if elem.get("color"):
                    color = elem["color"]
                    meta_item["styles"].append(
                        f"color:rgb({color['r']},{color['g']},{color['b']})"
                    )
                if elem.get("font"):
                    meta_item["styles"].append(f"font:{elem['font']}")
                if elem.get("size"):
                    meta_item["styles"].append(f"size:{elem['size']}")
                
                if meta_item["styles"]:  # Only add items with formatting
                    metadata.append(meta_item)
        return metadata
    
    for section in structure:
        if section.get("header"):
            header = section["header"]
            # Print header with proper indentation
            print(format_header(header, header["level"] - 1))
            
            # Process formatted elements under this header
            metadata = format_metadata(section.get("formatted_paragraphs", []))
            
            # Print metadata if exists
            if metadata:
                print("    └── Formatted elements:")  
                for item in metadata:
                    # styles = ", ".join(item["styles"])
                    # print(f"        • \"{item['text']}\" ({styles})")
                    print(f"        • \"{item['text']}\"")
            print()  # Add blank line for readability

# Alternative version with more compact output
def print_toc_compact(structure):
    """
    Print a more compact version of the table of contents with metadata.
    """
    print("\nTable of Contents (Compact)")
    print("=========================\n")
    
    for section in structure:
        if section.get("header"):
            header = section["header"]
            indent = "  " * (header["level"] - 1)
            
            # Print header
            header_text = header["text"].strip()
            if header.get("number"):
                header_text = f"{header['number']} {header_text}"
            print(f"{indent}{header_text}")
            
            # Print formatted elements inline
            formatted_elements = []
            for para in section.get("formatted_paragraphs", []):
                for elem in para["formatted_elements"]:
                    styles = []
                    if elem.get("bold"): styles.append("B")
                    if elem.get("italic"): styles.append("I")
                    if elem.get("underline"): styles.append("U")
                    if styles:
                        formatted_elements.append(
                            f"{elem['text']}[{','.join(styles)}]"
                        )
            
            if formatted_elements:
                print(f"{indent}  └─ {', '.join(formatted_elements)}")
            print()  # Add blank line for readability


## Try

In [None]:

input_file = "repports/financial_stament_review2.docx"
structure, metadata = extract_docx_content_python_docx(input_file)

# Print formatted JSON structure
# print("Document Structure:")
# print(json.dumps(structure, indent=2))

# print("\nMetadata:")
# print(json.dumps(metadata, indent=2))

print("\nFormatted Output:")
print_table_of_contents_with_metadata(structure)

# print_toc_compact(structure)


Formatted Output:

Table of Contents

There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) -
    └── Formatted elements:
        • ""

Balance Sheet
    └── Formatted elements:
        • ""

Income Statement
    └── Formatted elements:
        • "-Cost of Goods Sold"
        • "-Operating Expenses"
        • "+/- Miscellaneous"
        • "-Income Tax Expense"

Statement of Changes in Shareholder’s Equity
    └── Formatted elements:
        • "Plus Net Income or Minus Net Loss"

Statement of Cash Flows
    └── Formatted elements:
        • "cash"
        • "cash"
        • ""
        • "Preparation of Financial Statements:"
        • "Example 1:"

    Bowser blue Incorporated
    └── Formatted elements:
        • "Bowser Incorporated"
        • "Income Statement"
        • "For the year ending December 31, 2009"
        • "Net Income before taxes"
        • "21,200 Income tax expense"
        • "6360"
   

In [11]:
from docx import Document
import json
import re

def get_paragraph_number(paragraph):
    """Extract numbering from paragraph text if present."""
    match = re.match(r'^(\d+(\.\d+)*)\s', paragraph.text)
    if match:
        return match.group(1)
    return None

def get_paragraph_style(paragraph):
    """Get the base style information of a paragraph"""
    try:
        style = {
            "name": paragraph.style.name,
            "font": None,
            "size": None,
            "bold": None,
            "italic": None,
            "color": None
        }
        
        # Safely get font properties
        if hasattr(paragraph.style, 'font'):
            font = paragraph.style.font
            style.update({
                "font": font.name if hasattr(font, 'name') else None,
                "bold": font.bold if hasattr(font, 'bold') else None,
                "italic": font.italic if hasattr(font, 'italic') else None
            })
            
            # Safely get font size
            if hasattr(font, 'size') and font.size is not None:
                try:
                    style["size"] = font.size.pt
                except AttributeError:
                    pass

            # Safely get color
            if hasattr(font, 'color') and font.color and hasattr(font.color, 'rgb'):
                rgb = font.color.rgb
                if rgb:
                    style["color"] = {
                        "r": rgb[0],
                        "g": rgb[1],
                        "b": rgb[2]
                    }
                    
        return style
    except AttributeError:
        return None

def get_run_style(run):
    """Extract complete style information from a run"""
    style = {
        "text": run.text,
        "bold": None,
        "italic": None,
        "underline": None,
        "font": None,
        "size": None,
        "color": None,
        "highlight_color": None
    }
    
    # Safely get basic properties
    style.update({
        "bold": run.bold if hasattr(run, 'bold') else None,
        "italic": run.italic if hasattr(run, 'italic') else None,
        "underline": run.underline if hasattr(run, 'underline') else None
    })
    
    # Safely get font properties
    if hasattr(run, 'font'):
        font = run.font
        
        # Get font name
        if hasattr(font, 'name'):
            style["font"] = font.name
            
        # Get font size
        if hasattr(font, 'size') and font.size is not None:
            try:
                style["size"] = font.size.pt
            except AttributeError:
                pass
                
        # Get text color
        if hasattr(font, 'color') and font.color and hasattr(font.color, 'rgb'):
            rgb = font.color.rgb
            if rgb:
                style["color"] = {
                    "r": rgb[0],
                    "g": rgb[1],
                    "b": rgb[2]
                }
                
        # Get highlight color
        if hasattr(font, 'highlight_color'):
            style["highlight_color"] = font.highlight_color
            
    return style

def style_differs_from_base(run_style, base_style):
    """Compare run style with base paragraph style to detect differences"""
    if not base_style:
        return []
        
    differences = []
    
    # Compare each attribute
    if run_style.get("bold") is not None and run_style["bold"] != base_style.get("bold"):
        differences.append("bold")
    if run_style.get("italic") is not None and run_style["italic"] != base_style.get("italic"):
        differences.append("italic")
    if run_style.get("font") and run_style["font"] != base_style.get("font"):
        differences.append("font")
    if run_style.get("size") and run_style["size"] != base_style.get("size"):
        differences.append("size")
    if run_style.get("color") != base_style.get("color"):
        differences.append("color")
    if run_style.get("highlight_color"):
        differences.append("highlight_color")
    if run_style.get("underline"):
        differences.append("underline")
        
    return differences

def get_styled_text(paragraph):
    """Extract text with styles different from the base paragraph style"""
    base_style = get_paragraph_style(paragraph)
    styled_elements = []
    
    for run in paragraph.runs:
        if not run.text.strip():
            continue
            
        run_style = get_run_style(run)
        style_differences = style_differs_from_base(run_style, base_style)
        
        if style_differences:
            styled_element = {
                "text": run.text,
                "style_differences": style_differences,
                "styles": {}
            }
            
            # Include the actual style values that differ
            for diff in style_differences:
                if diff == "color" and run_style.get("color"):
                    styled_element["styles"]["color"] = run_style["color"]
                elif diff == "highlight_color":
                    styled_element["styles"]["highlight_color"] = run_style["highlight_color"]
                else:
                    styled_element["styles"][diff] = run_style.get(diff)
                    
            styled_elements.append(styled_element)
            
    return styled_elements

def extract_docx_content_python_docx(input_file):
    """Extract content from a docx file including headers, paragraphs, and differently styled text."""
    doc = Document(input_file)
    document_structure = []
    current_header = None
    
    for paragraph in doc.paragraphs:
        # Get styled text elements
        styled_elements = get_styled_text(paragraph)
        
        if paragraph.style.name.startswith('Heading'):
            level = int(paragraph.style.name.split()[-1])
            number = get_paragraph_number(paragraph)
            
            current_header = {
                "text": paragraph.text,
                "level": level,
                "style": paragraph.style.name,
                "number": number,
                "styled_elements": styled_elements
            }
            document_structure.append({
                "header": current_header,
                "paragraphs": [],
                "styled_text": []
            })
        elif paragraph.text.strip():
            if document_structure:
                document_structure[-1]["paragraphs"].append(paragraph.text)
                if styled_elements:
                    document_structure[-1]["styled_text"].append({
                        "text": paragraph.text,
                        "styled_elements": styled_elements
                    })
            else:
                document_structure.append({
                    "header": None,
                    "paragraphs": [paragraph.text],
                    "styled_text": [{
                        "text": paragraph.text,
                        "styled_elements": styled_elements
                    }] if styled_elements else []
                })
    
    # Extract metadata
    metadata = {
        "author": doc.core_properties.author,
        "created": doc.core_properties.created.isoformat() if doc.core_properties.created else None,
        "modified": doc.core_properties.modified.isoformat() if doc.core_properties.modified else None,
        "title": doc.core_properties.title,
    }
    
    return document_structure, metadata

def print_styled_text_summary(structure):
    """Print a summary of all text with different styling in the document"""
    print("\nStyled Text Summary")
    print("==================\n")
    
    for section in structure:
        if section["header"]:
            print(f"Section: {section['header']['text']}")
            
            # Print styled elements in header if any
            if section["header"].get("styled_elements"):
                print("  Header styled elements:")
                for elem in section["header"]["styled_elements"]:
                    print(f"    • \"{elem['text']}\"")
                    print(f"      Style differences: {', '.join(elem['style_differences'])}")
                    for style_name, style_value in elem['styles'].items():
                        print(f"      - {style_name}: {style_value}")
        
        # Print styled elements in paragraphs
        for styled_text in section["styled_text"]:
            for elem in styled_text["styled_elements"]:
                print(f"  • \"{elem['text']}\"")
                print(f"    Style differences: {', '.join(elem['style_differences'])}")
                for style_name, style_value in elem['styles'].items():
                    print(f"    - {style_name}: {style_value}")
        
        print()  # Add blank line between sections

In [12]:
input_file = "repports/financial_stament_review2.docx"
structure, metadata = extract_docx_content_python_docx(input_file)
print_styled_text_summary(structure)


Styled Text Summary

  • "Financial Statement Review: "
    Style differences: bold
    - bold: True
  • "Financial Statements Tutorial"
    Style differences: bold, underline
    - bold: True
    - underline: True
  • "TEST"
    Style differences: bold
    - bold: True

Section: There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) - 
  Header styled elements:
    • "There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) - "
      Style differences: color
      - color: None
  • "Balance Sheet  (assets, liabilities, and Shareholder’s equity) "
    Style differences: highlight_color
    - highlight_color: YELLOW (7)
  • ""
    Style differences: font
    - font: Wingdings

Section: Balance Sheet 
  Header styled elements:
    • "Balance Sheet "
      Style differences: color
      - color: None
  • ""
    Style differences: font
 

In [13]:
from docx import Document
import json
import re

# ... (keep all the previous helper functions the same: get_paragraph_number, get_paragraph_style, get_run_style, style_differs_from_base, get_styled_text) ...

def format_table_of_contents(structure):
    """
    Format table of contents with styled elements as a string.
    Returns a formatted string instead of printing directly.
    """
    output = []
    output.append("Table of Contents")
    output.append("================\n")
    
    def format_header(header, indent_level=0):
        """Helper function to format header with proper indentation"""
        indent = "    " * indent_level
        header_text = header["text"].strip()
        if header.get("number"):
            header_text = f"{header['number']} {header_text}"
        return f"{indent}{header_text}"
    
    for section in structure:
        if section.get("header"):
            # Add header text
            header = section["header"]
            output.append(format_header(header, 0))
            
            # Add styled elements if present
            if section.get("styled_text"):
                output.append("    └── Formatted elements:")
                for styled_text in section["styled_text"]:
                    for elem in styled_text["styled_elements"]:
                        output.append(f"        • \"{elem['text']}\"")
            elif section["header"].get("styled_elements"):
                output.append("    └── Formatted elements:")
                for elem in section["header"]["styled_elements"]:
                    output.append(f"        • \"{elem['text']}\"")
            
            output.append("")  # Add blank line for readability
            
        elif section.get("styled_text"):
            # Handle styled text in non-header sections
            output.append("    └── Formatted elements:")
            for styled_text in section["styled_text"]:
                for elem in styled_text["styled_elements"]:
                    output.append(f"        • \"{elem['text']}\"")
            output.append("")
    
    return "\n".join(output)

def extract_docx_content_python_docx(input_file):
    """Extract content from a docx file and return both the structure and a formatted string output."""
    doc = Document(input_file)
    document_structure = []
    current_header = None
    
    for paragraph in doc.paragraphs:
        # Get styled text elements
        styled_elements = get_styled_text(paragraph)
        
        if paragraph.style.name.startswith('Heading'):
            level = int(paragraph.style.name.split()[-1])
            number = get_paragraph_number(paragraph)
            
            current_header = {
                "text": paragraph.text,
                "level": level,
                "style": paragraph.style.name,
                "number": number,
                "styled_elements": styled_elements
            }
            document_structure.append({
                "header": current_header,
                "paragraphs": [],
                "styled_text": []
            })
        elif paragraph.text.strip():
            if document_structure:
                document_structure[-1]["paragraphs"].append(paragraph.text)
                if styled_elements:
                    document_structure[-1]["styled_text"].append({
                        "text": paragraph.text,
                        "styled_elements": styled_elements
                    })
            else:
                document_structure.append({
                    "header": None,
                    "paragraphs": [paragraph.text],
                    "styled_text": [{
                        "text": paragraph.text,
                        "styled_elements": styled_elements
                    }] if styled_elements else []
                })
    
    # Extract metadata
    metadata = {
        "author": doc.core_properties.author,
        "created": doc.core_properties.created.isoformat() if doc.core_properties.created else None,
        "modified": doc.core_properties.modified.isoformat() if doc.core_properties.modified else None,
        "title": doc.core_properties.title,
    }
    
    # Format the output string
    formatted_output = format_table_of_contents(document_structure)
    
    return document_structure, metadata, formatted_output

In [15]:
input_file = "repports/financial_stament_review2.docx"
structure, metadata, formatted_output = extract_docx_content_python_docx(input_file)

# Print the formatted output
print("Formatted Output:")
print(formatted_output)

# Or save it to a file
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(formatted_output)

Formatted Output:
Table of Contents

    └── Formatted elements:
        • "Financial Statement Review: "
        • "Financial Statements Tutorial"
        • "TEST"

There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) -
    └── Formatted elements:
        • "Balance Sheet  (assets, liabilities, and Shareholder’s equity) "
        • ""

Balance Sheet
    └── Formatted elements:
        • ""

Income Statement
    └── Formatted elements:
        • "-Cost of Goods Sold"
        • "-Operating Expenses"
        • "+/- Miscellaneous"
        • "-Income Tax Expense"

Statement of Changes in Shareholder’s Equity
    └── Formatted elements:
        • "Plus Net Income or Minus Net Loss"

Statement of Cash Flows
    └── Formatted elements:
        • "cash"
        • "cash"
        • ""
        • "Preparation of Financial Statements: "
        • "Example 1: "

Bowser blue Incorporated
    └── Formatted elements:
    

In [6]:
input_file = "repports/financial_stament_review.docx"
structure, metadata = extract_docx_content_python_docx(input_file)

# Print formatted JSON structure
# print("Document Structure:")
# print(json.dumps(structure, indent=2))

# print("\nMetadata:")
# print(json.dumps(metadata, indent=2))

print("\nFormatted Output:")
print_table_of_contents_with_metadata(structure)

# print_toc_compact(structure)


Formatted Output:

Table of Contents

There are four major financial statements used to communicate information to external users (creditors, investors, suppliers, etc.) -
    └── Formatted elements:
        • ""

Balance Sheet
    └── Formatted elements:
        • ""

Income Statement
    └── Formatted elements:
        • "-Cost of Goods Sold"
        • "-Operating Expenses"
        • "+/- Miscellaneous"
        • "-Income Tax Expense"

Statement of Changes in Shareholder’s Equity
    └── Formatted elements:
        • "Plus Net Income or Minus Net Loss"

Statement of Cash Flows
    └── Formatted elements:
        • "cash"
        • "cash"
        • ""
        • "Preparation of Financial Statements:"
        • "Example 1:"
        • "Bowser Incorporated"
        • "Income Statement"
        • "For the year ending December 31, 2009"
        • "Net Income before taxes"
        • "21,200 Income tax expense"
        • "6360"
        • "Net Income"
        • "14,840"
        • "Bowser I

## Combine runs that are part of the same logical text unit

In [7]:
def get_text_formatting(paragraph):
    """
    Extract formatted elements from a paragraph, combining adjacent runs with same formatting.
    Returns a list of formatted text elements with their styles.
    """
    formatted_elements = []
    current_format = None
    current_text = []
    
    def format_matches(fmt1, fmt2):
        """Compare two format dictionaries to check if they have the same styling"""
        if not fmt1 or not fmt2:
            return False
        
        # Compare basic formatting
        basic_keys = ['bold', 'italic', 'underline']
        if any(fmt1.get(key) != fmt2.get(key) for key in basic_keys):
            return False
            
        # Compare colors
        color1 = fmt1.get('color', {})
        color2 = fmt2.get('color', {})
        if color1 != color2:
            return False
            
        # Compare font and size
        if fmt1.get('font') != fmt2.get('font'):
            return False
        if fmt1.get('size') != fmt2.get('size'):
            return False
            
        return True

    def create_format_dict(run):
        """Create a formatting dictionary for a run"""
        formatting = {
            "bold": run.bold if hasattr(run, 'bold') else None,
            "italic": run.italic if hasattr(run, 'italic') else None,
            "underline": run.underline if hasattr(run, 'underline') else None
        }
        
        # Get color
        try:
            if run.font.color and run.font.color.rgb:
                rgb = run.font.color.rgb
                formatting["color"] = {
                    "r": rgb[0] if rgb else None,
                    "g": rgb[1] if rgb else None,
                    "b": rgb[2] if rgb else None
                }
        except AttributeError:
            pass
            
        # Get font name
        if hasattr(run.font, 'name') and run.font.name:
            formatting["font"] = run.font.name
            
        # Get font size
        try:
            if run.font.size:
                formatting["size"] = run.font.size.pt
        except AttributeError:
            pass
            
        return formatting
    
    def append_current_element():
        """Helper function to append the current element to formatted_elements"""
        if current_text and current_format:
            # Check if there's actual formatting
            has_formatting = any(v for k, v in current_format.items() 
                               if k != "text" and v is not None)
            if has_formatting:
                formatted_element = current_format.copy()
                formatted_element["text"] = "".join(current_text).strip()
                if formatted_element["text"]:  # Only add if there's text
                    formatted_elements.append(formatted_element)
    
    # Process each run in the paragraph
    for run in paragraph.runs:
        if not run.text.strip():
            continue
            
        # Get formatting for current run
        new_format = create_format_dict(run)
        
        # Check if this run should be combined with previous runs
        if format_matches(current_format, new_format):
            # Same formatting, append text
            current_text.append(run.text)
        else:
            # Different formatting, create new element
            append_current_element()
            current_format = new_format
            current_text = [run.text]
    
    # Add the last element
    append_current_element()
    
    return formatted_elements

# Example usage remains the same
# The rest of the code remains unchanged

In [8]:
input_file = "embedded_docx_embedded_1.docx"
structure, metadata = extract_docx_content_python_docx(input_file)

# Print formatted JSON structure
# print("Document Structure:")
# print(json.dumps(structure, indent=2))

# print("\nMetadata:")
# print(json.dumps(metadata, indent=2))

print("\nFormatted Output:")
print_table_of_contents_with_metadata(structure)


Formatted Output:

Table of Contents

INTRODUCTION
    └── Formatted elements:
        • "Thales AVS MIS needs the support of the Thales Digital Factory especially the Professional Service team to explore using TrustNest Digital Platform and support AVS team in two POCs described below. This commercial proposal will describe the support provided during the “Build” phase. “Run” costs related to TDP usages are not included in this commercial proposal and should be estimated by You must also estimate your Azure Consumptions due to the usage. Please find enclosed to this commercial proposal, the Billing User Guide."
        • "The objectives of the PoC are:"
        • "Run POCs on the largest VM sizes to measure performance gains"
        • "Explore optimization possibilities and areas for improvement using the algorithms provided"
        • "The PoC approach are the following:"
        • "PoC #1 : Simulation Monte Carlo (CPUneeds) – ACP Project"
        • "Measure performance improvement