In [1]:
import logging
from pathlib import Path
import json

# -------------------[ Docling Imports ]-------------------
# We only need the DoclingDocument class to load the data,
# and ImageRefMode to specify the output format.
try:
    from docling_core.types.doc.document import DoclingDocument
    # ImageRefMode should be imported from docling_core.types.doc.base
    from docling_core.types.doc.base import ImageRefMode
except ImportError as e:
    print(f"ERROR: Could not import core Docling classes: {e}")
    print("Please ensure 'docling' and its dependencies are correctly installed.")
    print("Try running: pip install --upgrade docling docling-core")
    exit()

# -------------------[ Configuration ]-------------------
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# -------------------[ Main Conversion Function ]-------------------
def convert_json_to_html(json_path_str: str):
    """
    Loads a DoclingDocument from a JSON file and exports it as a
    self-contained HTML file with embedded images.

    Args:
        json_path_str (str): The path to the input .json file.
    """
    json_path = Path(json_path_str)
    if not json_path.is_file():
        logging.error(f"❌ JSON file not found: {json_path}")
        return

    # --- Step 1: Load the DoclingDocument from the JSON file ---
    logging.info(f"📄 Loading document from: {json_path.name}")
    try:
        # Read the JSON file as text
        with open(json_path, "r", encoding="utf-8") as f:
            json_str = f.read()
        # Use model_validate_json to load the document from JSON string (Pydantic v2+)
        document = DoclingDocument.model_validate_json(json_str)
        logging.info("✅ Document loaded successfully.")
    except Exception as e:
        logging.error(f"A critical error occurred while loading the JSON file: {e}", exc_info=True)
        return

    # --- Step 2: Save the Document as a Self-Contained HTML File ---
    # Define the output path for the new HTML file
    output_html_path = json_path.with_suffix(".html")
    
    logging.info(f"💾 Saving to HTML with embedded images...")
    try:
        # Use ImageRefMode.EMBEDDED to create a single, portable HTML file
        document.save_as_html(
            output_html_path,
            image_mode=ImageRefMode.EMBEDDED
        )
        logging.info(f"🎉 Success! HTML file generated at: {output_html_path.resolve()}")
    except Exception as e:
        logging.error(f"Failed to save document as HTML: {e}", exc_info=True)
        return

# -------------------[ Script Execution ]-------------------
if __name__ == "__main__":
    # This should be the JSON file generated by the previous script.
    # It will likely be in the 'output_smoldocling' folder.
    JSON_FILE_TO_PROCESS = r"C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\output_lmstudio_conversion\1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.json"
    
    convert_json_to_html(JSON_FILE_TO_PROCESS)


2025-07-29 01:19:03,033 - INFO - 📄 Loading document from: 1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.json
2025-07-29 01:19:03,092 - INFO - ✅ Document loaded successfully.
2025-07-29 01:19:03,092 - INFO - 💾 Saving to HTML with embedded images...
2025-07-29 01:19:03,201 - INFO - 🎉 Success! HTML file generated at: C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\output_lmstudio_conversion\1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.html


In [32]:
import logging
from pathlib import Path
import json

# -------------------[ Docling Imports ]-------------------
try:
    from docling_core.types.doc.document import DoclingDocument
    from docling_core.types.doc.base import ImageRefMode
except ImportError as e:
    print(f"ERROR: Could not import core Docling classes: {e}")
    print("Please ensure 'docling' and its dependencies are correctly installed.")
    print("Try running: pip install --upgrade docling docling-core")
    exit()

# -------------------[ Configuration ]-------------------
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_valid_docling_json(json_str):
    """
    Try to validate the JSON string as a DoclingDocument.
    Returns the document if valid, otherwise None.
    """
    try:
        document = DoclingDocument.model_validate_json(json_str)
        return document
    except Exception:
        return None

def clean_json_for_docling(json_str: str):
    """
    Clean the JSON to make it compatible with DoclingDocument format.
    Removes AI-added fields and attempts to fix broken references that might cause validation errors.
    """
    try:
        data = json.loads(json_str)

        # Remove metadata that might cause issues
        metadata_keys_to_remove = [
            'step1_metadata', 
            'nlp_ready_metadata', 
            'enhancement_metadata'
        ]
        for key in metadata_keys_to_remove:
            if key in data:
                del data[key]
                logging.info(f"   🗑️ Removed {key} for compatibility")

        # Clean pictures array - remove AI analysis fields but keep structure
        if 'pictures' in data and isinstance(data['pictures'], list):
            cleaned_pictures = []
            for pic in data['pictures']:
                cleaned_pic = {}

                # Keep essential fields, remove AI analysis
                essential_fields = ['self_ref', 'captions', 'prov', 'bbox']
                for field in essential_fields:
                    if field in pic:
                        cleaned_pic[field] = pic[field]

                # Keep image data if it exists (for original files)
                if 'image' in pic:
                    cleaned_pic['image'] = pic['image']

                # For NLP-ready files, we might want to add a text representation
                # where the image used to be
                if 'ai_analysis' in pic and 'image' not in pic:
                    # Create a text placeholder for where the image was
                    description = pic['ai_analysis'].get('description') or pic['ai_analysis'].get('detailed_description', 'Image description not available')
                    # Add as a caption or text element
                    if 'captions' not in cleaned_pic:
                        cleaned_pic['captions'] = []
                    cleaned_pic['captions'].append({
                        'text': f"[AI Description: {description}]",
                        'type': 'ai_generated'
                    })

                cleaned_pictures.append(cleaned_pic)

            data['pictures'] = cleaned_pictures
            logging.info(f"   🧹 Cleaned {len(cleaned_pictures)} pictures for compatibility")

        # Attempt to fix broken references in 'body' and 'furniture' if present
        # This is a workaround for "list index out of range" errors in DoclingDocument validation
        for section in ['body', 'furniture']:
            if section in data and isinstance(data[section], list):
                # Remove any references that are out of bounds
                # This is a naive fix: remove any child_ref with invalid index
                def filter_valid_refs(nodes, parent_path=""):
                    valid_nodes = []
                    for idx, node in enumerate(nodes):
                        # If node has 'children', recursively filter them
                        if isinstance(node, dict) and 'children' in node and isinstance(node['children'], list):
                            node['children'] = filter_valid_refs(node['children'], parent_path + f"/{section}[{idx}]")
                        # If node has a 'ref' or similar, check if it is valid (skip for now)
                        valid_nodes.append(node)
                    return valid_nodes
                data[section] = filter_valid_refs(data[section])

        return json.dumps(data, indent=2, ensure_ascii=False)

    except Exception as e:
        logging.error(f"Failed to clean JSON: {e}")
        return json_str  # Return original if cleaning fails

# -------------------[ Main Conversion Function ]-------------------
def convert_json_to_html(json_path_str: str):
    """
    Loads a DoclingDocument from a JSON file and exports it as a
    self-contained HTML file. Handles both original and AI-enhanced JSON files.
    Tries to clean the JSON and fix broken references if validation fails.
    """
    json_path = Path(json_path_str)
    if not json_path.is_file():
        logging.error(f"❌ JSON file not found: {json_path}")
        return

    # --- Step 1: Load and clean the JSON if necessary ---
    logging.info(f"📄 Loading document from: {json_path.name}")
    try:
        # Read the JSON file as text
        with open(json_path, "r", encoding="utf-8") as f:
            json_str = f.read()

        # Try loading directly first
        document = is_valid_docling_json(json_str)
        if document:
            logging.info("✅ Document loaded successfully (original format).")
        else:
            logging.info("⚠️ Original format failed, trying to clean JSON for compatibility...")
            # Clean the JSON to make it compatible with DoclingDocument
            cleaned_json_str = clean_json_for_docling(json_str)
            document = is_valid_docling_json(cleaned_json_str)
            if document:
                logging.info("✅ Document loaded successfully (cleaned format).")
            else:
                logging.error("❌ Failed to load document after cleaning JSON. The file may have broken references or unsupported structure.")
                return

    except Exception as e:
        logging.error(f"A critical error occurred while loading the JSON file: {e}", exc_info=True)
        return

    # --- Step 2: Save the Document as a Self-Contained HTML File ---
    # Define the output path for the new HTML file
    output_html_path = json_path.with_suffix(".html")

    logging.info(f"💾 Saving to HTML...")
    try:
        # Use ImageRefMode.EMBEDDED to create a single, portable HTML file
        # For NLP-ready files without images, this will show the text content
        document.save_as_html(
            output_html_path,
            image_mode=ImageRefMode.EMBEDDED
        )
        logging.info(f"🎉 Success! HTML file generated at: {output_html_path.resolve()}")
    except Exception as e:
        logging.error(f"Failed to save document as HTML: {e}", exc_info=True)
        return

# -------------------[ Alternative Simple HTML Generator ]-------------------
def convert_json_to_simple_html(json_path_str: str):
    """
    Fallback method: Generate simple HTML directly from JSON content.
    Use this if Docling method fails completely.
    """
    json_path = Path(json_path_str)
    if not json_path.is_file():
        logging.error(f"❌ JSON file not found: {json_path}")
        return

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Extract main text
        main_text = data.get('main_text', '')

        # Simple HTML template
        html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document Content</title>
    <style>
        body {{
            font-family: 'Times New Roman', serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 40px;
            line-height: 1.6;
            background-color: white;
        }}
        .content {{
            white-space: pre-wrap;
            text-align: justify;
        }}
        .ai-description {{
            background-color: #f0f8ff;
            border-left: 4px solid #007bff;
            padding: 15px;
            margin: 20px 0;
            font-style: italic;
        }}
    </style>
</head>
<body>
    <div class="content">{main_text}</div>
    
    {generate_ai_descriptions(data.get('pictures', []))}
    
</body>
</html>
"""

        # Save HTML
        output_html_path = json_path.with_suffix("_simple.html")
        with open(output_html_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        logging.info(f"✅ Simple HTML generated: {output_html_path.resolve()}")

    except Exception as e:
        logging.error(f"Failed to generate simple HTML: {e}")

def generate_ai_descriptions(pictures):
    """Generate HTML for AI descriptions of images."""
    if not pictures:
        return ""

    descriptions_html = ""
    for i, pic in enumerate(pictures, 1):
        if 'ai_analysis' in pic:
            description = pic['ai_analysis'].get('description') or pic['ai_analysis'].get('detailed_description', 'No description available')
            descriptions_html += f"""
            <div class="ai-description">
                <strong>Figure {i} Description:</strong><br>
                {description}
            </div>
            """

    return descriptions_html

# -------------------[ Script Execution ]-------------------
if __name__ == "__main__":
    # Specify the JSON file to process
    JSON_FILE_TO_PROCESS = r"C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\output_lmstudio_conversion\1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf_nlp_ready.json"

    # Try the main Docling method
    try:
        convert_json_to_html(JSON_FILE_TO_PROCESS)
    except Exception as e:
        logging.error(f"Main conversion failed: {e}")
        logging.info("Trying fallback method...")
        convert_json_to_simple_html(JSON_FILE_TO_PROCESS)

2025-07-21 23:03:24,612 - INFO - 📄 Loading document from: 1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf_nlp_ready.json
2025-07-21 23:03:24,623 - INFO - ⚠️ Original format failed, trying to clean JSON for compatibility...
2025-07-21 23:03:24,626 - INFO -    🗑️ Removed nlp_ready_metadata for compatibility
2025-07-21 23:03:24,646 - ERROR - ❌ Failed to load document after cleaning JSON. The file may have broken references or unsupported structure.
