In [None]:
import logging
from pathlib import Path

# -------------------[ Docling Imports ]-------------------
# We only need the DoclingDocument class to load the data,
# and ImageRefMode to specify how images are referenced in the output.
try:
    from docling_core.types.doc.document import DoclingDocument
    from docling_core.types.doc import ImageRefMode
except ImportError as e:
    print(f"ERROR: Could not import core Docling classes: {e}")
    print("Please ensure 'docling' and its dependencies are correctly installed.")
    print("Try running: pip install --upgrade docling docling-core")
    exit()

# -------------------[ Configuration ]-------------------
# Configure the root logger once: INFO and above, with each record rendered
# as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# -------------------[ Main Conversion Function ]-------------------
def convert_json_to_html(json_path_str: str):
    """
    Load a DoclingDocument from a JSON file and export it as a
    self-contained HTML file with embedded images.

    The HTML file is written next to the input file, using the same name
    with the suffix replaced by ``.html``. Problems (missing input, load
    failure, save failure) are logged and the function returns early; no
    exception is propagated to the caller.

    Args:
        json_path_str (str): The path to the input .json file.

    Returns:
        None
    """
    json_path = Path(json_path_str)
    if not json_path.is_file():
        logging.error(f"❌ JSON file not found: {json_path}")
        return

    # --- Step 1: Load the DoclingDocument from the JSON file ---
    logging.info(f"📄 Loading document from: {json_path.name}")
    try:
        # `parse_file` is deprecated since Pydantic V2 (removed in V3), so
        # read the file ourselves and validate the JSON text explicitly.
        document = DoclingDocument.model_validate_json(
            json_path.read_text(encoding="utf-8")
        )
    except Exception as e:
        # Broad catch is deliberate: this is the script's top-level boundary
        # and any read/validation failure should be reported, not raised.
        logging.exception(f"A critical error occurred while loading the JSON file: {e}")
        return
    logging.info("✅ Document loaded successfully.")

    # --- Step 2: Save the Document as a Self-Contained HTML File ---
    # The output sits beside the input: same stem, ".html" suffix.
    output_html_path = json_path.with_suffix(".html")

    logging.info("💾 Saving to HTML with embedded images...")
    try:
        # Use ImageRefMode.EMBEDDED to create a single, portable HTML file
        document.save_as_html(
            output_html_path,
            image_mode=ImageRefMode.EMBEDDED
        )
    except Exception as e:
        logging.exception(f"Failed to save document as HTML: {e}")
        return
    logging.info(f"🎉 Success! HTML file generated at: {output_html_path.resolve()}")

# -------------------[ Script Execution ]-------------------
if __name__ == "__main__":
    # Entry point: convert a single, hard-coded Docling JSON export to HTML.
    #
    # This should be the JSON file generated by the previous script.
    # NOTE(review): the path below points into the 'output_lmstudio_conversion'
    # folder; edit it to match wherever your JSON export was actually written.
    JSON_FILE_TO_PROCESS = r"C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\output_lmstudio_conversion\1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.json"
    
    convert_json_to_html(JSON_FILE_TO_PROCESS)


2025-07-14 19:32:19,643 - INFO - 📄 Loading document from: 1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.json
C:\Users\Hongyu\AppData\Local\Temp\ipykernel_22500\847600398.py:37: PydanticDeprecatedSince20: The `parse_file` method is deprecated; load the data from file, then if your data is JSON use `model_validate_json`, otherwise `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  document = DoclingDocument.parse_file(json_path)
2025-07-14 19:32:19,700 - INFO - ✅ Document loaded successfully.
2025-07-14 19:32:19,700 - INFO - 💾 Saving to HTML with embedded images...
2025-07-14 19:32:19,803 - INFO - 🎉 Success! HTML file generated at: C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\output_lmstudio_conversion\1-s2.0-S1385110124000054-main-google_gemma-3-12b-it-gguf.html
