nbconvert/exporters/webpdf.py

"""Export to PDF via a headless browser"""

# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.

import asyncio
import concurrent.futures
import os
import tempfile
from importlib import util as importlib_util

from traitlets import Bool, default

from .html import HTMLExporter

PYPPETEER_INSTALLED = importlib_util.find_spec("pyppeteer") is not None


class WebPDFExporter(HTMLExporter):
    """Writer designed to write to PDF files.

    This inherits from :class:`HTMLExporter`. It creates the HTML using the
    template machinery, and then run pyppeteer to create a pdf.
    """

    export_from_notebook = "PDF via HTML"

    allow_chromium_download = Bool(
        False,
        help="Whether to allow downloading Chromium if no suitable version is found on the system.",
    ).tag(config=True)

    paginate = Bool(
        True,
        help="""
        Split generated notebook into multiple pages.

        If False, a PDF with one long page will be generated.

        Set to True to match behavior of LaTeX based PDF generator
        """,
    ).tag(config=True)

    @default("file_extension")
    def _file_extension_default(self):
        return ".html"

    @default("template_name")
    def _template_name_default(self):
        return "webpdf"

    disable_sandbox = Bool(
        False,
        help="""
        Disable chromium security sandbox when converting to PDF.

        WARNING: This could cause arbitrary code execution in specific circumstances,
        where JS in your notebook can execute serverside code! Please use with
        caution.

        ``https://github.com/puppeteer/puppeteer/blob/main@%7B2020-12-14T17:22:24Z%7D/docs/troubleshooting.md#setting-up-chrome-linux-sandbox``
        has more information.

        This is required for webpdf to work inside most container environments.
        """,
    ).tag(config=True)

    def _check_launch_reqs(self):
        try:
            from pyppeteer import launch  # type: ignore[import]
            from pyppeteer.util import check_chromium  # type:ignore
        except ModuleNotFoundError as e:
            msg = (
                "Pyppeteer is not installed to support Web PDF conversion. "
                "Please install `nbconvert[webpdf]` to enable."
            )
            raise RuntimeError(msg) from e
        if not self.allow_chromium_download and not check_chromium():
            msg = (
                "No suitable chromium executable found on the system. "
                "Please use '--allow-chromium-download' to allow downloading one."
            )
            raise RuntimeError(msg)
        return launch

    def run_pyppeteer(self, html):
        """Run pyppeteer."""

        async def main(temp_file):
            """Run main pyppeteer script."""
            args = ["--no-sandbox"] if self.disable_sandbox else []
            browser = await self._check_launch_reqs()(
                handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False, args=args
            )
            page = await browser.newPage()
            await page.emulateMedia("print")
            await page.waitFor(100)
            await page.goto(f"file://{temp_file.name}", waitUntil="networkidle0")
            await page.waitFor(100)

            pdf_params = {"printBackground": True}
            if not self.paginate:
                # Floating point precision errors cause the printed
                # PDF from spilling over a new page by a pixel fraction.
                dimensions = await page.evaluate(
                    """() => {
                    const rect = document.body.getBoundingClientRect();
                    return {
                    width: Math.ceil(rect.width) + 1,
                    height: Math.ceil(rect.height) + 1,
                    }
                }"""
                )
                width = dimensions["width"]
                height = dimensions["height"]
                # 200 inches is the maximum size for Adobe Acrobat Reader.
                pdf_params.update(
                    {
                        "width": min(width, 200 * 72),
                        "height": min(height, 200 * 72),
                    }
                )
            pdf_data = await page.pdf(pdf_params)

            await browser.close()
            return pdf_data

        pool = concurrent.futures.ThreadPoolExecutor()
        # Create a temporary file to pass the HTML code to Chromium:
        # Unfortunately, tempfile on Windows does not allow for an already open
        # file to be opened by a separate process. So we must close it first
        # before calling Chromium. We also specify delete=False to ensure the
        # file is not deleted after closing (the default behavior).
        temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        try:
            # TODO: when dropping Python 3.6, use
            # pdf_data = pool.submit(asyncio.run, main(temp_file)).result()
            def run_coroutine(coro):
                """Run an internal coroutine."""
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                return loop.run_until_complete(coro)

            pdf_data = pool.submit(run_coroutine, main(temp_file)).result()
        finally:
            # Ensure the file is deleted even if pypeteer raises an exception
            os.unlink(temp_file.name)
        return pdf_data

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert from a notebook node."""
        self._check_launch_reqs()
        html, resources = super().from_notebook_node(nb, resources=resources, **kw)

        self.log.info("Building PDF")
        pdf_data = self.run_pyppeteer(html)
        self.log.info("PDF successfully created")

        # convert output extension to pdf
        # the writer above required it to be html
        resources["output_extension"] = ".pdf"

        return pdf_data, resources