-
Notifications
You must be signed in to change notification settings - Fork 557
/
webpdf.py
165 lines (135 loc) · 5.93 KB
/
webpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""Export to PDF via a headless browser"""
# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.
import asyncio
import concurrent.futures
import os
import tempfile
from importlib import util as importlib_util
from traitlets import Bool, default
from .html import HTMLExporter
# True when the import system can locate the optional ``pyppeteer`` package.
PYPPETEER_INSTALLED = bool(importlib_util.find_spec("pyppeteer"))
class WebPDFExporter(HTMLExporter):
    """Writer designed to write to PDF files.

    This inherits from :class:`HTMLExporter`. It creates the HTML using the
    template machinery, and then runs pyppeteer to create a PDF.
    """

    export_from_notebook = "PDF via HTML"

    allow_chromium_download = Bool(
        False,
        help="Whether to allow downloading Chromium if no suitable version is found on the system.",
    ).tag(config=True)

    # The multi-line help texts are spelled as adjacent string literals so that
    # their content does not depend on how this file is indented.
    paginate = Bool(
        True,
        help=(
            "\n"
            "Split generated notebook into multiple pages.\n"
            "If False, a PDF with one long page will be generated.\n"
            "Set to True to match behavior of LaTeX based PDF generator\n"
        ),
    ).tag(config=True)

    @default("file_extension")
    def _file_extension_default(self):
        # The extension stays ".html" here; from_notebook_node switches the
        # output extension to ".pdf" once the PDF has been produced.
        return ".html"

    @default("template_name")
    def _template_name_default(self):
        return "webpdf"

    disable_sandbox = Bool(
        False,
        help=(
            "\n"
            "Disable chromium security sandbox when converting to PDF.\n"
            "WARNING: This could cause arbitrary code execution in specific circumstances,\n"
            "where JS in your notebook can execute serverside code! Please use with\n"
            "caution.\n"
            "``https://github.com/puppeteer/puppeteer/blob/main@%7B2020-12-14T17:22:24Z%7D/docs/troubleshooting.md#setting-up-chrome-linux-sandbox``\n"
            "has more information.\n"
            "This is required for webpdf to work inside most container environments.\n"
        ),
    ).tag(config=True)

    def _check_launch_reqs(self):
        """Verify that pyppeteer and a usable Chromium are available.

        Returns
        -------
        The ``pyppeteer.launch`` callable.

        Raises
        ------
        RuntimeError
            If pyppeteer cannot be imported, or if ``check_chromium()`` reports
            no Chromium and ``allow_chromium_download`` is False.
        """
        try:
            from pyppeteer import launch  # type: ignore[import]
            from pyppeteer.util import check_chromium  # type:ignore
        except ModuleNotFoundError as e:
            msg = (
                "Pyppeteer is not installed to support Web PDF conversion. "
                "Please install `nbconvert[webpdf]` to enable."
            )
            raise RuntimeError(msg) from e
        if not self.allow_chromium_download and not check_chromium():
            msg = (
                "No suitable chromium executable found on the system. "
                "Please use '--allow-chromium-download' to allow downloading one."
            )
            raise RuntimeError(msg)
        return launch

    def run_pyppeteer(self, html):
        """Render an HTML document to PDF with pyppeteer.

        The HTML is written to a temporary ``.html`` file, loaded in a headless
        browser through a ``file://`` URL, and printed to PDF. The temporary
        file is removed and the browser is closed whether or not the
        conversion succeeds.

        Parameters
        ----------
        html : str
            The HTML document to convert (encoded as UTF-8 on disk).

        Returns
        -------
        The value returned by pyppeteer's ``page.pdf`` (presumably the PDF
        content as bytes -- confirm against the pyppeteer documentation).

        Raises
        ------
        RuntimeError
            Propagated from ``_check_launch_reqs`` when pyppeteer or Chromium
            is unavailable.
        """

        async def main(temp_file):
            """Run main pyppeteer script."""
            args = ["--no-sandbox"] if self.disable_sandbox else []
            browser = await self._check_launch_reqs()(
                handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False, args=args
            )
            try:
                page = await browser.newPage()
                await page.emulateMedia("print")
                await page.waitFor(100)
                await page.goto(f"file://{temp_file.name}", waitUntil="networkidle0")
                await page.waitFor(100)

                pdf_params = {"printBackground": True}
                if not self.paginate:
                    # Floating point precision errors can cause the printed
                    # PDF to spill over onto a new page by a pixel fraction,
                    # hence the extra pixel added to each dimension.
                    dimensions = await page.evaluate(
                        "() => {\n"
                        "const rect = document.body.getBoundingClientRect();\n"
                        "return {\n"
                        "width: Math.ceil(rect.width) + 1,\n"
                        "height: Math.ceil(rect.height) + 1,\n"
                        "}\n"
                        "}"
                    )
                    width = dimensions["width"]
                    height = dimensions["height"]
                    # 200 inches is the maximum size for Adobe Acrobat Reader.
                    pdf_params.update(
                        {
                            "width": min(width, 200 * 72),
                            "height": min(height, 200 * 72),
                        }
                    )
                return await page.pdf(pdf_params)
            finally:
                # Close the browser even when a page operation fails, so a
                # failed conversion does not leave a Chromium process behind.
                await browser.close()

        def run_coroutine(coro):
            """Run an internal coroutine on a fresh event loop."""
            # TODO: when dropping Python 3.6, use asyncio.run instead.
            # NOTE(review): the loop is not closed here; pyppeteer may keep a
            # reference to it for its own shutdown handling -- confirm before
            # adding loop.close().
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(coro)

        # Create a temporary file to pass the HTML code to Chromium:
        # Unfortunately, tempfile on Windows does not allow for an already open
        # file to be opened by a separate process. So we must close it first
        # before calling Chromium. We also specify delete=False to ensure the
        # file is not deleted after closing (the default behavior).
        temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
        try:
            with temp_file:
                temp_file.write(html.encode("utf-8"))

            # The coroutine runs on a worker thread with its own event loop
            # (presumably so this also works when the calling thread already
            # has a running loop -- confirm). Only one task is submitted, and
            # the ``with`` block shuts the executor down afterwards.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                pdf_data = pool.submit(run_coroutine, main(temp_file)).result()
        finally:
            # Ensure the file is deleted even if writing it or pyppeteer
            # raises an exception.
            os.unlink(temp_file.name)
        return pdf_data

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert from a notebook node.

        Checks the pyppeteer/Chromium requirements first, renders the notebook
        to HTML through the parent exporter, then converts that HTML to PDF.

        Returns
        -------
        tuple
            ``(pdf_data, resources)`` where ``resources["output_extension"]``
            has been set to ``".pdf"``.

        Raises
        ------
        RuntimeError
            If pyppeteer or a suitable Chromium is unavailable.
        """
        # Fail early, before spending time on the HTML conversion.
        self._check_launch_reqs()
        html, resources = super().from_notebook_node(nb, resources=resources, **kw)

        self.log.info("Building PDF")
        pdf_data = self.run_pyppeteer(html)
        self.log.info("PDF successfully created")

        # convert output extension to pdf
        # the writer above required it to be html
        resources["output_extension"] = ".pdf"

        return pdf_data, resources