Replace lxml.html.clean_html with bleach; drop lxml dependency (#1854)

jupyter · Sep 6, 2022 · b40bb13 · b40bb13
1 parent d3900ed
commit b40bb13
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 13,309 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ docs/source/config/options
 docs/source/interactive/magics-generated.txt
 docs/gh-pages
 nbconvert/resources/style.min.css
+nbconvert/tests/files/*.html
 *.py[co]
 __pycache__
 *.egg-info

diff --git a/nbconvert/exporters/templateexporter.py b/nbconvert/exporters/templateexporter.py
@@ -22,7 +22,6 @@
     TemplateNotFound,
 )
 from jupyter_core.paths import jupyter_path
-from lxml.html.clean import clean_html
 from traitlets import Bool, Dict, HasTraits, List, Unicode, default, observe, validate
 from traitlets.config import Config
 from traitlets.utils.importstring import import_item
@@ -72,7 +71,7 @@
     "escape_html": lambda s: html.escape(str(s)),
     "escape_html_keep_quotes": lambda s: html.escape(str(s), quote=False),
     # For sanitizing HTML for any XSS
-    "clean_html": clean_html,
+    "clean_html": filters.clean_html,
     "strip_trailing_newline": filters.strip_trailing_newline,
     "text_base64": filters.text_base64,
 }

diff --git a/nbconvert/filters/strings.py b/nbconvert/filters/strings.py
@@ -15,12 +15,15 @@
 from urllib.parse import quote
 from xml.etree.ElementTree import Element
 
+import bleach
+
 # defusedxml does safe(r) parsing of untrusted XML data
 from defusedxml import ElementTree
 
 __all__ = [
     "wrap_text",
     "html2text",
+    "clean_html",
     "add_anchor",
     "strip_dollars",
     "strip_files_prefix",
@@ -75,6 +78,21 @@ def html2text(element):
     return text
 
 
+def clean_html(element):
+    """Sanitize HTML through ``bleach.clean``. Takes bytes or any object, returns the cleaned result.
+
+    ``bytes`` input is decoded with ``bytes.decode()`` (default encoding);
+    anything else is converted with ``str()``. The allow-list passed to bleach
+    is bleach's own default tag set plus ``div``, ``pre``, ``code`` and
+    ``span``, and bleach's default attribute map plus ``class`` and ``id``
+    under the ``"*"`` key. All other ``bleach.clean`` options keep their
+    defaults.
+    """
+    # Normalize the input to text before handing it to bleach.
+    if isinstance(element, bytes):
+        element = element.decode()
+    else:
+        element = str(element)
+    return bleach.clean(
+        element,
+        # Extend bleach's default tag allow-list with four extra tags.
+        tags=[*bleach.ALLOWED_TAGS, "div", "pre", "code", "span"],
+        attributes={
+            **bleach.ALLOWED_ATTRIBUTES,
+            # The "*" entry is written after the unpacked defaults, so it
+            # replaces any "*" entry that bleach's default map may contain.
+            "*": ["class", "id"],
+        },
+    )
+
+
 def _convert_header_id(header_contents):
     """Convert header contents to valid id value. Takes string as input, returns string.