Skip to content

Commit

Permalink
Replace lxml.html.clean_html with bleach; drop lxml dependency (#1854)
Browse files Browse the repository at this point in the history
  • Loading branch information
akx committed Sep 6, 2022
1 parent d3900ed commit b40bb13
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 13,309 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ docs/source/config/options
docs/source/interactive/magics-generated.txt
docs/gh-pages
nbconvert/resources/style.min.css
nbconvert/tests/files/*.html
*.py[co]
__pycache__
*.egg-info
Expand Down
3 changes: 1 addition & 2 deletions nbconvert/exporters/templateexporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
TemplateNotFound,
)
from jupyter_core.paths import jupyter_path
from lxml.html.clean import clean_html
from traitlets import Bool, Dict, HasTraits, List, Unicode, default, observe, validate
from traitlets.config import Config
from traitlets.utils.importstring import import_item
Expand Down Expand Up @@ -72,7 +71,7 @@
"escape_html": lambda s: html.escape(str(s)),
"escape_html_keep_quotes": lambda s: html.escape(str(s), quote=False),
# For sanitizing HTML for any XSS
"clean_html": clean_html,
"clean_html": filters.clean_html,
"strip_trailing_newline": filters.strip_trailing_newline,
"text_base64": filters.text_base64,
}
Expand Down
18 changes: 18 additions & 0 deletions nbconvert/filters/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@
from urllib.parse import quote
from xml.etree.ElementTree import Element

import bleach

# defusedxml does safe(r) parsing of untrusted XML data
from defusedxml import ElementTree

__all__ = [
"wrap_text",
"html2text",
"clean_html",
"add_anchor",
"strip_dollars",
"strip_files_prefix",
Expand Down Expand Up @@ -75,6 +78,21 @@ def html2text(element):
return text


def clean_html(element):
    """Sanitize an HTML fragment by passing it through ``bleach.clean``.

    Parameters
    ----------
    element : bytes or object
        The markup to sanitize. ``bytes`` input is decoded with
        ``bytes.decode()`` (default codec); anything else is converted
        with ``str()``.

    Returns
    -------
    The result of ``bleach.clean`` called with bleach's default allowed
    tags extended by ``div``, ``pre``, ``code`` and ``span``, and bleach's
    default allowed attributes extended by a ``"*"`` entry listing
    ``class`` and ``id``.
    """
    # Normalise the input to text before handing it to bleach.
    markup = element.decode() if isinstance(element, bytes) else str(element)

    # Start from bleach's defaults and widen them with the extra entries.
    permitted_tags = [*bleach.ALLOWED_TAGS, "div", "pre", "code", "span"]
    permitted_attributes = {
        **bleach.ALLOWED_ATTRIBUTES,
        "*": ["class", "id"],
    }

    return bleach.clean(
        markup,
        tags=permitted_tags,
        attributes=permitted_attributes,
    )


def _convert_header_id(header_contents):
"""Convert header contents to valid id value. Takes string as input, returns string.
Expand Down

0 comments on commit b40bb13

Please sign in to comment.