-
Notifications
You must be signed in to change notification settings - Fork 556
/
sanitize.py
138 lines (128 loc) · 3.97 KB
/
sanitize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
"""
from bleach import (
ALLOWED_ATTRIBUTES,
ALLOWED_STYLES,
ALLOWED_TAGS,
clean,
)
from traitlets import (
Any,
Bool,
List,
Set,
Unicode,
)
from .base import Preprocessor
class SanitizeHTML(Preprocessor):
# Bleach config.
attributes = Any(
config=True,
default_value=ALLOWED_ATTRIBUTES,
help="Allowed HTML tag attributes",
)
tags = List(
Unicode,
config=True,
default_value=ALLOWED_TAGS,
help="List of HTML tags to allow",
)
styles = List(
Unicode,
config=True,
default_value=ALLOWED_STYLES,
help="Allowed CSS styles if <style> tag is whitelisted"
)
strip = Bool(
config=True,
default_value=False,
help="If True, remove unsafe markup entirely instead of escaping"
)
strip_comments = Bool(
config=True,
default_value=True,
help="If True, strip comments from escaped HTML",
)
# Display data config.
safe_output_keys = Set(
config=True,
default_value={
'metadata', # Not a mimetype per-se, but expected and safe.
'text/plain',
'text/latex',
'application/json',
'image/png',
'image/jpeg',
},
help="Cell output mimetypes to render without modification",
)
sanitized_output_types = Set(
config=True,
default_value={
'text/html',
'text/markdown',
},
help="Cell output types to display after escaping with Bleach.",
)
def preprocess_cell(self, cell, resources, cell_index):
"""
Sanitize potentially-dangerous contents of the cell.
Cell Types:
raw:
Sanitize literal HTML
markdown:
Sanitize literal HTML
code:
Sanitize outputs that could result in code execution
"""
if cell.cell_type == 'raw':
# Sanitize all raw cells anyway.
# Only ones with the text/html mimetype should be emitted
# but erring on the side of safety maybe.
cell.source = self.sanitize_html_tags(cell.source)
return cell, resources
elif cell.cell_type == 'markdown':
cell.source = self.sanitize_html_tags(cell.source)
return cell, resources
elif cell.cell_type == 'code':
cell.outputs = self.sanitize_code_outputs(cell.outputs)
return cell, resources
def sanitize_code_outputs(self, outputs):
"""
Sanitize code cell outputs.
Removes 'text/javascript' fields from display_data outputs, and
runs `sanitize_html_tags` over 'text/html'.
"""
for output in outputs:
# These are always ascii, so nothing to escape.
if output['output_type'] in ('stream', 'error'):
continue
data = output.data
to_remove = []
for key in data:
if key in self.safe_output_keys:
continue
elif key in self.sanitized_output_types:
self.log.info("Sanitizing %s" % key)
data[key] = self.sanitize_html_tags(data[key])
else:
# Mark key for removal. (Python doesn't allow deletion of
# keys from a dict during iteration)
to_remove.append(key)
for key in to_remove:
self.log.info("Removing %s" % key)
del data[key]
return outputs
def sanitize_html_tags(self, html_str):
"""
Sanitize a string containing raw HTML tags.
"""
return clean(
html_str,
tags=self.tags,
attributes=self.attributes,
styles=self.styles,
strip=self.strip,
strip_comments=self.strip_comments,
)