Merge pull request #2785 from zerline/HTMLSanitizer

HTML sanitizer for descriptions.
jupyter-widgets · Aug 3, 2021 · 8a2b049 · 8a2b049
2 parents 2de28de + df33205
commit 8a2b049
Show file tree

Hide file tree

Showing 16 changed files with 849 additions and 19 deletions.
diff --git a/ipywidgets/widgets/widget_bool.py b/ipywidgets/widgets/widget_bool.py
@@ -53,9 +53,13 @@ class ToggleButton(_Bool):
     value : {True,False}
         value of the toggle button: True-pressed, False-unpressed
     description : str
-        description displayed next to the button
+        description displayed on the button
     icon: str
         font-awesome icon name
+    style: instance of DescriptionStyle
+        styling customizations
+    button_style: enum
+        button predefined styling
     """
     _view_name = Unicode('ToggleButtonView').tag(sync=True)
     _model_name = Unicode('ToggleButtonModel').tag(sync=True)

diff --git a/ipywidgets/widgets/widget_button.py b/ipywidgets/widgets/widget_button.py
@@ -35,7 +35,7 @@ class Button(DOMWidget, CoreWidget):
     Parameters
     ----------
     description: str
-       description displayed next to the button
+       description displayed on the button
     icon: str
        font-awesome icon names, without the 'fa-' prefix
     disabled: bool

diff --git a/ipywidgets/widgets/widget_description.py b/ipywidgets/widgets/widget_description.py
@@ -3,7 +3,7 @@
 
 """Contains the DOMWidget class"""
 
-from traitlets import Unicode
+from traitlets import Bool, Unicode
 from .widget import Widget, widget_serialization, register
 from .trait_types import InstanceDict
 from .widget_style import Style
@@ -21,6 +21,7 @@ class DescriptionWidget(DOMWidget, CoreWidget):
     """Widget that has a description label to the side."""
     _model_name = Unicode('DescriptionModel').tag(sync=True)
     description = Unicode('', help="Description of the control.").tag(sync=True)
+    description_allow_html = Bool(False, help="Accept HTML in the description.").tag(sync=True)
     style = InstanceDict(DescriptionStyle, help="Styling customizations").tag(sync=True, **widget_serialization)
 
     def _repr_keys(self):

diff --git a/packages/base-manager/package.json b/packages/base-manager/package.json
@@ -35,14 +35,16 @@
     "@jupyter-widgets/base": "^5.0.0-alpha.3",
     "@jupyterlab/services": "^6.0.0",
     "@lumino/coreutils": "^1.4.2",
-    "base64-js": "^1.2.1"
+    "base64-js": "^1.2.1",
+    "sanitize-html": "^1.20"
   },
   "devDependencies": {
     "@types/base64-js": "^1.2.5",
     "@types/chai": "^4.1.7",
     "@types/chai-as-promised": "^7.1.0",
     "@types/expect.js": "^0.3.29",
     "@types/mocha": "^8.2.2",
+    "@types/sanitize-html": "^1.20",
     "@types/sinon": "^10.0.2",
     "@types/sinon-chai": "^3.2.2",
     "chai": "^4.0.0",

diff --git a/packages/base-manager/src/latex.ts b/packages/base-manager/src/latex.ts
@@ -0,0 +1,193 @@
+/*-----------------------------------------------------------------------------
+| Copyright (c) Jupyter Development Team.
+| Distributed under the terms of the Modified BSD License.
+|----------------------------------------------------------------------------*/
+// Some magic for deferring mathematical expressions to MathJax
+// by hiding them from the Markdown parser.
+// Some of the code here is adapted with permission from Davide Cervone
+// under the terms of the Apache2 license governing the MathJax project.
+// Other minor modifications are also due to StackExchange and are used with
+// permission.
+
+const inline = '$'; // the inline math delimiter
+
+// MATHSPLIT contains the pattern for math delimiters and special symbols
+// needed for searching for math in the text input.
+const MATHSPLIT =
+  /(\$\$?|\\(?:begin|end)\{[a-z]*\*?\}|\\[{}$]|[{}]|(?:\n\s*)+|@@\d+@@|\\\\(?:\(|\)|\[|\]))/i;
+
+/**
+ * Extract math from `text` so that later processing does not alter it.
+ *
+ * The text is split on `MATHSPLIT`; because that pattern has one capture
+ * group, the odd-indexed entries of the split result are the delimiters and
+ * special symbols, which is why the scan below visits every second block.
+ * Math delimiters must match and braces must balance. Math is not allowed
+ * to pass through a double linebreak (which will be a paragraph).
+ *
+ * Each piece of math found is replaced by a placeholder of the form `@@n@@`
+ * and stored at index `n` of the returned `math` array. Text that already
+ * looks like such a placeholder is stored and re-numbered in the same way.
+ * `\r\n` and lone `\r` are converted to `\n` before splitting.
+ *
+ * When the text contains a backtick, `~` is temporarily encoded as `~T` and
+ * every `$` inside a code span as `~D`, so that dollars in code spans are not
+ * taken for math delimiters; `deTilde` reverses that encoding on the returned
+ * text and on each stored math string.
+ *
+ * @param text - The source text to scan.
+ * @returns The text with placeholders, and the saved math strings.
+ */
+export function removeMath(text: string): { text: string; math: string[] } {
+  const math: string[] = []; // stores math strings for later
+  let start: number | null = null;
+  let end: string | null = null;
+  let last: number | null = null;
+  let braces = 0;
+  let deTilde: (text: string) => string;
+
+  // Except for extreme edge cases, this should catch precisely those pieces of the markdown
+  // source that will later be turned into code spans. While MathJax will not TeXify code spans,
+  // we still have to consider them at this point; the following issue has happened several times:
+  //
+  //     `$foo` and `$bar` are variables.  -->  <code>$foo ` and `$bar</code> are variables.
+  const hasCodeSpans = /`/.test(text);
+  if (hasCodeSpans) {
+    text = text
+      .replace(/~/g, '~T')
+      .replace(/(^|[^\\])(`+)([^\n]*?[^`\n])\2(?!`)/gm, (wholematch) =>
+        wholematch.replace(/\$/g, '~D')
+      );
+    deTilde = (text: string) => {
+      return text.replace(/~([TD])/g, (wholematch, character) =>
+        character === 'T' ? '~' : inline
+      );
+    };
+  } else {
+    deTilde = (text: string) => {
+      return text;
+    };
+  }
+
+  let blocks = text.replace(/\r\n?/g, '\n').split(MATHSPLIT);
+
+  for (let i = 1, m = blocks.length; i < m; i += 2) {
+    const block = blocks[i];
+    if (block.charAt(0) === '@') {
+      //
+      //  Things that look like our math markers will get
+      //  stored and then retrieved along with the math.
+      //
+      blocks[i] = '@@' + math.length + '@@';
+      math.push(block);
+    } else if (start !== null) {
+      //
+      //  If we are in math, look for the end delimiter,
+      //    but don't go past double line breaks, and
+      //    balance braces within the math.
+      //
+      if (block === end) {
+        if (braces) {
+          last = i;
+        } else {
+          blocks = processMath(start, i, deTilde, math, blocks);
+          start = null;
+          end = null;
+          last = null;
+        }
+      } else if (block.match(/\n.*\n/)) {
+        if (last !== null) {
+          i = last;
+          blocks = processMath(start, i, deTilde, math, blocks);
+        }
+        start = null;
+        end = null;
+        last = null;
+        braces = 0;
+      } else if (block === '{') {
+        braces++;
+      } else if (block === '}' && braces) {
+        braces--;
+      }
+    } else {
+      //
+      //  Look for math start delimiters and when
+      //    found, set up the end delimiter.
+      //
+      if (block === inline || block === '$$') {
+        start = i;
+        end = block;
+        braces = 0;
+      } else if (block === '\\\\(' || block === '\\\\[') {
+        start = i;
+        end = block.slice(-1) === '(' ? '\\\\)' : '\\\\]';
+        braces = 0;
+      } else if (block.substr(1, 5) === 'begin') {
+        start = i;
+        end = '\\end' + block.substr(6);
+        braces = 0;
+      }
+    }
+  }
+  if (start !== null && last !== null) {
+    blocks = processMath(start, last, deTilde, math, blocks);
+    start = null;
+    end = null;
+    last = null;
+  }
+  return { text: deTilde(blocks.join('')), math };
+}
+
+/**
+ * Put back the math strings that were saved by `removeMath`.
+ *
+ * Every `@@n@@` placeholder in `text` is replaced by `math[n]`. The `math`
+ * array is only read; it is not modified or cleared.
+ *
+ * @param text - Text containing `@@n@@` placeholders.
+ * @param math - Saved math strings, indexed by placeholder number.
+ * @returns The text with every resolvable placeholder substituted. A
+ *   placeholder whose index has no saved math is left as it is.
+ */
+export function replaceMath(text: string, math: string[]): string {
+  /**
+   * Replace a math placeholder with its corresponding group.
+   * The math delimiters "\\(", "\\[", "\\)" and "\\]" are replaced
+   * removing one backslash in order to be interpreted correctly by MathJax.
+   *
+   * `n` is the captured digit string handed over by `String.replace`.
+   */
+  const process = (match: string, n: string): string => {
+    let group = math[Number(n)];
+    if (group === undefined) {
+      // Nothing was saved under this index (for example, placeholder-looking
+      // text that only appeared after the math was extracted). Keep the text
+      // instead of throwing on `undefined`.
+      return match;
+    }
+    const head = group.slice(0, 3);
+    const tail = group.slice(-3);
+    if (head === '\\\\(' && tail === '\\\\)') {
+      group = '\\(' + group.substring(3, group.length - 3) + '\\)';
+    } else if (head === '\\\\[' && tail === '\\\\]') {
+      group = '\\[' + group.substring(3, group.length - 3) + '\\]';
+    }
+    return group;
+  };
+  // Replace all the math group placeholders in the text
+  // with the saved strings.
+  return text.replace(/@@(\d+)@@/g, process);
+}
+
+/**
+ * Process math blocks.
+ *
+ * The math is in blocks i through j, so
+ *   collect it into one block and clear the others.
+ *  Replace &, <, and > by named entities.
+ *  For IE, put <br> at the ends of comments since IE removes \n.
+ *  Store the placeholder `@@n@@` in blocks[i], where n is the index the
+ *   math string gets in the storage array, then push the math string.
+ *  The preProcess function is applied to the collected math string if it
+ *   has been passed in.
+ *
+ * @param i - Index of the first block of the math.
+ * @param j - Index of the last block of the math (inclusive).
+ * @param preProcess - Applied to the collected math string before storing.
+ * @param math - Storage array; the math string is appended to it.
+ * @param blocks - All blocks; modified in place.
+ * @returns The same `blocks` array that was passed in.
+ */
+function processMath(
+  i: number,
+  j: number,
+  preProcess: (input: string) => string,
+  math: string[],
+  blocks: string[]
+): string[] {
+  let block = blocks
+    .slice(i, j + 1)
+    .join('')
+    .replace(/&/g, '&amp;') // use HTML entity for &
+    .replace(/</g, '&lt;') // use HTML entity for <
+    .replace(/>/g, '&gt;'); // use HTML entity for >
+  // Use `typeof` first: a bare reference to `navigator` throws a
+  // ReferenceError in environments that have no such global.
+  if (
+    typeof navigator !== 'undefined' &&
+    navigator &&
+    navigator.appName === 'Microsoft Internet Explorer'
+  ) {
+    block = block.replace(/(%[^\n]*)\n/g, '$1<br/>\n');
+  }
+  // Blank out every block after the first; the first one gets the tag.
+  for (let k = j; k > i; k--) {
+    blocks[k] = '';
+  }
+  blocks[i] = '@@' + math.length + '@@'; // replace the current block text with a unique tag to find later
+  if (preProcess) {
+    block = preProcess(block);
+  }
+  math.push(block);
+  return blocks;
+}
diff --git a/packages/base-manager/src/manager-base.ts b/packages/base-manager/src/manager-base.ts
@@ -26,9 +26,41 @@ import {
 } from '@jupyter-widgets/base';
 
 import { base64ToBuffer, bufferToBase64, hexToBuffer } from './utils';
+import { removeMath, replaceMath } from './latex';
+import sanitize from 'sanitize-html';
 
 const PROTOCOL_MAJOR_VERSION = PROTOCOL_VERSION.split('.', 1)[0];
 
+/**
+ * Sanitize HTML-formatted descriptions.
+ *
+ * Only the inline and list tags named in `allowedTags` are kept. On any kept
+ * tag the `aria-*`, `style` and `title` attributes are allowed; `a` may also
+ * carry `href` and `img` may also carry `src`.
+ *
+ * There is deliberately no attribute entry for the `style` *tag*: it is not
+ * in `allowedTags`, so such an entry would never take effect.
+ *
+ * NOTE(review): the `style` *attribute* is allowed on every kept tag, so
+ * inline CSS in a description is passed through. Confirm this is intended.
+ *
+ * @param s - The HTML source to sanitize.
+ * @returns The sanitized HTML string.
+ */
+function default_inline_sanitize(s: string): string {
+  return sanitize(s, {
+    allowedTags: [
+      'a',
+      'abbr',
+      'b',
+      'code',
+      'em',
+      'i',
+      'img',
+      'li',
+      'ol',
+      'span',
+      'strong',
+      'ul',
+    ],
+    allowedAttributes: {
+      '*': ['aria-*', 'style', 'title'],
+      a: ['href'],
+      img: ['src'],
+    },
+  });
+}
+
 export interface IState extends PartialJSONObject {
   buffers?: IBase64Buffers[];
   model_name: string;
@@ -467,6 +499,13 @@ export abstract class ManagerBase implements IWidgetManager {
     return Promise.resolve(url);
   }
 
+  /**
+   * Sanitize HTML meant for inline output while leaving math untouched.
+   *
+   * The math is taken out of `source` first, the remaining text is passed
+   * through the inline sanitizer, and the math is then put back.
+   */
+  inline_sanitize(source: string): string {
+    const { text, math } = removeMath(source);
+    return replaceMath(default_inline_sanitize(text), math);
+  }
+
   /**
    * The comm target name to register
    */

diff --git a/packages/base/src/manager.ts b/packages/base/src/manager.ts
@@ -190,4 +190,6 @@ export interface IWidgetManager {
    * The default implementation just returns the original url.
    */
   resolveUrl(url: string): Promise<string>;
+
+  /**
+   * Sanitize an HTML string for inline display.
+   *
+   * The description views assign the returned string to `innerHTML` when
+   * the model's `description_allow_html` is set.
+   */
+  inline_sanitize(s: string): string;
 }
diff --git a/packages/base/test/src/dummy-manager.ts b/packages/base/test/src/dummy-manager.ts
@@ -316,6 +316,10 @@ export class DummyManager implements widgets.IWidgetManager {
     return Promise.resolve(url);
   }
 
+  /**
+   * Test stub: returns the input unchanged, performing no sanitization.
+   */
+  inline_sanitize(s: string): string {
+    return s;
+  }
+
   /**
    * Dictionary of model ids and model instance promises
    */

diff --git a/packages/controls/src/widget_bool.ts b/packages/controls/src/widget_bool.ts
@@ -79,7 +79,12 @@ export class CheckboxView extends DescriptionView {
       return;
     }
     const description = this.model.get('description');
-    this.descriptionSpan.innerHTML = description;
+    if (this.model.get('description_allow_html')) {
+      this.descriptionSpan.innerHTML =
+        this.model.widget_manager.inline_sanitize(description);
+    } else {
+      this.descriptionSpan.textContent = description;
+    }
     this.typeset(this.descriptionSpan);
     this.descriptionSpan.title = description;
     this.checkbox.title = description;

diff --git a/packages/controls/src/widget_description.ts b/packages/controls/src/widget_description.ts
@@ -41,6 +41,7 @@ export class DescriptionModel extends DOMWidgetModel {
       _view_module_version: JUPYTER_CONTROLS_VERSION,
       _model_module_version: JUPYTER_CONTROLS_VERSION,
       description: '',
+      description_allow_html: false,
     };
   }
 }
@@ -53,6 +54,11 @@ export class DescriptionView extends DOMWidgetView {
     this.label.style.display = 'none';
 
     this.listenTo(this.model, 'change:description', this.updateDescription);
+    this.listenTo(
+      this.model,
+      'change:description_allow_html',
+      this.updateDescription
+    );
     this.listenTo(this.model, 'change:tabbable', this.updateTabindex);
     this.updateDescription();
     this.updateTabindex();
@@ -68,7 +74,12 @@ export class DescriptionView extends DOMWidgetView {
     if (description.length === 0) {
       this.label.style.display = 'none';
     } else {
-      this.label.innerHTML = description;
+      if (this.model.get('description_allow_html')) {
+        this.label.innerHTML =
+          this.model.widget_manager.inline_sanitize(description);
+      } else {
+        this.label.textContent = description;
+      }
       this.typeset(this.label);
       this.label.style.display = '';
     }

diff --git a/packages/html-manager/package.json b/packages/html-manager/package.json
@@ -52,6 +52,7 @@
   "devDependencies": {
     "@types/mocha": "^8.2.2",
     "@types/node": "^15.12.2",
+    "@types/sanitize-html": "^1.20",
     "chai": "^4.0.0",
     "css-loader": "^5.2.6",
     "file-loader": "^6.2.0",