Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Updated Purifiable to version 4.3 from version 4.0 #2

Merged
merged 1 commit into from

2 participants

@mntbkr

No description provided.

@josegonzalez

Do I get a pull request? ;)

pull sent

@josegonzalez josegonzalez merged commit 1677f31 into josegonzalez:master
@josegonzalez

Thanks for the pull request :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on May 30, 2011
  1. @mntbkr
This page is out of date. Refresh to see the latest.
Showing with 1,416 additions and 287 deletions.
  1. +1,215 −257 vendors/htmlpurifier/htmlpurifier.php
  2. BIN  vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema.ser
  3. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/AutoFormat.RemoveSpansWithoutAttributes.txt
  4. +12 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/CSS.AllowedFonts.txt
  5. +13 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/CSS.ForbiddenProperties.txt
  6. +9 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/CSS.Trusted.txt
  7. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt
  8. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Core.NormalizeNewlines.txt
  9. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Core.RemoveProcessingInstructions.txt
  10. +5 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Filter.YouTube.txt
  11. +7 −4 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.Allowed.txt
  12. +11 −6 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.AllowedElements.txt
  13. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.FlashAllowFullScreen.txt
  14. +7 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.Nofollow.txt
  15. +3 −4 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.SafeEmbed.txt
  16. +3 −4 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.SafeObject.txt
  17. +1 −0  vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/HTML.Trusted.txt
  18. +15 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Output.FixInnerHTML.txt
  19. +11 −0 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/Output.FlashCompat.txt
  20. +2 −0  vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt
  21. +5 −2 vendors/htmlpurifier/standalone/HTMLPurifier/ConfigSchema/schema/URI.DisableResources.txt
  22. +1 −1  vendors/htmlpurifier/standalone/HTMLPurifier/EntityLookup/entities.ser
  23. +5 −5 vendors/htmlpurifier/standalone/HTMLPurifier/Filter/YouTube.php
  24. +1 −0  vendors/htmlpurifier/standalone/HTMLPurifier/Language/messages/en.php
  25. +35 −2 vendors/htmlpurifier/standalone/HTMLPurifier/Lexer/PEARSax3.php
  26. +0 −2  vendors/htmlpurifier/standalone/HTMLPurifier/Lexer/PH5P.php
View
1,472 vendors/htmlpurifier/htmlpurifier.php
@@ -7,7 +7,7 @@
* primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
* FILE, changes will be overwritten the next time the script is run.
*
- * @version 4.0.0
+ * @version 4.3.0
*
* @warning
* You must *not* include any other HTML Purifier files before this file,
@@ -39,7 +39,7 @@
*/
/*
- HTML Purifier 4.0.0 - Standards Compliant HTML Filtering
+ HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
Copyright (C) 2006-2008 Edward Z. Yang
This library is free software; you can redistribute it and/or
@@ -75,10 +75,10 @@ class HTMLPurifier
{
/** Version of HTML Purifier */
- public $version = '4.0.0';
+ public $version = '4.3.0';
/** Constant with version of HTML Purifier */
- const VERSION = '4.0.0';
+ const VERSION = '4.3.0';
/** Global configuration object */
public $config;
@@ -469,6 +469,42 @@ protected function mungeRgb($string) {
return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
}
+ /**
+ * Parses a possibly escaped CSS string and returns the "pure"
+ * version of it.
+ */
+ protected function expandCSSEscape($string) {
+ // flexibly parse it
+ $ret = '';
+ for ($i = 0, $c = strlen($string); $i < $c; $i++) {
+ if ($string[$i] === '\\') {
+ $i++;
+ if ($i >= $c) {
+ $ret .= '\\';
+ break;
+ }
+ if (ctype_xdigit($string[$i])) {
+ $code = $string[$i];
+ for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
+ if (!ctype_xdigit($string[$i])) break;
+ $code .= $string[$i];
+ }
+ // We have to be extremely careful when adding
+ // new characters, to make sure we're not breaking
+ // the encoding.
+ $char = HTMLPurifier_Encoder::unichr(hexdec($code));
+ if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
+ $ret .= $char;
+ if ($i < $c && trim($string[$i]) !== '') $i--;
+ continue;
+ }
+ if ($string[$i] === "\n") continue;
+ }
+ $ret .= $string[$i];
+ }
+ return $ret;
+ }
+
}
@@ -811,7 +847,12 @@ class HTMLPurifier_Bootstrap
public static function autoload($class) {
$file = HTMLPurifier_Bootstrap::getPath($class);
if (!$file) return false;
- require HTMLPURIFIER_PREFIX . '/' . $file;
+ // Technically speaking, it should be ok and more efficient to
+ // just do 'require', but Antonio Parraga reports that with
+ // Zend extensions such as Zend debugger and APC, this invariant
+ // may be broken. Since we have efficient alternatives, pay
+ // the cost here and avoid the bug.
+ require_once HTMLPURIFIER_PREFIX . '/' . $file;
return true;
}
@@ -839,10 +880,11 @@ public static function registerAutoload() {
if ( ($funcs = spl_autoload_functions()) === false ) {
spl_autoload_register($autoload);
} elseif (function_exists('spl_autoload_unregister')) {
+ $buggy = version_compare(PHP_VERSION, '5.2.11', '<');
$compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
version_compare(PHP_VERSION, '5.1.0', '>=');
foreach ($funcs as $func) {
- if (is_array($func)) {
+ if ($buggy && is_array($func)) {
// :TRICKY: There are some compatibility issues and some
// places where we need to error out
$reflector = new ReflectionMethod($func[0], $func[1]);
@@ -886,6 +928,17 @@ public static function registerAutoload() {
public $setup = false;
/**
+ * If true, write out the final definition object to the cache after
+ * setup. This will be true only if all invocations to get a raw
+ * definition object are also optimized. This does not cause file
+ * system thrashing because on subsequent calls the cached object
+ * is used and any writes to the raw definition object are short
+ * circuited. See enduser-customize.html for the high-level
+ * picture.
+ */
+ public $optimized = null;
+
+ /**
* What type of definition is it?
*/
public $type;
@@ -1132,6 +1185,10 @@ protected function doSetup($config) {
$this->doSetupTricky($config);
}
+ if ($config->get('CSS.Trusted')) {
+ $this->doSetupTrusted($config);
+ }
+
$allow_important = $config->get('CSS.AllowImportant');
// wrap all attr-defs with decorator that handles !important
foreach ($this->info as $k => $v) {
@@ -1173,6 +1230,23 @@ protected function doSetupTricky($config) {
$this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll'));
}
+ protected function doSetupTrusted($config) {
+ $this->info['position'] = new HTMLPurifier_AttrDef_Enum(array(
+ 'static', 'relative', 'absolute', 'fixed'
+ ));
+ $this->info['top'] =
+ $this->info['left'] =
+ $this->info['right'] =
+ $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+ new HTMLPurifier_AttrDef_CSS_Length(),
+ new HTMLPurifier_AttrDef_CSS_Percentage(),
+ new HTMLPurifier_AttrDef_Enum(array('auto')),
+ ));
+ $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
+ new HTMLPurifier_AttrDef_Integer(),
+ new HTMLPurifier_AttrDef_Enum(array('auto')),
+ ));
+ }
/**
* Performs extra config-based processing. Based off of
@@ -1185,20 +1259,29 @@ protected function setupConfigStuff($config) {
// setup allowed elements
$support = "(for information on implementing this, see the ".
"support forums) ";
- $allowed_attributes = $config->get('CSS.AllowedProperties');
- if ($allowed_attributes !== null) {
+ $allowed_properties = $config->get('CSS.AllowedProperties');
+ if ($allowed_properties !== null) {
foreach ($this->info as $name => $d) {
- if(!isset($allowed_attributes[$name])) unset($this->info[$name]);
- unset($allowed_attributes[$name]);
+ if(!isset($allowed_properties[$name])) unset($this->info[$name]);
+ unset($allowed_properties[$name]);
}
// emit errors
- foreach ($allowed_attributes as $name => $d) {
+ foreach ($allowed_properties as $name => $d) {
// :TODO: Is this htmlspecialchars() call really necessary?
$name = htmlspecialchars($name);
trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
}
}
+ $forbidden_properties = $config->get('CSS.ForbiddenProperties');
+ if ($forbidden_properties !== null) {
+ foreach ($this->info as $name => $d) {
+ if (isset($forbidden_properties[$name])) {
+ unset($this->info[$name]);
+ }
+ }
+ }
+
}
}
@@ -1275,7 +1358,7 @@ class HTMLPurifier_Config
/**
* HTML Purifier's version
*/
- public $version = '4.0.0';
+ public $version = '4.3.0';
/**
* Bool indicator whether or not to automatically finalize
@@ -1331,7 +1414,8 @@ class HTMLPurifier_Config
/**
* Set to false if you do not want line and file numbers in errors
- * (useful when unit testing)
+ * (useful when unit testing). This will also compress some errors
+ * and exceptions.
*/
public $chatty = true;
@@ -1573,26 +1657,64 @@ private function _listify($lookup) {
* Retrieves object reference to the HTML definition.
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
+ * @param $optimized If true, this method may return null, to
+ * indicate that a cached version of the modified
+ * definition object is available and no further edits
+ * are necessary. Consider using
+ * maybeGetRawHTMLDefinition, which is more explicitly
+ * named, instead.
*/
- public function getHTMLDefinition($raw = false) {
- return $this->getDefinition('HTML', $raw);
+ public function getHTMLDefinition($raw = false, $optimized = false) {
+ return $this->getDefinition('HTML', $raw, $optimized);
}
/**
* Retrieves object reference to the CSS definition
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
+ * @param $optimized If true, this method may return null, to
+ * indicate that a cached version of the modified
+ * definition object is available and no further edits
+ * are necessary. Consider using
+ * maybeGetRawCSSDefinition, which is more explicitly
+ * named, instead.
*/
- public function getCSSDefinition($raw = false) {
- return $this->getDefinition('CSS', $raw);
+ public function getCSSDefinition($raw = false, $optimized = false) {
+ return $this->getDefinition('CSS', $raw, $optimized);
+ }
+
+ /**
+ * Retrieves object reference to the URI definition
+ * @param $raw Return a copy that has not been setup yet. Must be
+ * called before it's been setup, otherwise won't work.
+ * @param $optimized If true, this method may return null, to
+ * indicate that a cached version of the modified
+ * definition object is available and no further edits
+ * are necessary. Consider using
+ * maybeGetRawURIDefinition, which is more explicitly
+ * named, instead.
+ */
+ public function getURIDefinition($raw = false, $optimized = false) {
+ return $this->getDefinition('URI', $raw, $optimized);
}
/**
* Retrieves a definition
* @param $type Type of definition: HTML, CSS, etc
* @param $raw Whether or not definition should be returned raw
- */
- public function getDefinition($type, $raw = false) {
+ * @param $optimized Only has an effect when $raw is true. Whether
+ * or not to return null if the result is already present in
+ * the cache. This is off by default for backwards
+ * compatibility reasons, but you need to do things this
+ * way in order to ensure that caching is done properly.
+ * Check out enduser-customize.html for more details.
+ * We probably won't ever change this default, as much as the
+ * maybe semantics is the "right thing to do."
+ */
+ public function getDefinition($type, $raw = false, $optimized = false) {
+ if ($optimized && !$raw) {
+ throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
+ }
if (!$this->finalized) $this->autoFinalize();
// temporarily suspend locks, so we can handle recursive definition calls
$lock = $this->lock;
@@ -1601,52 +1723,137 @@ public function getDefinition($type, $raw = false) {
$cache = $factory->create($type, $this);
$this->lock = $lock;
if (!$raw) {
- // see if we can quickly supply a definition
+ // full definition
+ // ---------------
+ // check if definition is in memory
if (!empty($this->definitions[$type])) {
- if (!$this->definitions[$type]->setup) {
- $this->definitions[$type]->setup($this);
- $cache->set($this->definitions[$type], $this);
+ $def = $this->definitions[$type];
+ // check if the definition is setup
+ if ($def->setup) {
+ return $def;
+ } else {
+ $def->setup($this);
+ if ($def->optimized) $cache->add($def, $this);
+ return $def;
}
- return $this->definitions[$type];
}
- // memory check missed, try cache
- $this->definitions[$type] = $cache->get($this);
- if ($this->definitions[$type]) {
- // definition in cache, return it
- return $this->definitions[$type];
+ // check if definition is in cache
+ $def = $cache->get($this);
+ if ($def) {
+ // definition in cache, save to memory and return it
+ $this->definitions[$type] = $def;
+ return $def;
}
- } elseif (
- !empty($this->definitions[$type]) &&
- !$this->definitions[$type]->setup
- ) {
- // raw requested, raw in memory, quick return
- return $this->definitions[$type];
+ // initialize it
+ $def = $this->initDefinition($type);
+ // set it up
+ $this->lock = $type;
+ $def->setup($this);
+ $this->lock = null;
+ // save in cache
+ $cache->add($def, $this);
+ // return it
+ return $def;
+ } else {
+ // raw definition
+ // --------------
+ // check preconditions
+ $def = null;
+ if ($optimized) {
+ if (is_null($this->get($type . '.DefinitionID'))) {
+ // fatally error out if definition ID not set
+ throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
+ }
+ }
+ if (!empty($this->definitions[$type])) {
+ $def = $this->definitions[$type];
+ if ($def->setup && !$optimized) {
+ $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
+ throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
+ }
+ if ($def->optimized === null) {
+ $extra = $this->chatty ? " (try flushing your cache)" : "";
+ throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
+ }
+ if ($def->optimized !== $optimized) {
+ $msg = $optimized ? "optimized" : "unoptimized";
+ $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
+ throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
+ }
+ }
+ // check if definition was in memory
+ if ($def) {
+ if ($def->setup) {
+ // invariant: $optimized === true (checked above)
+ return null;
+ } else {
+ return $def;
+ }
+ }
+ // if optimized, check if definition was in cache
+ // (because we do the memory check first, this formulation
+ // is prone to cache slamming, but I think
+ // guaranteeing that either /all/ of the raw
+ // setup code or /none/ of it is run is more important.)
+ if ($optimized) {
+ // This code path only gets run once; once we put
+ // something in $definitions (which is guaranteed by the
+ // trailing code), we always short-circuit above.
+ $def = $cache->get($this);
+ if ($def) {
+ // save the full definition for later, but don't
+ // return it yet
+ $this->definitions[$type] = $def;
+ return null;
+ }
+ }
+ // check invariants for creation
+ if (!$optimized) {
+ if (!is_null($this->get($type . '.DefinitionID'))) {
+ if ($this->chatty) {
+ $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
+ } else {
+ $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
+ }
+ }
+ }
+ // initialize it
+ $def = $this->initDefinition($type);
+ $def->optimized = $optimized;
+ return $def;
}
+ throw new HTMLPurifier_Exception("The impossible happened!");
+ }
+
+ private function initDefinition($type) {
// quick checks failed, let's create the object
if ($type == 'HTML') {
- $this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
+ $def = new HTMLPurifier_HTMLDefinition();
} elseif ($type == 'CSS') {
- $this->definitions[$type] = new HTMLPurifier_CSSDefinition();
+ $def = new HTMLPurifier_CSSDefinition();
} elseif ($type == 'URI') {
- $this->definitions[$type] = new HTMLPurifier_URIDefinition();
+ $def = new HTMLPurifier_URIDefinition();
} else {
throw new HTMLPurifier_Exception("Definition of $type type not supported");
}
- // quick abort if raw
- if ($raw) {
- if (is_null($this->get($type . '.DefinitionID'))) {
- // fatally error out if definition ID not set
- throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
- }
- return $this->definitions[$type];
- }
- // set it up
- $this->lock = $type;
- $this->definitions[$type]->setup($this);
- $this->lock = null;
- // save in cache
- $cache->set($this->definitions[$type], $this);
- return $this->definitions[$type];
+ $this->definitions[$type] = $def;
+ return $def;
+ }
+
+ public function maybeGetRawDefinition($name) {
+ return $this->getDefinition($name, true, true);
+ }
+
+ public function maybeGetRawHTMLDefinition() {
+ return $this->getDefinition('HTML', true, true);
+ }
+
+ public function maybeGetRawCSSDefinition() {
+ return $this->getDefinition('CSS', true, true);
+ }
+
+ public function maybeGetRawURIDefinition() {
+ return $this->getDefinition('URI', true, true);
}
/**
@@ -1804,17 +2011,22 @@ public function finalize() {
/**
* Produces a nicely formatted error message by supplying the
- * stack frame information from two levels up and OUTSIDE of
- * HTMLPurifier_Config.
+ * stack frame information OUTSIDE of HTMLPurifier_Config.
*/
protected function triggerError($msg, $no) {
// determine previous stack frame
- $backtrace = debug_backtrace();
- if ($this->chatty && isset($backtrace[1])) {
- $frame = $backtrace[1];
- $extra = " on line {$frame['line']} in file {$frame['file']}";
- } else {
- $extra = '';
+ $extra = '';
+ if ($this->chatty) {
+ $trace = debug_backtrace();
+ // zip(tail(trace), trace) -- but PHP is not Haskell har har
+ for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
+ if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
+ continue;
+ }
+ $frame = $trace[$i];
+ $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
+ break;
+ }
}
trigger_error($msg . $extra, $no);
}
@@ -1896,7 +2108,13 @@ public function __construct() {
* Unserializes the default ConfigSchema.
*/
public static function makeFromSerial() {
- return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'));
+ $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
+ $r = unserialize($contents);
+ if (!$r) {
+ $hash = sha1($contents);
+ trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
+ }
+ return $r;
}
/**
@@ -2698,6 +2916,13 @@ class HTMLPurifier_ElementDef
public $autoclose = array();
/**
+ * If a foreign element is found in this element, test if it is
+ * allowed by this sub-element; if it is, instead of closing the
+ * current element, place it inside this element.
+ */
+ public $wrap;
+
+ /**
* Whether or not this is a formatting element affected by the
* "Active Formatting Elements" algorithm.
*/
@@ -3757,6 +3982,22 @@ class HTMLPurifier_Generator
private $_sortAttr;
/**
+ * Cache of %Output.FlashCompat
+ */
+ private $_flashCompat;
+
+ /**
+ * Cache of %Output.FixInnerHTML
+ */
+ private $_innerHTMLFix;
+
+ /**
+ * Stack for keeping track of object information when outputting IE
+ * compatibility code.
+ */
+ private $_flashStack = array();
+
+ /**
* Configuration for the generator
*/
protected $config;
@@ -3768,7 +4009,9 @@ class HTMLPurifier_Generator
public function __construct($config, $context) {
$this->config = $config;
$this->_scriptFix = $config->get('Output.CommentScriptContents');
+ $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
$this->_sortAttr = $config->get('Output.SortAttr');
+ $this->_flashCompat = $config->get('Output.FlashCompat');
$this->_def = $config->getHTMLDefinition();
$this->_xhtml = $this->_def->doctype->xml;
}
@@ -3811,9 +4054,11 @@ public function generateFromTokens($tokens) {
}
// Normalize newlines to system defined value
- $nl = $this->config->get('Output.Newline');
- if ($nl === null) $nl = PHP_EOL;
- if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
+ if ($this->config->get('Core.NormalizeNewlines')) {
+ $nl = $this->config->get('Output.Newline');
+ if ($nl === null) $nl = PHP_EOL;
+ if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
+ }
return $html;
}
@@ -3829,12 +4074,29 @@ public function generateFromToken($token) {
} elseif ($token instanceof HTMLPurifier_Token_Start) {
$attr = $this->generateAttributes($token->attr, $token->name);
+ if ($this->_flashCompat) {
+ if ($token->name == "object") {
+ $flash = new stdclass();
+ $flash->attr = $token->attr;
+ $flash->param = array();
+ $this->_flashStack[] = $flash;
+ }
+ }
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif ($token instanceof HTMLPurifier_Token_End) {
- return '</' . $token->name . '>';
+ $_extra = '';
+ if ($this->_flashCompat) {
+ if ($token->name == "object" && !empty($this->_flashStack)) {
+ // doesn't do anything for now
+ }
+ }
+ return $_extra . '</' . $token->name . '>';
} elseif ($token instanceof HTMLPurifier_Token_Empty) {
+ if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
+ $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
+ }
$attr = $this->generateAttributes($token->attr, $token->name);
return '<' . $token->name . ($attr ? ' ' : '') . $attr .
( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
@@ -3884,6 +4146,37 @@ public function generateAttributes($assoc_array_of_attributes, $element = false)
continue;
}
}
+ // Workaround for Internet Explorer innerHTML bug.
+ // Essentially, Internet Explorer, when calculating
+ // innerHTML, omits quotes if there are no instances of
+ // angled brackets, quotes or spaces. However, when parsing
+ // HTML (for example, when you assign to innerHTML), it
+ // treats backticks as quotes. Thus,
+ // <img alt="``" />
+ // becomes
+ // <img alt=`` />
+ // becomes
+ // <img alt='' />
+ // Fortunately, all we need to do is trigger an appropriate
+ // quoting style, which we do by adding an extra space.
+ // This also is consistent with the W3C spec, which states
+ // that user agents may ignore leading or trailing
+ // whitespace (in fact, most don't, at least for attributes
+ // like alt, but an extra space at the end is barely
+ // noticeable). Still, we have a configuration knob for
+ // this, since this transformation is not necesary if you
+ // don't process user input with innerHTML or you don't plan
+ // on supporting Internet Explorer.
+ if ($this->_innerHTMLFix) {
+ if (strpos($value, '`') !== false) {
+ // check if correct quoting style would not already be
+ // triggered
+ if (strcspn($value, '"\' <>') === strlen($value)) {
+ // protect!
+ $value .= ' ';
+ }
+ }
+ }
$html .= $key.'="'.$this->escape($value).'" ';
}
return rtrim($html);
@@ -3899,7 +4192,10 @@ public function generateAttributes($assoc_array_of_attributes, $element = false)
* permissible for non-attribute output.
* @return String escaped data.
*/
- public function escape($string, $quote = ENT_COMPAT) {
+ public function escape($string, $quote = null) {
+ // Workaround for APC bug on Mac Leopard reported by sidepodcast
+ // http://htmlpurifier.org/phorum/read.php?3,4823,4846
+ if ($quote === null) $quote = ENT_COMPAT;
return htmlspecialchars($string, $quote, 'UTF-8');
}
@@ -4209,7 +4505,12 @@ protected function setupConfigStuff($config) {
unset($allowed_attributes_mutable[$key]);
}
}
- if ($delete) unset($this->info[$tag]->attr[$attr]);
+ if ($delete) {
+ if ($this->info[$tag]->attr[$attr]->required) {
+ trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
+ }
+ unset($this->info[$tag]->attr[$attr]);
+ }
}
}
// emit errors
@@ -4791,19 +5092,19 @@ public function setup($config) {
}
}
- // add proprietary module (this gets special treatment because
- // it is completely removed from doctypes, etc.)
+ // custom modules
if ($config->get('HTML.Proprietary')) {
$modules[] = 'Proprietary';
}
-
- // add SafeObject/Safeembed modules
if ($config->get('HTML.SafeObject')) {
$modules[] = 'SafeObject';
}
if ($config->get('HTML.SafeEmbed')) {
$modules[] = 'SafeEmbed';
}
+ if ($config->get('HTML.Nofollow')) {
+ $modules[] = 'Nofollow';
+ }
// merge in custom modules
$modules = array_merge($modules, $this->userModules);
@@ -5983,6 +6284,17 @@ protected static function escapeCommentedCDATA($string) {
}
/**
+ * Special Internet Explorer conditional comments should be removed.
+ */
+ protected static function removeIEConditional($string) {
+ return preg_replace(
+ '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
+ '',
+ $string
+ );
+ }
+
+ /**
* Callback function for escapeCDATA() that does the work.
*
* @warning Though this is public in order to let the callback happen,
@@ -6004,8 +6316,10 @@ protected static function CDATACallback($matches) {
public function normalize($html, $config, $context) {
// normalize newlines to \n
- $html = str_replace("\r\n", "\n", $html);
- $html = str_replace("\r", "\n", $html);
+ if ($config->get('Core.NormalizeNewlines')) {
+ $html = str_replace("\r\n", "\n", $html);
+ $html = str_replace("\r", "\n", $html);
+ }
if ($config->get('HTML.Trusted')) {
// escape convoluted CDATA
@@ -6015,9 +6329,19 @@ public function normalize($html, $config, $context) {
// escape CDATA
$html = $this->escapeCDATA($html);
+ $html = $this->removeIEConditional($html);
+
// extract body from document if applicable
if ($config->get('Core.ConvertDocumentToFragment')) {
- $html = $this->extractBody($html);
+ $e = false;
+ if ($config->get('Core.CollectErrors')) {
+ $e =& $context->get('ErrorCollector');
+ }
+ $new_html = $this->extractBody($html);
+ if ($e && $new_html != $html) {
+ $e->send(E_WARNING, 'Lexer: Extracted body');
+ }
+ $html = $new_html;
}
// expand entities that aren't the big five
@@ -6028,6 +6352,11 @@ public function normalize($html, $config, $context) {
// represent non-SGML characters (horror, horror!)
$html = HTMLPurifier_Encoder::cleanUTF8($html);
+ // if processing instructions are to removed, remove them now
+ if ($config->get('Core.RemoveProcessingInstructions')) {
+ $html = preg_replace('#<\?.+?\?>#s', '', $html);
+ }
+
return $html;
}
@@ -6705,14 +7034,6 @@ public function validate($config, $context) {
$chars_gen_delims = ':/?#[]@';
$chars_pchar = $chars_sub_delims . ':@';
- // validate scheme (MUST BE FIRST!)
- if (!is_null($this->scheme) && is_null($this->host)) {
- $def = $config->getDefinition('URI');
- if ($def->defaultScheme === $this->scheme) {
- $this->scheme = null;
- }
- }
-
// validate host
if (!is_null($this->host)) {
$host_def = new HTMLPurifier_AttrDef_URI_Host();
@@ -6720,6 +7041,21 @@ public function validate($config, $context) {
if ($this->host === false) $this->host = null;
}
+ // validate scheme
+ // NOTE: It's not appropriate to check whether or not this
+ // scheme is in our registry, since a URIFilter may convert a
+ // URI that we don't allow into one we do. So instead, we just
+ // check if the scheme can be dropped because there is no host
+ // and it is our default scheme.
+ if (!is_null($this->scheme) && is_null($this->host) || $this->host === '') {
+ // support for relative paths is pretty abysmal when the
+ // scheme is present, so axe it when possible
+ $def = $config->getDefinition('URI');
+ if ($def->defaultScheme === $this->scheme) {
+ $this->scheme = null;
+ }
+ }
+
// validate username
if (!is_null($this->userinfo)) {
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
@@ -6734,32 +7070,48 @@ public function validate($config, $context) {
// validate path
$path_parts = array();
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
- if (!is_null($this->host)) {
+ if (!is_null($this->host)) { // this catches $this->host === ''
// path-abempty (hier and relative)
+ // http://www.example.com/my/path
+ // //www.example.com/my/path (looks odd, but works, and
+ // recognized by most browsers)
+ // (this set is valid or invalid on a scheme by scheme
+ // basis, so we'll deal with it later)
+ // file:///my/path
+ // ///my/path
$this->path = $segments_encoder->encode($this->path);
- } elseif ($this->path !== '' && $this->path[0] === '/') {
- // path-absolute (hier and relative)
- if (strlen($this->path) >= 2 && $this->path[1] === '/') {
- // This shouldn't ever happen!
- $this->path = '';
- } else {
+ } elseif ($this->path !== '') {
+ if ($this->path[0] === '/') {
+ // path-absolute (hier and relative)
+ // http:/my/path
+ // /my/path
+ if (strlen($this->path) >= 2 && $this->path[1] === '/') {
+ // This could happen if both the host gets stripped
+ // out
+ // http://my/path
+ // //my/path
+ $this->path = '';
+ } else {
+ $this->path = $segments_encoder->encode($this->path);
+ }
+ } elseif (!is_null($this->scheme)) {
+ // path-rootless (hier)
+ // http:my/path
+ // Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
- }
- } elseif (!is_null($this->scheme) && $this->path !== '') {
- // path-rootless (hier)
- // Short circuit evaluation means we don't need to check nz
- $this->path = $segments_encoder->encode($this->path);
- } elseif (is_null($this->scheme) && $this->path !== '') {
- // path-noscheme (relative)
- // (once again, not checking nz)
- $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
- $c = strpos($this->path, '/');
- if ($c !== false) {
- $this->path =
- $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
- $segments_encoder->encode(substr($this->path, $c));
} else {
- $this->path = $segment_nc_encoder->encode($this->path);
+ // path-noscheme (relative)
+ // my/path
+ // (once again, not checking nz)
+ $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
+ $c = strpos($this->path, '/');
+ if ($c !== false) {
+ $this->path =
+ $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
+ $segments_encoder->encode(substr($this->path, $c));
+ } else {
+ $this->path = $segment_nc_encoder->encode($this->path);
+ }
}
} else {
// path-empty (hier and relative)
@@ -6788,6 +7140,9 @@ public function validate($config, $context) {
public function toString() {
// reconstruct authority
$authority = null;
+ // there is a rendering difference between a null authority
+ // (http:foo-bar) and an empty string authority
+ // (http:///foo-bar).
if (!is_null($this->host)) {
$authority = '';
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
@@ -6795,7 +7150,12 @@ public function toString() {
if(!is_null($this->port)) $authority .= ':' . $this->port;
}
- // reconstruct the result
+ // Reconstruct the result
+ // One might wonder about parsing quirks from browsers after
+ // this reconstruction. Unfortunately, parsing behavior depends
+ // on what *scheme* was employed (file:///foo is handled *very*
+ // differently than http:///foo), so unfortunately we have to
+ // defer to the schemes to do the right thing.
$result = '';
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
if (!is_null($authority)) $result .= '//' . $authority;
@@ -7026,11 +7386,13 @@ public function parse($uri) {
/**
* Validator for the components of a URI for a specific scheme
*/
-class HTMLPurifier_URIScheme
+abstract class HTMLPurifier_URIScheme
{
/**
- * Scheme's default port (integer)
+ * Scheme's default port (integer). If an explicit port number is
+ * specified that coincides with the default port, it will be
+ * elided.
*/
public $default_port = null;
@@ -7047,17 +7409,62 @@ class HTMLPurifier_URIScheme
public $hierarchical = false;
/**
- * Validates the components of a URI
- * @note This implementation should be called by children if they define
- * a default port, as it does port processing.
- * @param $uri Instance of HTMLPurifier_URI
+ * Whether or not the URI may omit a hostname when the scheme is
+ * explicitly specified, ala file:///path/to/file. As of writing,
+ * 'file' is the only scheme that browsers support his properly.
+ */
+ public $may_omit_host = false;
+
+ /**
+ * Validates the components of a URI for a specific scheme.
+ * @param $uri Reference to a HTMLPurifier_URI object
+ * @param $config HTMLPurifier_Config object
+ * @param $context HTMLPurifier_Context object
+ * @return Bool success or failure
+ */
+ public abstract function doValidate(&$uri, $config, $context);
+
+ /**
+ * Public interface for validating components of a URI. Performs a
+ * bunch of default actions. Don't overload this method.
+ * @param $uri Reference to a HTMLPurifier_URI object
* @param $config HTMLPurifier_Config object
* @param $context HTMLPurifier_Context object
* @return Bool success or failure
*/
public function validate(&$uri, $config, $context) {
if ($this->default_port == $uri->port) $uri->port = null;
- return true;
+ // kludge: browsers do funny things when the scheme but not the
+ // authority is set
+ if (!$this->may_omit_host &&
+ // if the scheme is present, a missing host is always in error
+ (!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) ||
+ // if the scheme is not present, a *blank* host is in error,
+ // since this translates into '///path' which most browsers
+ // interpret as being 'http://path'.
+ (is_null($uri->scheme) && $uri->host === '')
+ ) {
+ do {
+ if (is_null($uri->scheme)) {
+ if (substr($uri->path, 0, 2) != '//') {
+ $uri->host = null;
+ break;
+ }
+ // URI is '////path', so we cannot nullify the
+ // host to preserve semantics. Try expanding the
+ // hostname instead (fall through)
+ }
+ // first see if we can manually insert a hostname
+ $host = $config->get('URI.Host');
+ if (!is_null($host)) {
+ $uri->host = $host;
+ } else {
+ // we can't do anything sensible, reject the URL.
+ return false;
+ }
+ } while (false);
+ }
+ return $this->doValidate($uri, $config, $context);
}
}
@@ -8227,7 +8634,8 @@ public function validate($string, $config, $context) {
$keywords = array();
$keywords['h'] = false; // left, right
$keywords['v'] = false; // top, bottom
- $keywords['c'] = false; // center
+ $keywords['ch'] = false; // center (first word)
+ $keywords['cv'] = false; // center (second word)
$measures = array();
$i = 0;
@@ -8247,6 +8655,13 @@ public function validate($string, $config, $context) {
$lbit = ctype_lower($bit) ? $bit : strtolower($bit);
if (isset($lookup[$lbit])) {
$status = $lookup[$lbit];
+ if ($status == 'c') {
+ if ($i == 0) {
+ $status = 'ch';
+ } else {
+ $status = 'cv';
+ }
+ }
$keywords[$status] = $lbit;
$i++;
}
@@ -8269,20 +8684,19 @@ public function validate($string, $config, $context) {
if (!$i) return false; // no valid values were caught
-
$ret = array();
// first keyword
if ($keywords['h']) $ret[] = $keywords['h'];
- elseif (count($measures)) $ret[] = array_shift($measures);
- elseif ($keywords['c']) {
- $ret[] = $keywords['c'];
- $keywords['c'] = false; // prevent re-use: center = center center
+ elseif ($keywords['ch']) {
+ $ret[] = $keywords['ch'];
+ $keywords['cv'] = false; // prevent re-use: center = center center
}
+ elseif (count($measures)) $ret[] = array_shift($measures);
if ($keywords['v']) $ret[] = $keywords['v'];
+ elseif ($keywords['cv']) $ret[] = $keywords['cv'];
elseif (count($measures)) $ret[] = array_shift($measures);
- elseif ($keywords['c']) $ret[] = $keywords['c'];
if (empty($ret)) return false;
return implode(' ', $ret);
@@ -8693,11 +9107,43 @@ public function validate($string, $config, $context) {
/**
* Validates a font family list according to CSS spec
- * @todo whitelisting allowed fonts would be nice
*/
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{
+ protected $mask = null;
+
+ public function __construct() {
+ $this->mask = '- ';
+ for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
+ for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
+ for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
+ // special bytes used by UTF-8
+ for ($i = 0x80; $i <= 0xFF; $i++) {
+ // We don't bother excluding invalid bytes in this range,
+ // because the our restriction of well-formed UTF-8 will
+ // prevent these from ever occurring.
+ $this->mask .= chr($i);
+ }
+
+ /*
+ PHP's internal strcspn implementation is
+ O(length of string * length of mask), making it inefficient
+ for large masks. However, it's still faster than
+ preg_match 8)
+ for (p = s1;;) {
+ spanp = s2;
+ do {
+ if (*spanp == c || p == s1_end) {
+ return p - s1;
+ }
+ } while (spanp++ < (s2_end - 1));
+ c = *++p;
+ }
+ */
+ // possible optimization: invert the mask.
+ }
+
public function validate($string, $config, $context) {
static $generic_names = array(
'serif' => true,
@@ -8706,6 +9152,7 @@ public function validate($string, $config, $context) {
'fantasy' => true,
'cursive' => true
);
+ $allowed_fonts = $config->get('CSS.AllowedFonts');
// assume that no font names contain commas in them
$fonts = explode(',', $string);
@@ -8715,7 +9162,9 @@ public function validate($string, $config, $context) {
if ($font === '') continue;
// match a generic name
if (isset($generic_names[$font])) {
- $final .= $font . ', ';
+ if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
+ $final .= $font . ', ';
+ }
continue;
}
// match a quoted name
@@ -8725,50 +9174,122 @@ public function validate($string, $config, $context) {
$quote = $font[0];
if ($font[$length - 1] !== $quote) continue;
$font = substr($font, 1, $length - 2);
+ }
- $new_font = '';
- for ($i = 0, $c = strlen($font); $i < $c; $i++) {
- if ($font[$i] === '\\') {
- $i++;
- if ($i >= $c) {
- $new_font .= '\\';
- break;
- }
- if (ctype_xdigit($font[$i])) {
- $code = $font[$i];
- for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
- if (!ctype_xdigit($font[$i])) break;
- $code .= $font[$i];
- }
- // We have to be extremely careful when adding
- // new characters, to make sure we're not breaking
- // the encoding.
- $char = HTMLPurifier_Encoder::unichr(hexdec($code));
- if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
- $new_font .= $char;
- if ($i < $c && trim($font[$i]) !== '') $i--;
- continue;
- }
- if ($font[$i] === "\n") continue;
- }
- $new_font .= $font[$i];
- }
+ $font = $this->expandCSSEscape($font);
- $font = $new_font;
- }
// $font is a pure representation of the font name
+ if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
+ continue;
+ }
+
if (ctype_alnum($font) && $font !== '') {
// very simple font, allow it in unharmed
$final .= $font . ', ';
continue;
}
- // complicated font, requires quoting
+ // bugger out on whitespace. form feed (0C) really
+ // shouldn't show up regardless
+ $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
+
+ // Here, there are various classes of characters which need
+ // to be treated differently:
+ // - Alphanumeric characters are essentially safe. We
+ // handled these above.
+ // - Spaces require quoting, though most parsers will do
+ // the right thing if there aren't any characters that
+ // can be misinterpreted
+ // - Dashes rarely occur, but they fairly unproblematic
+ // for parsing/rendering purposes.
+ // The above characters cover the majority of Western font
+ // names.
+ // - Arbitrary Unicode characters not in ASCII. Because
+ // most parsers give little thought to Unicode, treatment
+ // of these codepoints is basically uniform, even for
+ // punctuation-like codepoints. These characters can
+ // show up in non-Western pages and are supported by most
+ // major browsers, for example: "MS 明朝" is a
+ // legitimate font-name
+ // <http://ja.wikipedia.org/wiki/MS_明朝>. See
+ // the CSS3 spec for more examples:
+ // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
+ // You can see live samples of these on the Internet:
+ // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
+ // However, most of these fonts have ASCII equivalents:
+ // for example, 'MS Mincho', and it's considered
+ // professional to use ASCII font names instead of
+ // Unicode font names. Thanks Takeshi Terada for
+ // providing this information.
+ // The following characters, to my knowledge, have not been
+ // used to name font names.
+ // - Single quote. While theoretically you might find a
+ // font name that has a single quote in its name (serving
+ // as an apostrophe, e.g. Dave's Scribble), I haven't
+ // been able to find any actual examples of this.
+ // Internet Explorer's cssText translation (which I
+ // believe is invoked by innerHTML) normalizes any
+ // quoting to single quotes, and fails to escape single
+ // quotes. (Note that this is not IE's behavior for all
+ // CSS properties, just some sort of special casing for
+ // font-family). So a single quote *cannot* be used
+ // safely in the font-family context if there will be an
+ // innerHTML/cssText translation. Note that Firefox 3.x
+ // does this too.
+ // - Double quote. In IE, these get normalized to
+ // single-quotes, no matter what the encoding. (Fun
+ // fact, in IE8, the 'content' CSS property gained
+ // support, where they special cased to preserve encoded
+ // double quotes, but still translate unadorned double
+ // quotes into single quotes.) So, because their
+ // fixpoint behavior is identical to single quotes, they
+ // cannot be allowed either. Firefox 3.x displays
+ // single-quote style behavior.
+ // - Backslashes are reduced by one (so \\ -> \) every
+ // iteration, so they cannot be used safely. This shows
+ // up in IE7, IE8 and FF3
+ // - Semicolons, commas and backticks are handled properly.
+ // - The rest of the ASCII punctuation is handled properly.
+ // We haven't checked what browsers do to unadorned
+ // versions, but this is not important as long as the
+ // browser doesn't /remove/ surrounding quotes (as IE does
+ // for HTML).
+ //
+ // With these results in hand, we conclude that there are
+ // various levels of safety:
+ // - Paranoid: alphanumeric, spaces and dashes(?)
+ // - International: Paranoid + non-ASCII Unicode
+ // - Edgy: Everything except quotes, backslashes
+ // - NoJS: Standards compliance, e.g. sod IE. Note that
+ // with some judicious character escaping (since certain
+ // types of escaping doesn't work) this is theoretically
+ // OK as long as innerHTML/cssText is not called.
+ // We believe that international is a reasonable default
+ // (that we will implement now), and once we do more
+ // extensive research, we may feel comfortable with dropping
+ // it down to edgy.
+
+ // Edgy: alphanumeric, spaces, dashes and Unicode. Use of
+ // str(c)spn assumes that the string was already well formed
+ // Unicode (which of course it is).
+ if (strspn($font, $this->mask) !== strlen($font)) {
+ continue;
+ }
- // armor single quotes and new lines
- $font = str_replace("\\", "\\\\", $font);
- $font = str_replace("'", "\\'", $font);
+ // Historical:
+ // In the absence of innerHTML/cssText, these ugly
+ // transforms don't pose a security risk (as \\ and \"
+ // might--these escapes are not supported by most browsers).
+ // We could try to be clever and use single-quote wrapping
+ // when there is a double quote present, but I have choosen
+ // not to implement that. (NOTE: you can reduce the amount
+ // of escapes by one depending on what quoting style you use)
+ // $font = str_replace('\\', '\\5C ', $font);
+ // $font = str_replace('"', '\\22 ', $font);
+ // $font = str_replace("'", '\\27 ', $font);
+
+ // font possibly with spaces, requires quoting
$final .= "'$font', ";
}
$final = rtrim($final, ', ');
@@ -9123,20 +9644,25 @@ public function validate($uri_string, $config, $context) {
$uri = substr($uri, 1, $new_length - 1);
}
- $keys = array( '(', ')', ',', ' ', '"', "'");
- $values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
- $uri = str_replace($values, $keys, $uri);
+ $uri = $this->expandCSSEscape($uri);
$result = parent::validate($uri, $config, $context);
if ($result === false) return false;
- // escape necessary characters according to CSS spec
- // except for the comma, none of these should appear in the
- // URI at all
- $result = str_replace($keys, $values, $result);
+ // extra sanity check; should have been done by URI
+ $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
+
+ // suspicious characters are ()'; we're going to percent encode
+ // them for safety.
+ $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
- return "url($result)";
+ // there's an extra bug where ampersands lose their escaping on
+ // an innerHTML cycle, so a very unlucky query parameter could
+ // then change the meaning of the URL. Unfortunately, there's
+ // not much we can do about that...
+
+ return "url(\"$result\")";
}
@@ -9617,6 +10143,12 @@ public function __construct() {
public function validate($string, $config, $context) {
$length = strlen($string);
+ // empty hostname is OK; it's usually semantically equivalent:
+ // the default host as defined by a URI scheme is used:
+ //
+ // If the URI scheme defines a default for host, then that
+ // default applies when the host subcomponent is undefined
+ // or when the registered name is empty (zero length).
if ($string === '') return '';
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
//IPv6
@@ -10026,7 +10558,8 @@ public function transform($attr, $config, $context) {
if ($src) {
$alt = $config->get('Attr.DefaultImageAlt');
if ($alt === null) {
- $attr['alt'] = basename($attr['src']);
+ // truncate if the alt is too long
+ $attr['alt'] = substr(basename($attr['src']),0,40);
} else {
$attr['alt'] = $alt;
}
@@ -10238,6 +10771,48 @@ public function transform($attr, $config, $context) {
+// must be called POST validation
+
+/**
+ * Adds rel="nofollow" to all outbound links. This transform is
+ * only attached if Attr.Nofollow is TRUE.
+ */
+class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
+{
+ private $parser;
+
+ public function __construct() {
+ $this->parser = new HTMLPurifier_URIParser();
+ }
+
+ public function transform($attr, $config, $context) {
+
+ if (!isset($attr['href'])) {
+ return $attr;
+ }
+
+ // XXX Kind of inefficient
+ $url = $this->parser->parse($attr['href']);
+ $scheme = $url->getSchemeObj($config, $context);
+
+ if (!is_null($url->host) && $scheme !== false && $scheme->browsable) {
+ if (isset($attr['rel'])) {
+ $attr['rel'] .= ' nofollow';
+ } else {
+ $attr['rel'] = 'nofollow';
+ }
+ }
+
+ return $attr;
+
+ }
+
+}
+
+
+
+
+
class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
{
public $name = "SafeEmbed";
@@ -10290,6 +10865,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
public function __construct() {
$this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
+ $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
}
public function transform($attr, $config, $context) {
@@ -10304,12 +10880,25 @@ public function transform($attr, $config, $context) {
case 'allowNetworking':
$attr['value'] = 'internal';
break;
+ case 'allowFullScreen':
+ if ($config->get('HTML.FlashAllowFullScreen')) {
+ $attr['value'] = ($attr['value'] == 'true') ? 'true' : 'false';
+ } else {
+ $attr['value'] = 'false';
+ }
+ break;
case 'wmode':
- $attr['value'] = 'window';
+ $attr['value'] = $this->wmode->validate($attr['value'], $config, $context);
break;
case 'movie':
+ case 'src':
+ $attr['name'] = "movie";
$attr['value'] = $this->uri->validate($attr['value'], $config, $context);
break;
+ case 'flashvars':
+ // we're going to allow arbitrary inputs to the SWF, on
+ // the reasoning that it could only hack the SWF, not us.
+ break;
// add other cases to support other param name/value pairs
default:
$attr['name'] = $attr['value'] = null;
@@ -11008,14 +11597,14 @@ public function add($def, $config) {
$file = $this->generateFilePath($config);
if (file_exists($file)) return false;
if (!$this->_prepareDir($config)) return false;
- return $this->_write($file, serialize($def));
+ return $this->_write($file, serialize($def), $config);
}
public function set($def, $config) {
if (!$this->checkDefType($def)) return;
$file = $this->generateFilePath($config);
if (!$this->_prepareDir($config)) return false;
- return $this->_write($file, serialize($def));
+ return $this->_write($file, serialize($def), $config);
}
public function replace($def, $config) {
@@ -11023,7 +11612,7 @@ public function replace($def, $config) {
$file = $this->generateFilePath($config);
if (!file_exists($file)) return false;
if (!$this->_prepareDir($config)) return false;
- return $this->_write($file, serialize($def));
+ return $this->_write($file, serialize($def), $config);
}
public function get($config) {
@@ -11096,18 +11685,34 @@ public function generateBaseDirectoryPath($config) {
* Convenience wrapper function for file_put_contents
* @param $file File name to write to
* @param $data Data to write into file
+ * @param $config Config object
* @return Number of bytes written if success, or false if failure.
*/
- private function _write($file, $data) {
- return file_put_contents($file, $data);
+ private function _write($file, $data, $config) {
+ $result = file_put_contents($file, $data);
+ if ($result !== false) {
+ // set permissions of the new file (no execute)
+ $chmod = $config->get('Cache.SerializerPermissions');
+ if (!$chmod) {
+ $chmod = 0644; // invalid config or simpletest
+ }
+ $chmod = $chmod & 0666;
+ chmod($file, $chmod);
+ }
+ return $result;
}
/**
* Prepares the directory that this type stores the serials in
+ * @param $config Config object
* @return True if successful
*/
private function _prepareDir($config) {
$directory = $this->generateDirectoryPath($config);
+ $chmod = $config->get('Cache.SerializerPermissions');
+ if (!$chmod) {
+ $chmod = 0755; // invalid config or simpletest
+ }
if (!is_dir($directory)) {
$base = $this->generateBaseDirectoryPath($config);
if (!is_dir($base)) {
@@ -11115,13 +11720,13 @@ private function _prepareDir($config) {
please create or change using %Cache.SerializerPath',
E_USER_WARNING);
return false;
- } elseif (!$this->_testPermissions($base)) {
+ } elseif (!$this->_testPermissions($base, $chmod)) {
return false;
}
- $old = umask(0022); // disable group and world writes
- mkdir($directory);
+ $old = umask(0000);
+ mkdir($directory, $chmod);
umask($old);
- } elseif (!$this->_testPermissions($directory)) {
+ } elseif (!$this->_testPermissions($directory, $chmod)) {
return false;
}
return true;
@@ -11130,8 +11735,11 @@ private function _prepareDir($config) {
/**
* Tests permissions on a directory and throws out friendly
* error messages and attempts to chmod it itself if possible
+ * @param $dir Directory path
+ * @param $chmod Permissions
+ * @return True if directory writable
*/
- private function _testPermissions($dir) {
+ private function _testPermissions($dir, $chmod) {
// early abort, if it is writable, everything is hunky-dory
if (is_writable($dir)) return true;
if (!is_dir($dir)) {
@@ -11145,17 +11753,17 @@ private function _testPermissions($dir) {
// POSIX system, we can give more specific advice
if (fileowner($dir) === posix_getuid()) {
// we can chmod it ourselves
- chmod($dir, 0755);
- return true;
+ $chmod = $chmod | 0700;
+ if (chmod($dir, $chmod)) return true;
} elseif (filegroup($dir) === posix_getgid()) {
- $chmod = '775';
+ $chmod = $chmod | 0070;
} else {
// PHP's probably running as nobody, so we'll
// need to give global permissions
- $chmod = '777';
+ $chmod = $chmod | 0777;
}
trigger_error('Directory '.$dir.' not writable, '.
- 'please chmod to ' . $chmod,
+ 'please chmod to ' . decoct($chmod),
E_USER_WARNING);
} else {
// generic error message
@@ -11717,8 +12325,10 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
public $content_sets = array('Flow' => 'List');
public function setup($config) {
- $this->addElement('ol', 'List', 'Required: li', 'Common');
- $this->addElement('ul', 'List', 'Required: li', 'Common');
+ $ol = $this->addElement('ol', 'List', 'Required: li', 'Common');
+ $ol->wrap = "li";
+ $ul = $this->addElement('ul', 'List', 'Required: li', 'Common');
+ $ul->wrap = "li";
$this->addElement('dl', 'List', 'Required: dt | dd', 'Common');
$this->addElement('li', false, 'Flow', 'Common');
@@ -11755,6 +12365,26 @@ public function setup($config) {
+/**
+ * Module adds the nofollow attribute transformation to a tags. It
+ * is enabled by HTML.Nofollow
+ */
+class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
+{
+
+ public $name = 'Nofollow';
+
+ public function setup($config) {
+ $a = $this->addBlankElement('a');
+ $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
+ }
+
+}
+
+
+
+
+
class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
{
public $name = 'NonXMLCommonAttributes';
@@ -11937,7 +12567,8 @@ public function setup($config) {
'height' => 'Pixels#' . $max,
'allowscriptaccess' => 'Enum#never',
'allownetworking' => 'Enum#internal',
- 'wmode' => 'Enum#window',
+ 'flashvars' => 'Text',
+ 'wmode' => 'Enum#window,transparent,opaque',
'name' => 'ID',
)
);
@@ -11979,7 +12610,9 @@ public function setup($config) {
'type' => 'Enum#application/x-shockwave-flash',
'width' => 'Pixels#' . $max,
'height' => 'Pixels#' . $max,
- 'data' => 'URI#embedded'
+ 'data' => 'URI#embedded',
+ 'codebase' => new HTMLPurifier_AttrDef_Enum(array(
+ 'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
)
);
$object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();
@@ -12508,6 +13141,7 @@ public function makeFixes() {
$r['thead@background'] = new HTMLPurifier_AttrTransform_Background();
$r['tfoot@background'] = new HTMLPurifier_AttrTransform_Background();
$r['tbody@background'] = new HTMLPurifier_AttrTransform_Background();
+ $r['table@height'] = new HTMLPurifier_AttrTransform_Length('height');
return $r;
}
@@ -12763,16 +13397,21 @@ public function handleText(&$token) {
// ----
// This is a degenerate case
} else {
- // State 1.2: PAR1
- // ----
+ if (!$token->is_whitespace || $this->_isInline($current)) {
+ // State 1.2: PAR1
+ // ----
- // State 1.3: PAR1\n\nPAR2
- // ------------
+ // State 1.3: PAR1\n\nPAR2
+ // ------------
- // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
- // ------------
- $token = array($this->_pStart());
- $this->_splitText($text, $token);
+ // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
+ // ------------
+ $token = array($this->_pStart());
+ $this->_splitText($text, $token);
+ } else {
+ // State 1.5: \n<hr />
+ // --
+ }
}
} else {
// State 2: <div>PAR1... (similar to 1.4)
@@ -13243,6 +13882,67 @@ public function handleElement(&$token) {
/**
+ * Injector that removes spans with no attributes
+ */
+class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
+{
+ public $name = 'RemoveSpansWithoutAttributes';
+ public $needed = array('span');
+
+ private $attrValidator;
+
+ /**
+ * Used by AttrValidator
+ */
+ private $config;
+ private $context;
+
+ public function prepare($config, $context) {
+ $this->attrValidator = new HTMLPurifier_AttrValidator();
+ $this->config = $config;
+ $this->context = $context;
+ return parent::prepare($config, $context);
+ }
+
+ public function handleElement(&$token) {
+ if ($token->name !== 'span' || !$token instanceof HTMLPurifier_Token_Start) {
+ return;
+ }
+
+ // We need to validate the attributes now since this doesn't normally
+ // happen until after MakeWellFormed. If all the attributes are removed
+ // the span needs to be removed too.
+ $this->attrValidator->validateToken($token, $this->config, $this->context);
+ $token->armor['ValidateAttributes'] = true;
+
+ if (!empty($token->attr)) {
+ return;
+ }
+
+ $nesting = 0;
+ $spanContentTokens = array();
+ while ($this->forwardUntilEndToken($i, $current, $nesting)) {}
+
+ if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
+ // Mark closing span tag for deletion
+ $current->markForDeletion = true;
+ // Delete open span tag
+ $token = false;
+ }
+ }
+
+ public function handleEnd(&$token) {
+ if ($token->markForDeletion) {
+ $token = false;
+ }
+ }
+}
+
+
+
+
+
+/**
* Adds important param elements to inside of object in order to make
* things safe.
*/
@@ -13262,6 +13962,9 @@ class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
protected $allowedParam = array(
'wmode' => true,
'movie' => true,
+ 'flashvars' => true,
+ 'src' => true,
+ 'allowFullScreen' => true, // if omitted, assume to be 'false'
);
public function prepare($config, $context) {
@@ -13289,7 +13992,8 @@ public function handleElement(&$token) {
// We need this fix because YouTube doesn't supply a data
// attribute, which we need if a type is specified. This is
// *very* Flash specific.
- if (!isset($this->objectStack[$i]->attr['data']) && $token->attr['name'] == 'movie') {
+ if (!isset($this->objectStack[$i]->attr['data']) &&
+ ($token->attr['name'] == 'movie' || $token->attr['name'] == 'src')) {
$this->objectStack[$i]->attr['data'] = $token->attr['value'];
}
// Check if the parameter is the correct value but has not
@@ -13402,23 +14106,57 @@ public function tokenizeHTML($html, $config, $context) {
}
/**
- * Recursive function that tokenizes a node, putting it into an accumulator.
- *
+ * Iterative function that tokenizes a node, putting it into an accumulator.
+ * To iterate is human, to recurse divine - L. Peter Deutsch
* @param $node DOMNode to be tokenized.
* @param $tokens Array-list of already tokenized tokens.
- * @param $collect Says whether or start and close are collected, set to
- * false at first recursion because it's the implicit DIV
- * tag you're dealing with.
* @returns Tokens of node appended to previously passed tokens.
*/
- protected function tokenizeDOM($node, &$tokens, $collect = false) {
+ protected function tokenizeDOM($node, &$tokens) {
+
+ $level = 0;
+ $nodes = array($level => array($node));
+ $closingNodes = array();
+ do {
+ while (!empty($nodes[$level])) {
+ $node = array_shift($nodes[$level]); // FIFO
+ $collect = $level > 0 ? true : false;
+ $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+ if ($needEndingTag) {
+ $closingNodes[$level][] = $node;
+ }
+ if ($node->childNodes && $node->childNodes->length) {
+ $level++;
+ $nodes[$level] = array();
+ foreach ($node->childNodes as $childNode) {
+ array_push($nodes[$level], $childNode);
+ }
+ }
+ }
+ $level--;
+ if ($level && isset($closingNodes[$level])) {
+ while($node = array_pop($closingNodes[$level])) {
+ $this->createEndNode($node, $tokens);
+ }
+ }
+ } while ($level > 0);
+ }
+ /**
+ * @param $node DOMNode to be tokenized.
+ * @param $tokens Array-list of already tokenized tokens.
+ * @param $collect Says whether or start and close are collected, set to
+ * false at first recursion because it's the implicit DIV
+ * tag you're dealing with.
+ * @returns bool if the token needs an endtoken
+ */
+ protected function createStartNode($node, &$tokens, $collect) {
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
- return;
+ return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of <script> and <style> tags
$last = end($tokens);
@@ -13436,48 +14174,44 @@ protected function tokenizeDOM($node, &$tokens, $collect = false) {
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
- return;
+ return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions
// of libxml pre-2.6.28 (regular comments, of course, are still
// handled regularly)
$tokens[] = $this->factory->createComment($node->data);
- return;
+ return false;
} elseif (
// not-well tested: there may be other nodes we have to grab
$node->nodeType !== XML_ELEMENT_NODE
) {
- return;
+ return false;
}
- $attr = $node->hasAttributes() ?
- $this->transformAttrToAssoc($node->attributes) :
- array();
+ $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
// We still have to make sure that the element actually IS empty
if (!$node->childNodes->length) {
if ($collect) {
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
}
+ return false;
} else {
- if ($collect) { // don't wrap on first iteration
+ if ($collect) {
$tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped
$attr
);
}
- foreach ($node->childNodes as $node) {
- // remember, it's an accumulator. Otherwise, we'd have
- // to use array_merge
- $this->tokenizeDOM($node, $tokens, true);
- }
- if ($collect) {
- $tokens[] = $this->factory->createEnd($tag_name);
- }
+ return true;
}
+ }
+ protected function createEndNode($node, &$tokens) {
+ $tokens[] = $this->factory->createEnd($node->tagName);
}
+
/**
* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
*
@@ -13928,7 +14662,7 @@ public function parseAttributeString($string, $config, $context) {
}
}
if ($value === false) $value = '';
- return array($key => $value);
+ return array($key => $this->parseData($value));
}
// setup loop environment
@@ -14411,6 +15145,14 @@ public function execute($tokens, $config, $context) {
/**
* Takes tokens makes them well-formed (balance end tags, etc.)
+ *
+ * Specification of the armor attributes this strategy uses:
+ *
+ * - MakeWellFormed_TagClosedError: This armor field is used to
+ * suppress tag closed errors for certain tokens [TagClosedSuppress],
+ * in particular, if a tag was generated automatically by HTML
+ * Purifier, we may rely on our infrastructure to close it for us
+ * and shouldn't report an error to the user [TagClosedAuto].
*/
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
{
@@ -14452,6 +15194,12 @@ public function execute($tokens, $config, $context) {
// local variables
$generator = new HTMLPurifier_Generator($config, $context);
$escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
+ // used for autoclose early abortion
+ $global_parent_allowed_elements = array();
+ if (isset($definition->info[$definition->info_parent])) {
+ // may be unset under testing circumstances
+ $global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
+ }
$e = $context->get('ErrorCollector', true);
$t = false; // token index
$i = false; // injector index
@@ -14492,6 +15240,7 @@ public function execute($tokens, $config, $context) {
$this->injectors[] = $injector;
}
foreach ($custom_injectors as $injector) {
+ if (!$injector) continue;
if (is_string($injector)) {
$injector = "HTMLPurifier_Injector_$injector";
$injector = new $injector;
@@ -14510,7 +15259,7 @@ public function execute($tokens, $config, $context) {
// -- end INJECTOR --
- // a note on punting:
+ // a note on reprocessing:
// In order to reduce code duplication, whenever some code needs
// to make HTML changes in order to make things "correct", the
// new HTML gets sent through the purifier, regardless of its
@@ -14557,7 +15306,7 @@ public function execute($tokens, $config, $context) {
$top_nesting = array_pop($this->stack);
$this->stack[] = $top_nesting;
- // send error
+ // send error [TagClosedSuppress]
if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
}