From b7154fb885c3749059416252836000abb650805d Mon Sep 17 00:00:00 2001 From: "Nicholas K. Dionysopoulos" Date: Sat, 13 Jun 2015 22:19:19 +0300 Subject: [PATCH] UTF-8 Multibyte (utf8mb4) support Strip 4-byte characters when the server doesn't support multibyte unicode characters --- libraries/joomla/filter/input.php | 38 ++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/libraries/joomla/filter/input.php b/libraries/joomla/filter/input.php index 821055051d22a..09bd10bd2100b 100644 --- a/libraries/joomla/filter/input.php +++ b/libraries/joomla/filter/input.php @@ -67,6 +67,14 @@ class JFilterInput */ public $xssAuto; + /** + * A flag for Unicode Supplementary Characters (4-byte Unicode character) stripping. + * + * @var integer + * @since CMS 3.5.0 + */ + public $stripUSC = 0; + /** * The list of the default blacklisted tags. * @@ -120,10 +128,11 @@ class JFilterInput * @param integer $tagsMethod WhiteList method = 0, BlackList method = 1 * @param integer $attrMethod WhiteList method = 0, BlackList method = 1 * @param integer $xssAuto Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1 + * @param integer $stripUSC Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1 * * @since 11.1 */ - public function __construct($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1) + public function __construct($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1) { // Make sure user defined arrays are in lowercase $tagsArray = array_map('strtolower', (array) $tagsArray); @@ -135,6 +144,21 @@ public function __construct($tagsArray = array(), $attrArray = array(), $tagsMet $this->tagsMethod = $tagsMethod; $this->attrMethod = $attrMethod; $this->xssAuto = $xssAuto; + $this->stripUSC = $stripUSC; + + /** + * If Unicode Supplementary Characters stripping is not set we have to check with the database driver. If the + * driver does not support USCs (i.e. there is no utf8mb4 support) we will enable USC stripping. + */ + if ($this->stripUSC == -1) + { + // Get the database driver + $db = JFactory::getDbo(); + // This trick is required to let the driver determine the utf-8 multibyte support + $db->connect(); + // And now we can decide if we should strip USCs + $this->stripUSC = $db->hasUTF8mb4Support() ? 0 : 1; + } } /** @@ -145,18 +169,19 @@ public function __construct($tagsArray = array(), $attrArray = array(), $tagsMet * @param integer $tagsMethod WhiteList method = 0, BlackList method = 1 * @param integer $attrMethod WhiteList method = 0, BlackList method = 1 * @param integer $xssAuto Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1 + * @param integer $stripUSC Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1 * * @return JFilterInput The JFilterInput object. * * @since 11.1 */ - public static function &getInstance($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1) + public static function &getInstance($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1) { $sig = md5(serialize(array($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto))); if (empty(self::$instances[$sig])) { - self::$instances[$sig] = new JFilterInput($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto); + self::$instances[$sig] = new JFilterInput($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto, $stripUSC); } return self::$instances[$sig]; @@ -192,6 +217,13 @@ public static function &getInstance($tagsArray = array(), $attrArray = array(), */ public function clean($source, $type = 'string') { + // Strip Unicode Supplementary Characters when requested to do so + if ($this->stripUSC) + { + $source = preg_replace('/[\xF0-\xF7].../s', "\xE2\xAF\x91", $source); + // Alternatively: preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xE2\xAF\x91", $source) but it'd be slower. + } + // Handle the type constraint switch (strtoupper($type)) {