Skip to content

Commit

Permalink
UTF-8 Multibyte (utf8mb4) support
Browse files Browse the repository at this point in the history
Strip 4-byte UTF-8 characters when the database server doesn't support them (i.e. there is no utf8mb4 support)
  • Loading branch information
Nicholas K. Dionysopoulos committed Jun 13, 2015
1 parent 477d8a4 commit b7154fb
Showing 1 changed file with 35 additions and 3 deletions.
38 changes: 35 additions & 3 deletions libraries/joomla/filter/input.php
Expand Up @@ -67,6 +67,14 @@ class JFilterInput
*/
public $xssAuto;

/**
* Flag controlling the stripping of Unicode Supplementary Characters (characters that take 4 bytes in UTF-8):
* 1 = strip them, 0 = leave them untouched.
*
* @var integer
* @since CMS 3.5.0
*/
public $stripUSC = 0;

/**
* The list of the default blacklisted tags.
*
Expand Down Expand Up @@ -120,10 +128,11 @@ class JFilterInput
* @param integer $tagsMethod WhiteList method = 0, BlackList method = 1
* @param integer $attrMethod WhiteList method = 0, BlackList method = 1
* @param integer $xssAuto Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1
* @param integer $stripUSC Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1
*
* @since 11.1
*/
public function __construct($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1)
public function __construct($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1)
{
// Make sure user defined arrays are in lowercase
$tagsArray = array_map('strtolower', (array) $tagsArray);
Expand All @@ -135,6 +144,21 @@ public function __construct($tagsArray = array(), $attrArray = array(), $tagsMet
$this->tagsMethod = $tagsMethod;
$this->attrMethod = $attrMethod;
$this->xssAuto = $xssAuto;
$this->stripUSC = $stripUSC;

/**
* If Unicode Supplementary Character (USC) stripping was left at -1 (undecided), we have to check with the
* database driver. If the driver does not support USCs (i.e. there is no utf8mb4 support), we enable USC stripping.
*/
if ($this->stripUSC == -1)
{
// Get the database driver
$db = JFactory::getDbo();
// Connect first: this is required to let the driver determine whether utf8mb4 (4-byte UTF-8) is supported
$db->connect();
// And now we can decide if we should strip USCs
$this->stripUSC = $db->hasUTF8mb4Support() ? 0 : 1;
}
}

/**
Expand All @@ -145,18 +169,19 @@ public function __construct($tagsArray = array(), $attrArray = array(), $tagsMet
* @param integer $tagsMethod WhiteList method = 0, BlackList method = 1
* @param integer $attrMethod WhiteList method = 0, BlackList method = 1
* @param integer $xssAuto Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1
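* @param integer $stripUSC Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1.
* NOTE(review): $stripUSC is not part of the signature used to cache instances, so an
* instance already cached for the same other five arguments is returned regardless of this value.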
*
* @return JFilterInput The JFilterInput object.
*
* @since 11.1
*/
public static function &getInstance($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1)
public static function &getInstance($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1)
{
$sig = md5(serialize(array($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto)));

if (empty(self::$instances[$sig]))
{
self::$instances[$sig] = new JFilterInput($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto);
self::$instances[$sig] = new JFilterInput($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto, $stripUSC);
}

return self::$instances[$sig];
Expand Down Expand Up @@ -192,6 +217,13 @@ public static function &getInstance($tagsArray = array(), $attrArray = array(),
*/
public function clean($source, $type = 'string')
{
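// Strip Unicode Supplementary Characters when requested to do so: every lead byte 0xF0-0xF7 together with the
// three bytes that follow it (any bytes, newlines included) is replaced by the 3-byte sequence E2 AF 91 (U+2BD1).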
if ($this->stripUSC)
{
$source = preg_replace('/[\xF0-\xF7].../s', "\xE2\xAF\x91", $source);
// Alternatively: preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xE2\xAF\x91", $source) but it'd be slower.
}

// Handle the type constraint
switch (strtoupper($type))
{
Expand Down

0 comments on commit b7154fb

Please sign in to comment.