Skip to content

Commit

Permalink
Improved HTML parser for Smart Search
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Davenport committed Dec 7, 2014
1 parent 9c91c70 commit 6fd968c
Showing 1 changed file with 90 additions and 17 deletions.
107 changes: 90 additions & 17 deletions administrator/components/com_finder/helpers/indexer/parser/html.php
Expand Up @@ -19,44 +19,117 @@
class FinderIndexerParserHtml extends FinderIndexerParser
{
	/**
	 * Method to parse HTML input and extract the plain text.
	 *
	 * The markup is reduced to text here (script/style blocks removed, entities
	 * decoded, tags stripped) and the result is then handed to parent::parse().
	 * The inherited description states that parsing is batched in 2KB chunks to
	 * deal with the inefficiencies of regular expressions; that batching lives in
	 * the parent class, which is not part of this file.
	 *
	 * @param   string  $input  The input to parse.
	 *
	 * @return  string  The plain text input.
	 *
	 * @since   2.5
	 */
	public function parse($input)
	{
		// Strip invalid UTF-8 characters.
		$input = iconv("utf-8", "utf-8//IGNORE", $input);

		// Convert <style> tags to <script> tags so we can remove them efficiently.
		// Case-insensitive, to agree with the case-insensitive search in removeBlocks().
		$input = str_ireplace(array('<style', '</style'), array('<script', '</script'), $input);

		// Strip all script blocks.
		$input = $this->removeBlocks($input, '<script', '</script>');

		// Decode HTML entities.
		$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

		// Convert entities equivalent to spaces to actual spaces.
		// NOTE(review): html_entity_decode() above already converts &nbsp; / &#160;
		// to U+00A0, so this only matches entities that were double-encoded in the
		// input. Confirm whether this step was meant to run before decoding.
		$input = str_replace(array('&nbsp;', '&#160;'), ' ', $input);

		// This fixes issues such as '<h1>Title</h1><p>Paragraph</p>'
		// being transformed into 'TitleParagraph' with no space.
		$input = str_replace('>', '> ', $input);

		// Strip HTML tags.
		$input = strip_tags($input);

		return parent::parse($input);
	}

	/**
	 * Method to process input that has already been reduced to plain text.
	 *
	 * Only whitespace normalisation happens here; tag and entity handling is
	 * done in parse() before the text reaches this method.
	 *
	 * @param   string  $input  The input to process.
	 *
	 * @return  string  The plain text input.
	 *
	 * @since   2.5
	 */
	protected function process($input)
	{
		// Replace any amount of white space with a single space.
		$input = trim(preg_replace('#\s+#u', ' ', $input));

		return $input;
	}

	/**
	 * Method to remove blocks of text between a start and an end tag.
	 * Each block removed is effectively replaced by a single space.
	 *
	 * Tags are matched case-insensitively. A start tag that has no matching end
	 * tag is left in place together with everything that follows it.
	 *
	 * Note: The start tag and the end tag must be different.
	 * Note: Blocks must not be nested.
	 * Note: Offsets are byte offsets; this is safe for UTF-8 input as long as the
	 *       tags themselves are plain ASCII.
	 *
	 * @param   string  $input     String to be processed.
	 * @param   string  $startTag  String representing the start tag.
	 * @param   string  $endTag    String representing the end tag.
	 *
	 * @return  string  The input with all complete blocks removed.
	 */
	private function removeBlocks($input, $startTag, $endTag)
	{
		$return = '';
		$offset = 0;
		$startTagLength = strlen($startTag);
		$endTagLength = strlen($endTag);

		// Find the first start tag.
		$start = stripos($input, $startTag);

		// If no start tags were found, return the string unchanged.
		if ($start === false)
		{
			return $input;
		}

		// Look for all blocks defined by the start and end tags.
		while ($start !== false)
		{
			// Look for an end tag corresponding to the start tag.
			$end = stripos($input, $endTag, $start + $startTagLength);

			// If no corresponding end tag, leave the rest of the string alone.
			// Nothing has been accumulated for this start tag yet, so the tail
			// appended after the loop will not duplicate any text.
			if ($end === false)
			{
				break;
			}

			// Accumulate the substring up to the start tag; the block itself
			// is replaced by a single space.
			$return .= substr($input, $offset, $start - $offset) . ' ';

			// Advance the offset past the end tag.
			$offset = $end + $endTagLength;

			// Look for the next start tag and loop.
			$start = stripos($input, $startTag, $offset);
		}

		// Add in the final substring after the last complete block.
		$return .= substr($input, $offset);

		return $return;
	}
}

0 comments on commit 6fd968c

Please sign in to comment.