Improved HTML parser for Smart Search

joomla · Dec 7, 2014 · 6fd968c · 6fd968c
1 parent 9c91c70
commit 6fd968c
Showing 1 changed file with 90 additions and 17 deletions.
diff --git a/administrator/components/com_finder/helpers/indexer/parser/html.php b/administrator/components/com_finder/helpers/indexer/parser/html.php
@@ -19,44 +19,117 @@
 class FinderIndexerParserHtml extends FinderIndexerParser
 {
 	/**
-	 * Method to process HTML input and extract the plain text.
+	 * Method to parse input and extract the plain text. Because this method is
+	 * called from both inside and outside the indexer, it needs to be able to
+	 * batch out its parsing functionality to deal with the inefficiencies of
+	 * regular expressions. We will parse recursively in 2KB chunks.
 	 *
-	 * @param   string  $input  The input to process.
+	 * @param   string  $input  The input to parse.
 	 *
 	 * @return  string  The plain text input.
 	 *
 	 * @since   2.5
 	 */
-	protected function process($input)
+	public function parse($input)
 	{
 		// Strip invalid UTF-8 characters.
 		$input = iconv("utf-8", "utf-8//IGNORE", $input);
 
-		// Strip all script tags.
-		$input = preg_replace('#<script[^>]*>.*?</script>#si', ' ', $input);
+		// Convert <style> tags to <script> tags so we can remove them efficiently.
+		$input = str_replace(array('<style', '</style'), array('<script', '</script'), $input);
 
-		// Strip the tags from the input
-		$input = strip_tags($input);
+		// Strip all script blocks.
+		$input = $this->removeBlocks($input, '<script', '</script>');
+
+		// Decode HTML entities.
+		$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
 
-		// Deal with spacing issues in the input
+		// Convert entities equivalent to spaces to actual spaces.
 		$input = str_replace(array('&nbsp;', '&#160;'), ' ', $input);
+
+		// This fixes issues such as '<h1>Title</h1><p>Paragraph</p>'
+		// being transformed into 'TitleParagraph' with no space.
+		$input = str_replace('>', '> ', $input);
+
+		// Strip HTML tags.
+		$input = strip_tags($input);
+
+		return parent::parse($input);
+	}
+
+	/**
+	 * Method to process HTML input and extract the plain text.
+	 *
+	 * @param   string  $input  The input to process.
+	 *
+	 * @return  string  The plain text input.
+	 *
+	 * @since   2.5
+	 */
+	protected function process($input)
+	{
+		// Replace any amount of white space with a single space.
 		$input = trim(preg_replace('#\s+#u', ' ', $input));
 
-		// Remove last parts of HTML code which may be caused by a cut of the string
-		if (strpos($input, '>') !== false)
+		return $input;
+	}
+
+	/**
+	 * Method to remove blocks of text between a start and an end tag.
+	 * Each block removed is effectively replaced by a single space.
+	 *
+	 * Note: The start tag and the end tag must be different.
+	 * Note: Blocks must not be nested.
+	 * Note: This method will function correctly with multi-byte strings.
+	 *
+	 * @param   string  $input     String to be processed.
+	 * @param   string  $startTag  String representing the start tag.
+	 * @param   string  $endTag    String representing the end tag.
+	 *
+	 * @return  string with blocks removed.
+	 */
+	private function removeBlocks($input, $startTag, $endTag)
+	{
+		$return = '';
+		$blocks = array();
+		$offset = 0;
+		$startTagLength = strlen($startTag);
+		$endTagLength = strlen($endTag);
+
+		// Find the first start tag.
+		$start = stripos($input, $startTag);
+
+		// If no start tags were found, return the string unchanged.
+		if ($start === false)
 		{
-			$input = substr($input, strpos($input, '>') + 1);
+			return $input;
 		}
 
-		if (strpos($input, '<') !== false)
+		// Look for all blocks defined by the start and end tags.
+		while ($start !== false)
 		{
-			$input = substr($input, 0, strpos($input, '<'));
+			// Accumulate the substring up to the start tag.
+			$return .= substr($input, $offset, $start - $offset) . ' ';
+
+			// Look for an end tag corresponding to the start tag.
+			$end = stripos($input, $endTag, $start + $startTagLength);
+
+			// If no corresponding end tag, leave the string alone.
+			if ($end === false)
+			{
+				break;
+			}
+
+			// Advance the start position.
+			$offset = $end + $endTagLength;
+
+			// Look for the next start tag and loop.
+			$start = stripos($input, $startTag, $offset);
 		}
 
-		// Decode entities and remove unneeded white spaces
-		$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
-		$input = trim(preg_replace('#\s+#u', ' ', $input));
+		// Add in the final substring after the last end tag.
+		$return .= substr($input, $offset);
 
-		return $input;
+		return $return;
 	}
 }