Permalink
Browse files

Merge pull request #34 from heiglandreas/addXmlTokenizer

Add xml tokenizer
  • Loading branch information...
2 parents 3a89ada + a521626 commit acdd95783c2e6250877f1f0dc3153ccc7128de2f @heiglandreas committed Jul 27, 2017
Showing with 227 additions and 1 deletion.
  1. +21 −1 doc/examples.xml
  2. +118 −0 src/Tokenizer/XmlTokenizer.php
  3. +28 −0 tests/HyphenatorFeatureTest.php
  4. +60 −0 tests/Tokenizer/XmlTokenizerTest.php
View
@@ -59,6 +59,26 @@ echo $h->hyphenate('We have some really long words in german like sauerstofffeld
// Thanks to lsmith for the idea!]]></programlisting>
</para>
</sect1>
+ <sect1>
+ <title>Hyphenate HTML-content</title>
+ <para>
+ <programlisting language="php"><![CDATA[use \Org\Heigl\Hyphenator as h;
+$o = new h\Options();
+$o->setHyphen('-')
+ ->setDefaultLocale('de_DE')
+ ->setRightMin(2)
+ ->setLeftMin(2)
+ ->setWordMin(5)
+ ->setFilters('Simple')
+ ->setTokenizers('Xml, Whitespace, Punctuation');
+$h = new h\Hyphenator();
+$h->setOptions($o);
+
+echo $h->hyphenate('<h1>Long Words</h1><p>We have some really long words in german<br />like sauerstofffeldflasche.</p>');
+// prints <h1>Long Words</h1><p>We have some re-al-ly long words in ger-man<br />like sau-er-stoff-feld-fla-sche.</p>
+// Thanks to mablae for the idea!]]></programlisting>
+ </para>
+ </sect1>
<caution>
<title>Performance-Hint</title>
<para>
@@ -71,4 +91,4 @@ echo $h->hyphenate('We have some really long words in german like sauerstofffeld
second on a 2.5GHz Intel Core2 Duo using 4GB RAM.
</para>
</caution>
-</chapter>
+</chapter>
@@ -0,0 +1,118 @@
+<?php
+/**
+ * Copyright (c) 2008-2011 Andreas Heigl<andreas@heigl.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @category Hyphenation
+ * @package Org_Heigl_Hyphenator
+ * @subpackage Tokenizer
+ * @author Andreas Heigl <andreas@heigl.org>
+ * @copyright 2008-2011 Andreas Heigl<andreas@heigl.org>
+ * @license http://www.opensource.org/licenses/mit-license.php MIT-License
+ * @version 2.0.1
+ * @link http://github.com/heiglandreas/Hyphenator
+ * @since 11.11.2011
+ */
+
+namespace Org\Heigl\Hyphenator\Tokenizer;
+
+/**
+ * Use XML/HTML tags to split any input into tokens
+ *
+ * @category Hyphenation
+ * @package Org_Heigl_Hyphenator
+ * @subpackage Tokenizer
+ * @author Andreas Heigl <andreas@heigl.org>
+ * @copyright 2008-2011 Andreas Heigl<andreas@heigl.org>
+ * @license http://www.opensource.org/licenses/mit-license.php MIT-License
+ * @version 2.0.1
+ * @link http://github.com/heiglandreas/Hyphenator
+ * @since 04.11.2011
+ */
+class XmlTokenizer implements Tokenizer
+{
+    /**
+     * Split the given input into tokens using XML/HTML tags as splitter
+     *
+     * The input can be a string or a TokenRegistry. If the input is a
+     * TokenRegistry, every WordToken it contains is tokenized again and
+     * replaced by the resulting tokens. All other tokens are left untouched.
+     *
+     * @param string|\Org\Heigl\Hyphenator\Tokenizer\TokenRegistry $input The
+     * input to be tokenized
+     *
+     * @return \Org\Heigl\Hyphenator\Tokenizer\TokenRegistry
+     */
+    public function run($input)
+    {
+        if ($input instanceof TokenRegistry) {
+            // Tokenize a TokenRegistry
+            foreach ($input as $token) {
+                if (! $token instanceof WordToken) {
+                    continue;
+                }
+                $newTokens = $this->_tokenize($token->get());
+                // Loose comparison on purpose: _tokenize() always creates new
+                // token instances, so an identity check could never match.
+                // Nothing has to be replaced when the token was not split.
+                if ($newTokens == array($token)) {
+                    continue;
+                }
+                $input->replace($token, $newTokens);
+            }
+
+            return $input;
+        }
+
+        // Tokenize a simple string.
+        $array = $this->_tokenize($input);
+        $registry = new TokenRegistry();
+        foreach ($array as $item) {
+            $registry->add($item);
+        }
+
+        return $registry;
+    }
+
+    /**
+     * Split the given string into tokens using XML/HTML tags.
+     *
+     * Each tag is placed in a NonWordToken and everything between the tags
+     * is placed in a WordToken. Empty segments are skipped.
+     *
+     * @param string $input The String to tokenize
+     *
+     * @return Token[]
+     */
+    protected function _tokenize($input)
+    {
+        $splits = preg_split("/(<\/?[^>]+\/?>)/u", $input, -1, PREG_SPLIT_DELIM_CAPTURE);
+        if (false === $splits) {
+            // preg_split() fails with the u-modifier when the subject is not
+            // valid UTF-8. Keep the input as one single token instead of
+            // silently dropping the text.
+            return array(new WordToken($input));
+        }
+
+        $tokens = array();
+        foreach ($splits as $split) {
+            // Compare with the empty string explicitly. A plain "! $split"
+            // would also discard the text "0".
+            if ('' === $split) {
+                continue;
+            }
+            if (0 === mb_strpos($split, '<')) {
+                $tokens[] = new NonWordToken($split);
+                continue;
+            }
+            $tokens[] = new WordToken($split);
+        }
+
+        return $tokens;
+    }
+}
@@ -110,4 +110,32 @@ public function hyphenationOfSingleWordWithDefaultOutputProvider()
['urinstinkt ', 'de_DE', 'ur^instinkt ', h\Hyphenator::QUALITY_HIGHEST], // Sturm will not be hyphenated…
];
}
+
+ /**
+ * @dataProvider hyphenationOfHtmlWithDefaultOutputProvider
+ */
+    public function testHyphenationOfHtmlWithDefaultOutput($html, $language, $expected, $quality = 9)
+    {
+        // Configure hyphenation with "^" as visible hyphen and the XML
+        // tokenizer in front of the whitespace and punctuation tokenizers.
+        $options = new h\Options();
+        $options->setHyphen('^')
+            ->setDefaultLocale($language)
+            ->setRightMin(2)
+            ->setLeftMin(2)
+            ->setWordMin(4)
+            ->setFilters('Simple')
+            ->setQuality($quality)
+            ->setTokenizers('Xml, Whitespace, Punctuation');
+
+        $hyphenator = new h\Hyphenator();
+        $hyphenator->setOptions($options);
+
+        $actual = $hyphenator->hyphenate($html);
+
+        $this->assertEquals($expected, $actual);
+    }
+
+    public function hyphenationOfHtmlWithDefaultOutputProvider()
+    {
+        // Each data set holds: markup to hyphenate, locale, expected output
+        // and the hyphenation quality.
+        $markupWithLineBreak = [
+            '<xml>Otto<br/>Aussichtsturm</html>',
+            'de_DE',
+            '<xml>Ot^to<br/>Aus^sicht^sturm</html>',
+            h\Hyphenator::QUALITY_NORMAL,
+        ];
+
+        return [$markupWithLineBreak];
+    }
}
@@ -0,0 +1,60 @@
+<?php
+/**
+ * Copyright (c) 2008-2011 Andreas Heigl<andreas@heigl.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @category Hyphenator
+ * @package Org\Heigl\Hyphenator
+ * @author Andreas Heigl <andreas@heigl.org>
+ * @copyright 2008-2011 Andreas Heigl<andreas@heigl.org>
+ * @license http://www.opensource.org/licenses/mit-license.php MIT-License
+ * @version 2.0.1
+ * @since 02.11.2011
+ */
+
+namespace Org\Heigl\HyphenatorTest\Tokenizer;
+
+use \Org\Heigl\Hyphenator\Tokenizer as t;
+
+/**
+ * This class tests the functionality of the class XmlTokenizer
+ *
+ * @category Hyphenator
+ * @package Org\Heigl\Hyphenator
+ * @author Andreas Heigl <andreas@heigl.org>
+ * @copyright 2008-2011 Andreas Heigl<andreas@heigl.org>
+ * @license http://www.opensource.org/licenses/mit-license.php MIT-License
+ * @version 2.0.1
+ * @since 02.11.2011
+ */
+class XmlTokenizerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Tokenizing a plain string yields NonWordTokens for the tags and a
+     * WordToken for the text between them.
+     */
+    public function testTokenizingString()
+    {
+        $expected = new t\TokenRegistry();
+        $expected->add(new t\NonWordToken('<xml>'))
+            ->add(new t\WordToken('Foo'))
+            ->add(new t\NonWordToken('<br/>'))
+            ->add(new t\NonWordToken('</html>'));
+
+        $xmlTokenizer = new t\XmlTokenizer();
+        $actual = $xmlTokenizer->run('<xml>Foo<br/></html>');
+
+        $this->assertEquals($expected, $actual);
+    }
+}

0 comments on commit acdd957

Please sign in to comment.