Skip to content
This repository has been archived by the owner on Sep 20, 2021. It is now read-only.

Commit

Permalink
New lexing algorithm: “offset” instead of “anchor”.
Browse files Browse the repository at this point in the history
Now, the lexer keeps the text untouched, in the same buffer/string, and
uses the `$offset` argument of `preg_match`. It avoids some `mb_*`
calls, the creation of a substring for each consumed token, etc.

The lexer is 1.38 times faster and requires 1.27 times less memory.
  • Loading branch information
Hywan committed Oct 7, 2013
1 parent 3fbc481 commit 4dad944
Showing 1 changed file with 27 additions and 20 deletions.
47 changes: 27 additions & 20 deletions Llk/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ public function lexMe ( $text, Array $tokens ) {
$this->_tokens = $tokens;
$this->_nsStack = null;
$offset = 0;
$maxOffset = strlen($this->_text);
$tokenized = array();
$this->_lexerState = 'default';
$stack = false;
Expand Down Expand Up @@ -141,9 +142,9 @@ public function lexMe ( $text, Array $tokens ) {
if(true == $stack)
$this->_nsStack = new \SplStack();

while(0 < strlen($this->_text)) {
while($offset < $maxOffset) {

$nextToken = $this->nextToken();
$nextToken = $this->nextToken($offset);

if(null === $nextToken)
throw new \Hoa\Compiler\Exception\UnrecognizedToken(
Expand All @@ -159,8 +160,7 @@ public function lexMe ( $text, Array $tokens ) {
$tokenized[] = $nextToken;
}

$offset += $nextToken['length'];
$this->_text = mb_substr($this->_text, $nextToken['length']);
$offset += strlen($nextToken['value']);
}

$tokenized[] = array(
Expand All @@ -179,10 +179,11 @@ public function lexMe ( $text, Array $tokens ) {
* Compute the next token recognized at the beginning of the string.
*
* @access protected
* @param int $offset Offset.
* @return array
* @throw \Hoa\Compiler\Exception\UnrecognizedToken
*/
protected function nextToken ( ) {
protected function nextToken ( $offset ) {

$tokenArray = &$this->_tokens[$this->_lexerState];

Expand All @@ -193,7 +194,7 @@ protected function nextToken ( ) {
if(null === $nextState)
$nextState = $this->_lexerState;

$out = $this->matchLexeme($lexeme, $regex);
$out = $this->matchLexeme($lexeme, $regex, $offset);

if(null !== $out) {

Expand Down Expand Up @@ -248,28 +249,34 @@ protected function nextToken ( ) {
* @access protected
* @param string $lexeme Name of the lexeme.
* @param string $regex Regular expression describing the lexeme.
* @param int $offset Offset.
* @return array
* @throw \Hoa\Compiler\Exception\Lexer
*/
protected function matchLexeme ( $lexeme, $regex ) {
protected function matchLexeme ( $lexeme, $regex, $offset ) {

$_regex = str_replace('#', '\#', $regex);
$preg = preg_match(
'#(?|' . $_regex . ')#u',
$this->_text,
$matches,
PREG_OFFSET_CAPTURE,
$offset
);

if(0 !== preg_match('#^(?:' . $_regex . ')#u', $this->_text, $matches)) {

if('' === $matches[0])
throw new \Hoa\Compiler\Exception\Lexer(
'A lexeme must not match an empty value, which is the ' .
'case of "%s" (%s).', 3, array($lexeme, $regex));
if(0 === $preg || $offset !== $matches[0][1])
return null;

return array(
'token' => $lexeme,
'value' => $matches[0],
'length' => mb_strlen($matches[0])
);
}
if('' === $matches[0])
throw new \Hoa\Compiler\Exception\Lexer(
'A lexeme must not match an empty value, which is the ' .
'case of "%s" (%s).', 3, array($lexeme, $regex));

return null;
return array(
'token' => $lexeme,
'value' => $matches[0][0],
'length' => mb_strlen($matches[0][0])
);
}
}

Expand Down

0 comments on commit 4dad944

Please sign in to comment.