Skip to content
Permalink
Browse files

Detailed parsing of word-like lexemes in lexer

  • Loading branch information...
isagalaev committed Sep 3, 2015
1 parent a16d9d3 commit e99526c3415c2cb14b36b2e3c276c071dd99558f
Showing with 35 additions and 28 deletions.
  1. +27 −17 src/lexer.rs
  2. +8 −11 src/parser.rs
@@ -13,7 +13,7 @@ fn is_whitespace(value: u8) -> bool {
}
}

fn is_scalar(value: u8) -> bool {
fn is_word(value: u8) -> bool {
match value {
b'a' ... b'z' | b'0' ... b'9' |
b'E' | b'.' | b'+' | b'-' => true,
@@ -74,7 +74,9 @@ fn unescape(lexeme: &[u8]) -> Result<String> {
#[derive(Debug, PartialEq)]
pub enum Lexeme {
String(String),
Scalar(String),
Number(f64),
Boolean(bool),
Null,
OBrace,
CBrace,
OBracket,
@@ -151,23 +153,11 @@ impl<T: io::Read> Iterator for Lexer<T> {
}
self.pos += 1;
Lexeme::String(itry!(unescape(&result[..])))
} else if !is_scalar(self.buf[self.pos]) {
let ch = self.buf[self.pos];
self.pos += 1;
match ch {
b'{' => Lexeme::OBrace,
b'}' => Lexeme::CBrace,
b'[' => Lexeme::OBracket,
b']' => Lexeme::CBracket,
b',' => Lexeme::Comma,
b':' => Lexeme::Colon,
_ => return Some(Err(Error::Unknown(ch.to_string()))),
}
} else {
} else if is_word(self.buf[self.pos]) {
let mut result = vec![];
loop {
let start = self.pos;
while self.pos < self.len && is_scalar(self.buf[self.pos]) {
while self.pos < self.len && is_word(self.buf[self.pos]) {
self.pos += 1;
}
result.extend(self.buf[start..self.pos].iter().cloned());
@@ -176,7 +166,27 @@ impl<T: io::Read> Iterator for Lexer<T> {
_ => break,
}
}
Lexeme::Scalar(itry!(str::from_utf8(&result[..])).to_string())
match &result[..] {
b"true" => Lexeme::Boolean(true),
b"false" => Lexeme::Boolean(false),
b"null" => Lexeme::Null,
_ => {
let s = unsafe { str::from_utf8_unchecked(&result[..]).to_string() };
Lexeme::Number(itry!(s.parse().map_err(|_| Error::Unknown(s))))
}
}
} else {
let ch = self.buf[self.pos];
self.pos += 1;
match ch {
b'{' => Lexeme::OBrace,
b'}' => Lexeme::CBrace,
b'[' => Lexeme::OBracket,
b']' => Lexeme::CBracket,
b',' => Lexeme::Comma,
b':' => Lexeme::Colon,
_ => return Some(Err(Error::Unknown(ch.to_string()))),
}
}))
}
}
@@ -48,21 +48,18 @@ impl<T: Read> Parser<T> {
self.lexer.next().unwrap_or(Err(Error::MoreLexemes))
}

fn process_event(&self, lexeme: Lexeme) -> Result<Event> {
Ok(match lexeme {
fn process_event(&self, lexeme: Lexeme) -> Event {
match lexeme {
Lexeme::OBracket => Event::StartArray,
Lexeme::OBrace => Event::StartMap,
Lexeme::CBracket => Event::EndArray,
Lexeme::CBrace => Event::EndMap,
Lexeme::String(s) => Event::String(s),
Lexeme::Scalar(ref s) if s == "null" => Event::Null,
Lexeme::Scalar(ref s) if s == "true" => Event::Boolean(true),
Lexeme::Scalar(ref s) if s == "false" => Event::Boolean(false),
Lexeme::Scalar(s) => {
Event::Number(try!(s.parse().map_err(|_| Error::Unknown(s))))
},
_ => unreachable!(),
})
Lexeme::Number(n) => Event::Number(n),
Lexeme::Null => Event::Null,
Lexeme::Boolean(b) => Event::Boolean(b),
Lexeme::Comma | Lexeme::Colon => unreachable!(),
}
}

}
@@ -106,7 +103,7 @@ impl<T: Read> Iterator for Parser<T> {
State::Comma
};

return Some(self.process_event(lexeme))
return Some(Ok(self.process_event(lexeme)))
}
State::Key(can_close) => {
if let Some(&Ok(Lexeme::CBrace)) = self.lexer.peek() {

2 comments on commit e99526c

@Suor

This comment has been minimized.

Copy link

Suor replied Nov 12, 2015

You are still making copies of vectors "true", "false" and such, as well as real strings. You can try representing strings by offset + length and matching named constants inplace.

And comments on your blog don't work. Switch to Disqus?

@isagalaev

This comment has been minimized.

Copy link
Owner Author

isagalaev replied Nov 13, 2015

> You are still making copies of vectors "true", "false" and such, as well as real strings. You can try representing strings by offset + length and matching named constants inplace.

It's not that simple, unfortunately, as it won't work when a lexeme crosses the buffer boundary. Still, I implemented copy-free checking of "true", "false" and "null" in 7e341b3. In short, I walk through the buffer in the usual way and check each character against the known str.

And avoiding copies of quoted strings is a whole other story :-)

> And comments on your blog don't work. Switch to Disqus?

They actually do work; it just didn't show you that your comment was waiting for moderation. I'll have a look at it, thanks for the heads-up!

Please sign in to comment.
You can’t perform that action at this time.