Permalink
Browse files

Crude lexeme typing

- Distinguish strings, "scalars" and single-byte lexemes
- Everything was forced to compile, hence a lot of ugliness everywhere
- Tests pass!
  • Loading branch information...
isagalaev committed Aug 31, 2015
1 parent b3a8d4a commit 02951fef7525c623b3fa88a87d0d2fa167c1066c
Showing with 113 additions and 93 deletions.
  1. +21 −7 src/errors.rs
  2. +33 −12 src/lexer.rs
  3. +59 −74 src/parser.rs
View
@@ -1,4 +1,6 @@
use std::{io, str, error, fmt, result};
use std::{io, str, string, error, fmt, result};
use ::lexer::Lexeme;
#[macro_export]
@@ -15,11 +17,13 @@ macro_rules! itry {
pub enum Error {
Unterminated,
IO(io::Error),
Unexpected(String),
Utf8(str::Utf8Error),
Unknown(String),
Unexpected(Lexeme),
Utf8(string::FromUtf8Error),
Utf8s(str::Utf8Error),
Escape(String),
MoreLexemes,
Unmatched(char),
Unmatched(Lexeme),
AdditionalData,
}
@@ -28,11 +32,13 @@ impl fmt::Display for Error {
match *self {
Error::Unterminated => write!(f, "{}", self),
Error::IO(_) => write!(f, "I/O Error: {}", self),
Error::Unexpected(ref s) => write!(f, "Unexpected lexeme: '{}'", s),
Error::Unknown(ref s) => write!(f, "Unexpected lexeme: '{}'", s),
Error::Unexpected(ref s) => write!(f, "Unexpected lexeme: '{:?}'", s),
Error::Utf8(ref e) => write!(f, "UTF8 Error: {}", e),
Error::Utf8s(ref e) => write!(f, "UTF8 Error: {}", e),
Error::Escape(ref s) => write!(f, "Malformed escape: '{}'", s),
Error::MoreLexemes => write!(f, "More lexemes expected"),
Error::Unmatched(ref c) => write!(f, "Unmatched container terminator: {}", c),
Error::Unmatched(ref s) => write!(f, "Unmatched container terminator: {:?}", s),
Error::AdditionalData => write!(f, "Additional data in the source stream after parsed value"),
}
}
@@ -43,8 +49,10 @@ impl error::Error for Error {
match *self {
Error::Unterminated => "unterminated string",
Error::IO(ref e) => e.description(),
Error::Unknown(..) => "unknown lexeme",
Error::Unexpected(..) => "unexpected lexeme",
Error::Utf8(ref e) => e.description(),
Error::Utf8s(ref e) => e.description(),
Error::Escape(..) => "malformed escape",
Error::MoreLexemes => "more lexemes expected",
Error::Unmatched(..) => "unmatched container terminator",
@@ -68,9 +76,15 @@ impl From<io::Error> for Error {
}
}
impl From<string::FromUtf8Error> for Error {
fn from(e: string::FromUtf8Error) -> Self {
Error::Utf8(e)
}
}
impl From<str::Utf8Error> for Error {
fn from(e: str::Utf8Error) -> Self {
Error::Utf8(e)
Error::Utf8s(e)
}
}
View
@@ -1,6 +1,6 @@
use std::io;
use ::errors::{Error, ResultIterator};
use ::errors::{Error, Result, ResultIterator};
const BUFSIZE: usize = 64 * 1024;
@@ -13,14 +13,26 @@ fn is_whitespace(value: u8) -> bool {
}
}
fn is_lexeme(value: u8) -> bool {
/// Reports whether `value` may appear inside a "scalar" lexeme.
///
/// Accepted bytes are the ASCII lowercase letters `a`-`z`, the ASCII digits
/// `0`-`9`, and the four bytes `E`, `.`, `+` and `-`. Every other byte
/// yields `false`.
fn is_scalar(value: u8) -> bool {
    let lowercase = b'a' <= value && value <= b'z';
    let digit = b'0' <= value && value <= b'9';
    // Extra bytes accepted besides lowercase letters and digits.
    let extra = value == b'E' || value == b'.' || value == b'+' || value == b'-';
    lowercase || digit || extra
}
/// A single typed token produced by the lexer.
///
/// Lexemes fall into three groups: quoted strings, "scalars" (unquoted runs
/// of scalar bytes) and single-byte punctuation.
///
/// `Clone` and `Eq` are derived in addition to `Debug` and `PartialEq` so
/// that callers can duplicate and compare lexemes freely; every payload is a
/// plain `String`, so both derives are always available.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Lexeme {
    /// Text of a double-quoted string; the parser hands it to `unescape`
    /// as-is when producing a string event.
    String(String),
    /// An unquoted run of scalar bytes. The parser interprets it as `null`,
    /// `true`, `false`, or otherwise tries to parse it as a number.
    Scalar(String),
    /// The byte `{`.
    OBrace,
    /// The byte `}`.
    CBrace,
    /// The byte `[`.
    OBracket,
    /// The byte `]`.
    CBracket,
    /// The byte `,`.
    Comma,
    /// The byte `:`.
    Colon,
}
enum Buffer {
Within,
Reset,
@@ -60,7 +72,7 @@ impl<T: io::Read> Lexer<T> {
}
impl<T: io::Read> Iterator for Lexer<T> {
type Item = Result<Vec<u8>, Error>;
type Item = Result<Lexeme>;
fn next(&mut self) -> Option<Self::Item> {
while match itry!(self.ensure_buffer()) {
@@ -70,9 +82,8 @@ impl<T: io::Read> Iterator for Lexer<T> {
self.pos += 1;
}
let mut result = vec![];
if self.buf[self.pos] == b'"' {
result.push(b'"');
Some(Ok(if self.buf[self.pos] == b'"' {
let mut result = vec![];
let mut escaped = false;
self.pos += 1;
loop {
@@ -89,14 +100,24 @@ impl<T: io::Read> Iterator for Lexer<T> {
}
}
self.pos += 1;
result.push(b'"');
} else if !is_lexeme(self.buf[self.pos]) {
result.push(self.buf[self.pos]);
Lexeme::String(itry!(String::from_utf8(result)))
} else if !is_scalar(self.buf[self.pos]) {
let ch = self.buf[self.pos];
self.pos += 1;
match ch {
b'{' => Lexeme::OBrace,
b'}' => Lexeme::CBrace,
b'[' => Lexeme::OBracket,
b']' => Lexeme::CBracket,
b',' => Lexeme::Comma,
b':' => Lexeme::Colon,
_ => return Some(Err(Error::Unknown(ch.to_string()))),
}
} else {
let mut result = vec![];
loop {
let start = self.pos;
while self.pos < self.len && is_lexeme(self.buf[self.pos]) {
while self.pos < self.len && is_scalar(self.buf[self.pos]) {
self.pos += 1;
}
result.extend(self.buf[start..self.pos].iter().cloned());
@@ -105,7 +126,7 @@ impl<T: io::Read> Iterator for Lexer<T> {
_ => break,
}
}
}
Some(Ok(result))
Lexeme::Scalar(itry!(String::from_utf8(result)))
}))
}
}
View
@@ -2,7 +2,7 @@ use std::io::Read;
use std::iter::Peekable;
use std::{str, char};
use ::lexer;
use ::lexer::{Lexer, Lexeme};
use ::errors::{Error, Result, ResultIterator};
@@ -20,11 +20,6 @@ pub enum Event {
EndMap,
}
#[inline]
fn unexpected(lexeme: Vec<u8>) -> Option<Result<Event>> {
Some(Err(Error::Unexpected(str::from_utf8(&lexeme[..]).unwrap().to_string())))
}
#[derive(Debug)]
enum State {
Closed,
@@ -34,11 +29,6 @@ enum State {
Comma,
}
#[inline]
fn trim(lexeme: &[u8]) -> &[u8] {
&lexeme[1..lexeme.len() - 1]
}
#[inline]
fn hexdecode(s: &[u8]) -> Option<char> {
let mut value = 0;
@@ -51,10 +41,10 @@ fn hexdecode(s: &[u8]) -> Option<char> {
char::from_u32(value)
}
fn unescape(lexeme: &[u8]) -> Result<String> {
fn unescape(lexeme_str: String) -> Result<String> {
let lexeme = lexeme_str.as_bytes();
let len = lexeme.len();
let mut result = String::with_capacity(lexeme.len());
let mut result = String::with_capacity(len);
let mut pos = 0;
while pos < len {
let start = pos;
@@ -91,48 +81,39 @@ fn unescape(lexeme: &[u8]) -> Result<String> {
}
pub struct Parser<T: Read> {
lexer: Peekable<ResultIterator<lexer::Lexer<T>>>,
stack: Vec<u8>,
lexer: Peekable<ResultIterator<Lexer<T>>>,
stack: Vec<Lexeme>,
state: State,
}
impl<T: Read> Parser<T> {
pub fn new(f: T) -> ResultIterator<Parser<T>> {
ResultIterator::new(Parser {
lexer: lexer::Lexer::new(f).peekable(),
lexer: Lexer::new(f).peekable(),
stack: vec![],
state: State::Event(false),
})
}
fn consume_lexeme(&mut self) -> Result<Vec<u8>> {
fn consume_lexeme(&mut self) -> Result<Lexeme> {
self.lexer.next().unwrap_or(Err(Error::MoreLexemes))
}
fn check_lexeme(&mut self, lexemes: &[&[u8]]) -> bool {
match self.lexer.peek() {
None | Some(&Err(..)) => false,
Some(&Ok(ref next)) => {
lexemes.iter().any(|l| *l == &next[..])
}
}
}
fn process_event(&self, lexeme: &[u8]) -> Result<Event> {
fn process_event(&self, lexeme: Lexeme) -> Result<Event> {
Ok(match lexeme {
b"null" => Event::Null,
b"true" => Event::Boolean(true),
b"false" => Event::Boolean(false),
b"[" => Event::StartArray,
b"{" => Event::StartMap,
b"]" => Event::EndArray,
b"}" => Event::EndMap,
_ if lexeme[0] == b'"' => Event::String(try!(unescape(trim(lexeme)))),
_ => {
let s = try!(str::from_utf8(lexeme));
Event::Number(try!(s.parse().map_err(|_| Error::Unexpected(str::from_utf8(lexeme).unwrap().to_string()))))
}
Lexeme::OBracket => Event::StartArray,
Lexeme::OBrace => Event::StartMap,
Lexeme::CBracket => Event::EndArray,
Lexeme::CBrace => Event::EndMap,
Lexeme::String(s) => Event::String(try!(unescape(s))),
Lexeme::Scalar(ref s) if s == "null" => Event::Null,
Lexeme::Scalar(ref s) if s == "true" => Event::Boolean(true),
Lexeme::Scalar(ref s) if s == "false" => Event::Boolean(false),
Lexeme::Scalar(s) => {
Event::Number(try!(s.parse().map_err(|_| Error::Unknown(s))))
},
_ => unreachable!(),
})
}
@@ -153,68 +134,72 @@ impl<T: Read> Iterator for Parser<T> {
State::Event(can_close) => {
let lexeme = itry!(self.consume_lexeme());
match &lexeme[..] {
b"]" | b"}" if !can_close => return unexpected(lexeme),
b"[" | b"{" => self.stack.push(lexeme[0]),
b"]" | b"}" => {
let expected = if lexeme[0] == b']' { b'[' } else { b'{' };
match &lexeme {
&Lexeme::CBracket | &Lexeme::CBrace if !can_close => return Some(Err(Error::Unexpected(lexeme))),
&Lexeme::OBracket => self.stack.push(Lexeme::OBracket),
&Lexeme::OBrace => self.stack.push(Lexeme::OBrace),
&Lexeme::CBracket | &Lexeme::CBrace => {
let expected = if Lexeme::CBracket == lexeme { Lexeme::OBracket } else { Lexeme::OBrace };
match self.stack.pop() {
Some(value) if value == expected => (),
_ => return Some(Err(Error::Unmatched(lexeme[0] as char))),
Some(ref value) if *value == expected => (),
_ => return Some(Err(Error::Unmatched(lexeme))),
}
}
_ => ()
};
self.state = if self.stack.len() == 0 {
State::Closed
} else if lexeme == b"[" {
} else if lexeme == Lexeme::OBracket {
State::Event(true)
} else if lexeme == b"{" {
} else if lexeme == Lexeme::OBrace {
State::Key(true)
} else {
State::Comma
};
return Some(self.process_event(&lexeme))
return Some(self.process_event(lexeme))
}
State::Key(can_close) => {
if self.check_lexeme(&[b"}"]) {
if let Some(&Ok(Lexeme::CBrace)) = self.lexer.peek() {
if !can_close {
return unexpected(vec![b'}'])
return Some(Err(Error::Unexpected(Lexeme::CBrace)))
}
self.state = State::Event(true);
continue;
}
let lexeme = itry!(self.consume_lexeme());
if lexeme[0] != b'"' {
return unexpected(lexeme)
}
self.state = State::Colon;
let s = itry!(str::from_utf8(trim(&lexeme)));
return Some(Ok(Event::Key(s.to_string())))
return Some(match itry!(self.consume_lexeme()) {
Lexeme::String(s) => {
self.state = State::Colon;
Ok(Event::Key(s))
}
lexeme => Err(Error::Unexpected(lexeme))
})
}
State::Colon => {
let lexeme = itry!(self.consume_lexeme());
if lexeme != b":" {
return unexpected(lexeme)
match itry!(self.consume_lexeme()) {
Lexeme::Colon => self.state = State::Event(false),
lexeme => return Some(Err(Error::Unexpected(lexeme))),
}
self.state = State::Event(false);
}
State::Comma => {
if self.check_lexeme(&[b"]", b"}"]) {
self.state = State::Event(true);
continue;
match self.lexer.peek() {
Some(&Ok(Lexeme::CBrace)) | Some(&Ok(Lexeme::CBracket)) => {
self.state = State::Event(true);
continue
}
_ => (),
}
let lexeme = itry!(self.consume_lexeme());
if lexeme != b"," {
return unexpected(lexeme)
match itry!(self.consume_lexeme()) {
Lexeme::Comma => {
self.state = if self.stack[self.stack.len() - 1] == Lexeme::OBracket {
State::Event(false)
} else {
State::Key(false)
}
},
lexeme => return Some(Err(Error::Unexpected(lexeme))),
}
self.state = if self.stack[self.stack.len() - 1] == b'[' {
State::Event(false)
} else {
State::Key(false)
};
}
}
}

0 comments on commit 02951fe

Please sign in to comment.