diff --git a/grammars/benches/json.rs b/grammars/benches/json.rs index 3141663a..e34669f7 100644 --- a/grammars/benches/json.rs +++ b/grammars/benches/json.rs @@ -78,9 +78,8 @@ fn bench_line_col(c: &mut Criterion) { }); } -// nested iter time: [258.27 µs 260.05 µs 262.64 µs] -// nested iter (fast-line-col) time: [14.943 µs 14.963 µs 14.993 µs] -// flatten iter time: [2.0367 µs 2.1104 µs 2.2144 µs] +// pairs nested iter time: [2.0168 ms 2.0381 ms 2.0725 ms] +// pairs flatten iter time: [4.5973 µs 4.6132 µs 4.6307 µs] fn bench_pairs_iter(c: &mut Criterion) { let data = include_str!("data.json"); @@ -90,13 +89,13 @@ fn bench_pairs_iter(c: &mut Criterion) { } } - c.bench_function("nested iter", |b| { + c.bench_function("pairs nested iter", |b| { let pairs = autocorrect::JsonParser::parse(autocorrect::Rule::item, &data).unwrap(); b.iter(move || iter_all_pairs(pairs.clone())); }); - c.bench_function("flatten iter", |b| { + c.bench_function("pairs flatten iter", |b| { let pairs = autocorrect::JsonParser::parse(autocorrect::Rule::item, &data).unwrap(); b.iter(move || { diff --git a/pest/src/iterators/flat_pairs.rs b/pest/src/iterators/flat_pairs.rs index 411d88b2..87de4898 100644 --- a/pest/src/iterators/flat_pairs.rs +++ b/pest/src/iterators/flat_pairs.rs @@ -11,6 +11,7 @@ use alloc::rc::Rc; use alloc::vec::Vec; use core::fmt; +use super::line_index::LineIndex; use super::pair::{self, Pair}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; @@ -28,20 +29,22 @@ pub struct FlatPairs<'i, R> { input: &'i str, start: usize, end: usize, + line_index: Rc, } /// # Safety /// /// All `QueueableToken`s' `input_pos` must be valid character boundary indices into `input`. 
-pub unsafe fn new( +pub unsafe fn new<'a, R: RuleType>( queue: Rc>>, - input: &str, + input: &'a str, start: usize, end: usize, -) -> FlatPairs<'_, R> { +) -> FlatPairs<'a, R> { FlatPairs { queue, input, + line_index: Rc::new(LineIndex::new(input)), start, end, } @@ -107,7 +110,14 @@ impl<'i, R: RuleType> Iterator for FlatPairs<'i, R> { return None; } - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.start) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.start, + ) + }; self.next_start(); Some(pair) @@ -122,7 +132,14 @@ impl<'i, R: RuleType> DoubleEndedIterator for FlatPairs<'i, R> { self.next_start_from_end(); - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.end) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.end, + ) + }; Some(pair) } @@ -141,6 +158,7 @@ impl<'i, R: Clone> Clone for FlatPairs<'i, R> { FlatPairs { queue: Rc::clone(&self.queue), input: self.input, + line_index: Rc::clone(&self.line_index), start: self.start, end: self.end, } diff --git a/pest/src/iterators/line_index.rs b/pest/src/iterators/line_index.rs new file mode 100644 index 00000000..1956c5dd --- /dev/null +++ b/pest/src/iterators/line_index.rs @@ -0,0 +1,70 @@ +//! `LineIndex` builds `line_offsets`, where each item is the offset (starting from 0) of the beginning of a line. +//! +//! For example, for the text `"hello\nworld"`, `line_offsets` will store `[0, 6]`. +//! +//! Then `line_col` with an offset only needs to find the line index by binary search: +//! +//! - `line` is the index into `line_offsets`, plus one +//! - `col` is the offset minus the line start offset, plus one +//! +//! Inspired by rust-analyzer's `LineIndex`: +//! 
// Design reference (rust-analyzer's `LineIndex`):
// https://github.com/rust-lang/rust/blob/1.67.0/src/tools/rust-analyzer/crates/ide-db/src/line_index.rs
use alloc::vec::Vec;

/// Lookup table that turns a byte offset into a 1-based `(line, col)` pair.
#[derive(Clone)]
pub struct LineIndex {
    /// Byte offset of the beginning of each line, zero-based.
    ///
    /// The first entry is always `0`; every further entry is the offset of the
    /// byte that follows a `'\n'`, so the entries are strictly increasing.
    line_offsets: Vec<usize>,
}

impl LineIndex {
    /// Scans `text` once and records the byte offset at which each line starts.
    ///
    /// Offsets are measured in bytes, not in `char`s, because `line_col` is
    /// queried with byte positions into the same text.
    pub fn new(text: &str) -> LineIndex {
        let mut line_offsets = Vec::new();
        line_offsets.push(0);
        // `match_indices` reports byte positions; a new line starts on the
        // byte right after each `'\n'`.
        line_offsets.extend(text.match_indices('\n').map(|(newline, _)| newline + 1));

        LineIndex { line_offsets }
    }

    /// Returns the 1-based `(line, col)` for the byte `offset`.
    ///
    /// `col` is the distance in bytes from the start of the line, plus one.
    /// An `offset` past the end of the text is attributed to the last line.
    pub fn line_col(&self, offset: usize) -> (usize, usize) {
        // `line_offsets[0]` is `0`, so at least one entry is `<= offset` and
        // the subtraction cannot underflow.
        let line = self.line_offsets.partition_point(|&start| start <= offset) - 1;
        let col = offset - self.line_offsets[line];

        (line + 1, col + 1)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_line_index() {
        let text = "hello\nworld";
        let table = [
            (0, 1, 1),
            (1, 1, 2),
            (5, 1, 6),
            (6, 2, 1),
            (7, 2, 2),
            (8, 2, 3),
            (10, 2, 5),
            (11, 2, 6),
            (12, 2, 7),
        ];

        let index = LineIndex::new(text);
        for &(offset, line, col) in &table {
            assert_eq!(index.line_col(offset), (line, col));
        }
    }

    #[test]
    fn test_line_index_multibyte() {
        // Each CJK character is 3 bytes, so '\n' sits at byte 6 and the
        // second line starts at byte 7.
        let index = LineIndex::new("你好\n世界");
        assert_eq!(index.line_col(0), (1, 1));
        assert_eq!(index.line_col(6), (1, 7));
        assert_eq!(index.line_col(7), (2, 1));
        assert_eq!(index.line_col(10), (2, 4));
    }
}
mod flat_pairs; +mod line_index; mod pair; pub(crate) mod pairs; mod queueable_token; diff --git a/pest/src/iterators/pair.rs b/pest/src/iterators/pair.rs index 2c813478..3575cd89 100644 --- a/pest/src/iterators/pair.rs +++ b/pest/src/iterators/pair.rs @@ -20,6 +20,7 @@ use core::str; #[cfg(feature = "pretty-print")] use serde::ser::SerializeStruct; +use super::line_index::LineIndex; use super::pairs::{self, Pairs}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; @@ -43,22 +44,23 @@ pub struct Pair<'i, R> { input: &'i str, /// Token index into `queue`. start: usize, - pub(crate) line_col: Option<(usize, usize)>, + line_index: Rc, } /// # Safety /// /// All `QueueableToken`s' `input_pos` must be valid character boundary indices into `input`. -pub unsafe fn new( +pub unsafe fn new<'a, R: RuleType>( queue: Rc>>, - input: &str, + input: &'a str, + line_index: Rc, start: usize, -) -> Pair<'_, R> { +) -> Pair<'a, R> { Pair { queue, input, start, - line_col: None, + line_index, } } @@ -245,10 +247,8 @@ impl<'i, R: RuleType> Pair<'i, R> { /// Returns the `line`, `col` of this pair start. 
pub fn line_col(&self) -> (usize, usize) { - match &self.line_col { - Some(line_col) => (line_col.0, line_col.1), - None => self.as_span().start_pos().line_col(), - } + let start = self.pos(self.start); + self.line_index.line_col(start) } fn pair(&self) -> usize { diff --git a/pest/src/iterators/pairs.rs b/pest/src/iterators/pairs.rs index d4596b0f..3bfe231e 100644 --- a/pest/src/iterators/pairs.rs +++ b/pest/src/iterators/pairs.rs @@ -20,27 +20,12 @@ use core::str; use serde::ser::SerializeStruct; use super::flat_pairs::{self, FlatPairs}; +use super::line_index::LineIndex; use super::pair::{self, Pair}; use super::queueable_token::QueueableToken; use super::tokens::{self, Tokens}; -use crate::{position, RuleType}; +use crate::RuleType; -#[derive(Clone)] -pub struct Cursor { - pub line: usize, - pub col: usize, - pub end: usize, -} - -impl Default for Cursor { - fn default() -> Cursor { - Cursor { - line: 1, - col: 1, - end: 0, - } - } -} /// An iterator over [`Pair`]s. It is created by [`pest::state`] and [`Pair::into_inner`]. /// /// [`Pair`]: struct.Pair.html @@ -52,7 +37,7 @@ pub struct Pairs<'i, R> { input: &'i str, start: usize, end: usize, - cursor: Cursor, + line_index: Rc, } pub fn new( @@ -66,7 +51,7 @@ pub fn new( input, start, end, - cursor: Cursor::default(), + line_index: Rc::new(LineIndex::new(input)), } } @@ -199,7 +184,14 @@ impl<'i, R: RuleType> Pairs<'i, R> { #[inline] pub fn peek(&self) -> Option> { if self.start < self.end { - Some(unsafe { pair::new(Rc::clone(&self.queue), self.input, self.start) }) + Some(unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.start, + ) + }) } else { None } @@ -237,42 +229,13 @@ impl<'i, R: RuleType> Pairs<'i, R> { } } } - - /// Move the cursor (line, col) by a part of the input. 
- fn move_cursor(&mut self, input: &str, start: usize, end: usize) -> (usize, usize) { - // Move cursor for some skiped characters (by skip(n)) - let prev_end = self.cursor.end; - if prev_end != start { - self.move_cursor(input, prev_end, start); - } - - let (prev_line, prev_col) = (self.cursor.line, self.cursor.col); - - let part = &input[self.cursor.end..end]; - let (l, c) = position::line_col(part, part.len(), (0, 0)); - - self.cursor.line += l; - // Has new line - if l > 0 { - self.cursor.col = c; - } else { - self.cursor.col += c; - } - self.cursor.end = end; - - (prev_line, prev_col) - } } impl<'i, R: RuleType> Iterator for Pairs<'i, R> { type Item = Pair<'i, R>; fn next(&mut self) -> Option { - let mut pair = self.peek()?; - let span = pair.as_span(); - - let (l, c) = self.move_cursor(self.input, span.start(), span.end()); - pair.line_col = Some((l, c)); + let pair = self.peek()?; self.start = self.pair() + 1; Some(pair) @@ -287,7 +250,14 @@ impl<'i, R: RuleType> DoubleEndedIterator for Pairs<'i, R> { self.end = self.pair_from_end(); - let pair = unsafe { pair::new(Rc::clone(&self.queue), self.input, self.end) }; + let pair = unsafe { + pair::new( + Rc::clone(&self.queue), + self.input, + Rc::clone(&self.line_index), + self.end, + ) + }; Some(pair) } @@ -478,26 +448,14 @@ mod tests { let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "abc"); assert_eq!(pair.line_col(), (1, 1)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (1, 4, 3) - ); let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "e"); assert_eq!(pair.line_col(), (2, 1)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (2, 2, 5) - ); let pair = pairs.next().unwrap(); assert_eq!(pair.as_str(), "fgh"); assert_eq!(pair.line_col(), (2, 2)); - assert_eq!( - (pairs.cursor.line, pairs.cursor.col, pairs.cursor.end), - (2, 5, 8) - ); } #[test]