From e3e5f364beaf42ff44e41eb7c74f5f390f6a3a73 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sat, 3 Aug 2024 13:47:58 -0700 Subject: [PATCH 01/14] Create common directive_expression template for all directive_modifiers. Also renamed mandate to entry_mode. Fixes #27 --- lug/lug.hpp | 100 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 39 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 29f6fed..1fa0ab4 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -112,7 +112,7 @@ struct program std::vector actions; std::vector captures; std::vector predicates; - directives mandate{directives::eps}; + directives entry_mode{directives::eps}; void concatenate(program const& src) { @@ -135,7 +135,7 @@ struct program instructions.push_back(instr); instructions.insert(instructions.end(), i + 1, j); } - mandate = (mandate & ~directives::eps) | (mandate & src.mandate & directives::eps); + entry_mode = (entry_mode & ~directives::eps) | (entry_mode & src.entry_mode & directives::eps); } void swap(program& p) noexcept @@ -145,7 +145,7 @@ struct program actions.swap(p.actions); captures.swap(p.captures); predicates.swap(p.predicates); - std::swap(mandate, p.mandate); + std::swap(entry_mode, p.entry_mode); } }; @@ -360,8 +360,8 @@ template encoder_metadata(Frame&&) -> encoder_metadata mode_; + directives entry_mode_{directives::none}; virtual void do_append(instruction instr) = 0; virtual void do_append(program const&) = 0; [[nodiscard]] virtual immediate do_add_rune_set(unicode::rune_set&& /*r*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) @@ -376,7 +376,7 @@ class encoder encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, unsigned short prec) { auto callee_mode = mode_.back(); - skip(p->mandate ^ directives::eps, directives::noskip); + skip(p->entry_mode ^ directives::eps, directives::noskip); do_add_callee(r, p, length(), callee_mode); return encode(opcode::call, off, immediate{prec}); } @@ -401,9 +401,10 @@ class encoder return encode(op, detail::string_pack(value), penum); } - void do_skip() + void do_skip(directives& last_mode) { - mode_.back() = (mode_.back() & ~(directives::preskip | directives::postskip)) | directives::lexeme | directives::noskip; + last_mode &= ~(directives::preskip | directives::postskip); + last_mode |= (directives::lexeme | directives::noskip); (*grammar::implicit_space())(*this); } @@ -416,7 +417,6 @@ class encoder encoder& operator=(encoder&&) = delete; template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); template >> [[nodiscard]] std::ptrdiff_t evaluate_length(E const& e, M const& m); - encoder& dpsh(directives enable, directives disable) { directives const prev = mode_.back(); mode_.push_back((prev & ~disable) | enable); return *this; } encoder& append(instruction instr) { do_append(instr); return *this; } encoder& append(program const& p) { do_append(p); return *this; } encoder& call(program const& p, unsigned short prec) { return do_call(nullptr, &p, 0, prec); } @@ -427,31 +427,41 @@ class encoder encoder& encode(opcode op, syntactic_predicate&& p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } encoder& encode(opcode op, std::ptrdiff_t off, immediate imm = immediate{0}) { return append(instruction{op, operands::off, imm}).append(instruction{off}); } [[nodiscard]] std::ptrdiff_t length() const noexcept { return do_length(); } - [[nodiscard]] directives mandate() const noexcept { return (mandate_ & ~directives::eps) | mode_.back(); } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } + [[nodiscard]] directives entry_mode() const noexcept { return (entry_mode_ & ~directives::eps) | mode_.back(); } encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match); } encoder& match_any() { return skip().encode(opcode::match_any); } template >> encoder& match_class(T properties) { return skip().do_match_class(Op, properties); } + encoder& dpsh(directives enable, directives disable) + { + directives const prev = mode_.back(); + mode_.push_back((prev & ~disable) | enable); + return *this; + } + encoder& dpop(directives relay) { - auto const prev = detail::pop_back(mode_); - auto const next = (mode_.back() & ~relay) | (prev & relay); - if ((next & directives::postskip) == directives::none && (prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip) - do_skip(); + directives const prev = detail::pop_back(mode_); + directives& last_mode = mode_.back(); + directives const next = (last_mode & ~relay) | (prev & relay); + if (((next & directives::postskip) == directives::none) && ((prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip)) + do_skip(last_mode); mode_.back() = next; return *this; } - encoder& skip(directives callee_mandate = directives::eps, directives callee_skip = directives::lexeme) + encoder& skip(directives callee_mode = directives::eps, directives callee_skip = directives::lexeme) { - auto const mode = mode_.back(); - if (mandate_ == directives::none) - mandate_ = (mode & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; - if ((((mode | callee_mandate)) & (callee_skip | directives::preskip)) == directives::preskip) - do_skip(); - mode_.back() = mode & ~(callee_mandate & directives::eps); + directives& last_mode = mode_.back(); + directives const prev = last_mode; + directives const next = last_mode & ~(callee_mode & directives::eps); + if (entry_mode_ == directives::none) + entry_mode_ = (prev & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; + if ((((prev | callee_mode)) & (callee_skip | directives::preskip)) == directives::preskip) + do_skip(last_mode); + mode_.back() = next; return *this; } @@ -459,7 +469,7 @@ class encoder { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) - return skip(p.mandate, directives::noskip).append(p); + return skip(p.entry_mode, directives::noskip).append(p); return do_call(&r, &r.program_, 0, prec); } @@ -536,7 +546,7 @@ class program_encoder : public encoder public: program_encoder(program& p, program_callees& c, directives initial) : encoder{initial}, program_{p}, callees_{c} {} - ~program_encoder() override { program_.mandate = mandate(); } + ~program_encoder() override { program_.entry_mode = entry_mode(); } program_encoder(program_encoder const&) = delete; program_encoder(program_encoder&&) = delete; program_encoder& operator=(program_encoder const&) = delete; @@ -774,28 +784,40 @@ struct rule_precedence_expression : terminal_encoder_expression_interface return rule_precedence_expression{*this, precedence}; } -template -struct directive_modifier +template +struct directive_expression : unary_encoder_expression_interface { - template - struct directive_expression : unary_encoder_expression_interface - { - using unary_encoder_expression_interface::unary_encoder_expression_interface; + directives enable_mask{directives::none}; + directives disable_mask{directives::none}; + directives relay_mask{directives::none}; - template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const - { - d.dpsh(EnableMask, DisableMask); - auto m2 = d.evaluate(this->e1, m); - d.dpop(RelayMask); - return m2; - } - }; + template >> + constexpr directive_expression(X1&& x1, directives enable, directives disable, directives relay) + : unary_encoder_expression_interface{std::forward(x1)}, enable_mask{enable}, disable_mask{disable}, relay_mask{relay} {} + + template + [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + { + d.dpsh(enable_mask, disable_mask); + auto m2 = d.evaluate(this->e1, m); + d.dpop(relay_mask); + return m2; + } +}; +template +struct directive_modifier +{ template >> [[nodiscard]] constexpr auto operator[](E const& e) const noexcept { - return directive_expression>{make_expression(e)}; + return directive_expression>{make_expression(e), EnableMask, DisableMask, RelayMask}; + } + + template + [[nodiscard]] constexpr auto operator[](directive_expression const& e) const noexcept + { + return directive_expression{e.e1, ((e.enable_mask & ~DisableMask) | EnableMask), (e.disable_mask | DisableMask), (e.relay_mask & RelayMask) | RelayMask}; } }; @@ -2062,7 +2084,7 @@ inline auto basic_regular_expression::operator()(encoder& d, M const& m) const - if (!parse(expression_, grmr, genr)) throw bad_string_expression{}; } - d.skip((program_->mandate & directives::eps) ^ directives::eps).append(*program_); + d.skip((program_->entry_mode & directives::eps) ^ directives::eps).append(*program_); return m; } From ce5effb4a75e0bd621b8c3d884b2a4c0f221a2dc Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sun, 4 Aug 2024 04:55:29 -0700 Subject: [PATCH 02/14] Fix directive_expression mask computations. Fixes #27 --- lug/lug.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 1fa0ab4..4783828 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -817,7 +817,7 @@ struct directive_modifier template [[nodiscard]] constexpr auto operator[](directive_expression const& e) const noexcept { - return directive_expression{e.e1, ((e.enable_mask & ~DisableMask) | EnableMask), (e.disable_mask | DisableMask), (e.relay_mask & RelayMask) | RelayMask}; + return directive_expression{e.e1, ((EnableMask & ~e.disable_mask) | e.enable_mask), (DisableMask | e.disable_mask), RelayMask}; } }; From 1de16eaece9acc6a721f4c1872cf4eb8d8d6c443 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sun, 4 Aug 2024 12:56:43 -0700 Subject: [PATCH 03/14] Changes and improvements: - Extract base class from lug::basic_parser (fixes #32) - Parser stack frame errors now throw lug::bad_stack instead of running parser failure code path (fixes #33) - Merge various parser stacks into a single stack using std::variant --- lug/detail.hpp | 6 - lug/error.hpp | 3 +- lug/lug.hpp | 504 +++++++++++++++++++++++++++---------------------- 3 files changed, 276 insertions(+), 237 deletions(-) diff --git a/lug/detail.hpp b/lug/detail.hpp index 60a9b0e..5ef8926 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -307,12 +307,6 @@ template return x + y; } -template -[[nodiscard]] constexpr auto make_tuple_view(Tuple&& t) noexcept -{ - return std::forward_as_tuple(std::get(std::forward(t))...); -} - template [[nodiscard]] inline InputIt escaping_find_if(InputIt first, InputIt last, UnaryPredicate pred) { diff --git a/lug/error.hpp b/lug/error.hpp index 7138a92..8c59ab8 100644 --- a/lug/error.hpp +++ b/lug/error.hpp @@ -17,12 +17,13 @@ class reenterant_parse_error : public lug_error { public: reenterant_parse_error class reenterant_read_error : public lug_error { public: reenterant_read_error() : lug_error{"attempted to read or modify input source while reading"} {} }; class parse_context_error : public lug_error { public: parse_context_error() : lug_error{"operation valid only inside calling context of parser::parse" } {} }; class accept_context_error : public lug_error{ public: accept_context_error() : lug_error{"operation valid only inside calling context of parser::accept"} {} }; -class attribute_stack_error : public lug_error{ public: attribute_stack_error() : lug_error{"incompatible or invalid stack frame"} {} }; +class attribute_stack_error : public lug_error{ public: attribute_stack_error() : lug_error{"incompatible or invalid attribute stack frame"} {} }; class bad_string_expression : public lug_error { public: explicit bad_string_expression(std::string const& s = "invalid string or bracket expression") : lug_error{s} {} }; class bad_character_class : public bad_string_expression { public: bad_character_class() : bad_string_expression{"invalid character class"} {} }; class bad_character_range : public bad_string_expression { public: bad_character_range() : bad_string_expression{"character range is reversed"} {} }; class bad_grammar : public lug_error { public: bad_grammar() : lug_error{"invalid or empty grammar"} {} }; class bad_opcode : public lug_error { public: bad_opcode() : lug_error{"invalid opcode"} {} }; +class bad_stack : public lug_error{ public: bad_stack() : lug_error{"empty or invalid parser stack error"} {} }; } // namespace lug diff --git a/lug/lug.hpp b/lug/lug.hpp index 4783828..9a3b0dd 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace lug { @@ -27,6 +28,7 @@ class environment; class multi_input_source; class string_input_source; class string_view_input_source; +class parser_base; template class basic_parser; using parser = basic_parser; struct program; @@ -201,40 +203,49 @@ class syntax class environment { + friend class parser_base; template friend class basic_parser; - static constexpr unsigned short max_call_depth = (std::numeric_limits::max)(); + static constexpr std::size_t max_call_depth = (std::numeric_limits::max)(); static inline std::vector const empty_symbols_{}; std::vector attribute_frame_stack_; std::vector attribute_result_stack_; std::unordered_set conditions_; std::unordered_map> symbols_; std::vector> positions_; - std::string_view match_, subject_; + std::string_view match_; + std::string_view subject_; + std::size_t call_depth_{0}; + std::size_t prune_depth_{max_call_depth}; syntax_position origin_{1, 1}; - unsigned int tab_width_{8}, tab_alignment_{8}; - unsigned short prune_depth_{max_call_depth}, call_depth_{0}; + unsigned int tab_width_{8}; + unsigned int tab_alignment_{8}; virtual void on_accept_started() {} virtual void on_accept_ended() {} - unsigned short start_accept(std::string_view m, std::string_view s) + std::size_t start_accept(std::string_view m, std::string_view s) { reset_match_and_subject(m, s); on_accept_started(); return call_depth_; } - void end_accept(unsigned short prior_call_depth) + void end_accept(std::size_t prior_call_depth) { on_accept_ended(); - reset_call_depth(prior_call_depth); + call_depth_ = prior_call_depth; + prune_depth_ = max_call_depth; } - void reset_call_depth(unsigned short depth) noexcept + [[nodiscard]] bool accept_response(std::size_t response_call_depth) noexcept { - prune_depth_ = max_call_depth; - call_depth_ = depth; + if (prune_depth_ > response_call_depth) { + call_depth_ = response_call_depth; + prune_depth_ = max_call_depth; + return true; + } + return false; } void reset_match_and_subject(std::string_view m, std::string_view s) @@ -273,8 +284,8 @@ class environment [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return position_at(range.index); } [[nodiscard]] syntax_position position_end(syntax_range const& range) { return position_at(range.index + range.size); } [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), position_end(range)}; } - [[nodiscard]] unsigned short call_depth() const noexcept { return call_depth_; } - [[nodiscard]] unsigned short prune_depth() const noexcept { return prune_depth_; } + [[nodiscard]] std::size_t call_depth() const noexcept { return call_depth_; } + [[nodiscard]] std::size_t prune_depth() const noexcept { return prune_depth_; } void escape() { prune_depth_ = call_depth_; } [[nodiscard]] syntax_position position_at(std::size_t index) @@ -1455,36 +1466,171 @@ class string_view_input_source struct parser_registers { - std::size_t sr{0}; std::size_t mr{0}; std::size_t rc{0}; std::ptrdiff_t pc{0}; std::size_t fc{0}; - [[nodiscard]] auto as_tuple() noexcept { return std::forward_as_tuple(sr, mr, rc, pc, fc); } - [[nodiscard]] auto as_tuple() const noexcept { return std::forward_as_tuple(sr, mr, rc, pc, fc); } + std::size_t sr{0}; std::size_t mr{0}; std::size_t rc{0}; std::ptrdiff_t pc{0}; + [[nodiscard]] auto as_tuple() noexcept { return std::forward_as_tuple(sr, mr, rc, pc); } + [[nodiscard]] auto as_tuple() const noexcept { return std::forward_as_tuple(sr, mr, rc, pc); } }; -template -class basic_parser +class parser_base { - enum class stack_frame_type : unsigned char { backtrack, call, capture, condition, lrcall, symbol_definition, symbol_table }; - enum class subject_location : std::size_t {}; - struct response { unsigned short call_depth; unsigned short action_index; syntax_range range; }; - struct lrmemo { std::size_t srr{0}; std::size_t sra{0}; std::size_t prec{0}; std::ptrdiff_t pcr{0}; std::ptrdiff_t pca{0}; std::size_t rcr{0}; std::vector responses; }; +protected: static constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); static constexpr std::size_t max_size = (std::numeric_limits::max)(); + struct response { std::size_t call_depth{0}; std::size_t action_index{0}; syntax_range range{0, 0}; constexpr response() noexcept = default; constexpr response(std::size_t c, std::size_t a, syntax_range const& r) noexcept : call_depth{c}, action_index{a}, range{r} {} }; + struct backtrack_frame { std::size_t sr; std::size_t rc; std::ptrdiff_t pc; constexpr backtrack_frame(std::size_t s, std::size_t r, std::ptrdiff_t p) noexcept : sr{s}, rc{r}, pc{p} {} }; + struct call_frame { std::ptrdiff_t pc; constexpr explicit call_frame(std::ptrdiff_t p) noexcept : pc{p} {} }; + struct capture_frame { std::size_t sr; constexpr explicit capture_frame(std::size_t s) noexcept : sr{s} {} }; + struct condition_frame { std::string_view name; bool value; constexpr condition_frame(std::string_view n, bool v) noexcept : name{n}, value{v} {} }; + struct lrmemo_frame { std::size_t srr; std::size_t sra; std::size_t prec; std::ptrdiff_t pcr; std::ptrdiff_t pca; std::size_t rcr; std::vector responses; lrmemo_frame(std::size_t sr, std::size_t sa, std::size_t p, std::ptrdiff_t pc, std::ptrdiff_t pa, std::size_t rc) noexcept : srr{sr}, sra{sa}, prec{p}, pcr{pc}, pca{pa}, rcr{rc} {} }; + struct symbol_frame { std::string_view name; std::size_t sr; constexpr symbol_frame(std::string_view n, std::size_t s) noexcept : name{n}, sr{s} {} }; + using symbol_table_frame = std::unordered_map>; + using stack_frame = std::variant; lug::grammar const* grammar_; lug::environment* environment_; - InputSource input_source_; + std::vector responses_; + std::vector stack_frames_; std::unordered_map casefolded_subjects_; - parser_registers registers_{0, 0, 0, 0, 0}; - bool parsing_{false}, cut_deferred_{false}; + parser_registers registers_{0, 0, 0, 0}; + std::size_t call_depth_{0}; std::size_t cut_frame_{0}; - std::vector stack_frames_; - std::vector> backtrack_stack_; // sr, rc, pc - std::vector call_stack_; // pc - std::vector capture_stack_; // sr - std::vector> condition_stack_; // name, value - std::vector> symbol_definition_stack_; // name, sr - std::vector>> symbol_table_stack_; - std::vector lrmemo_stack_; - std::vector responses_; + std::ptrdiff_t cut_inhibited_{0}; + bool cut_deferred_{false}; + bool parsing_{false}; + + template + void commit([[maybe_unused]] std::size_t& sr, [[maybe_unused]] std::size_t& rc, std::ptrdiff_t& pc, int off) + { + if (stack_frames_.empty()) + throw bad_stack{}; + auto& backtrack = std::get(stack_frames_.back()); + if constexpr (Opcode == opcode::commit_partial) { + backtrack.sr = sr; + backtrack.rc = rc; + } else { + if constexpr (Opcode == opcode::commit_back) + sr = backtrack.sr; + stack_frames_.pop_back(); + } + pc += off; + } + + void pop_responses_after(std::size_t n) + { + if (n < responses_.size()) + responses_.resize(n); + } + + [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) + { + pop_responses_after(n); + responses_.insert(responses_.end(), restore.begin(), restore.end()); + return responses_.size(); + } + + [[nodiscard]] auto drop_responses_after(std::size_t n) + { + std::vector dropped; + if (n < responses_.size()) { + dropped.assign(responses_.begin() + static_cast(n), responses_.end()); + responses_.resize(n); + } + return dropped; + } + + [[nodiscard]] auto push_response(std::size_t call_depth, std::size_t action_index, syntax_range const& range = {parser_base::max_size, 0}) + { + responses_.emplace_back(call_depth, action_index, range); + return responses_.size(); + } + + [[nodiscard]] std::pair fail_one(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc) + { + auto const fail_result = std::visit([this, &sr, &rc, &pc](auto& frame) -> std::pair { + using frame_type = std::decay_t; + if constexpr (std::is_same_v) { + sr = frame.sr; + rc = frame.rc; + pc = frame.pc; + return {0, false}; + } else if constexpr (std::is_same_v) { + --call_depth_; + return {1, false}; + } else if constexpr (std::is_same_v) { + return {1, true}; + } else if constexpr (std::is_same_v) { + environment_->set_condition(frame.name, frame.value); + return {1, false}; + } else if constexpr (std::is_same_v) { + --call_depth_; + if (frame.sra != parser_base::lrfailcode) { + sr = frame.sra; + rc = restore_responses_after(frame.rcr, frame.responses); + pc = frame.pcr; + return {0, true}; + } + return {1, true}; + } else if constexpr (std::is_same_v) { + return {1, false}; + } else if constexpr (std::is_same_v) { + environment_->symbols_.swap(frame); + return {1, false}; + } else { + static_assert(detail::always_false_v, "non-exhaustive visitor!"); + } + }, stack_frames_.back()); + stack_frames_.pop_back(); + return fail_result; + } + + void do_accept(std::string_view match, std::string_view subject) + { + auto const prior_call_depth = environment_->start_accept(match, subject); + detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_->end_accept(prior_call_depth); }}; + auto const& actions = grammar_->program().actions; + auto const& captures = grammar_->program().captures; + for (auto& resp : responses_) { + if (environment_->accept_response(resp.call_depth)) { + if (resp.range.index < parser_base::max_size) + captures[resp.action_index](*environment_, syntax{match.substr(resp.range.index, resp.range.size), resp.range.index}); + else + actions[resp.action_index](*environment_); + } + } + } + + [[nodiscard]] auto do_drain() + { + environment_->reset_origin(); + casefolded_subjects_.clear(); + responses_.clear(); + registers_.mr -= registers_.sr; + registers_.sr = 0; + registers_.rc = 0; + cut_frame_ = stack_frames_.size(); + cut_deferred_ = false; + return registers_.as_tuple(); + } + +public: + explicit parser_base(lug::grammar const& g, lug::environment& e) : grammar_{&g}, environment_{&e} {} + [[nodiscard]] lug::grammar const& grammar() const noexcept { return *grammar_; } + [[nodiscard]] lug::environment& environment() const noexcept { return *environment_; } + [[nodiscard]] std::size_t subject_index() const noexcept { return registers_.sr; } + [[nodiscard]] std::size_t max_subject_index() const noexcept { return registers_.mr; } + [[nodiscard]] syntax_position subject_position() { return environment_->position_at(registers_.sr); } + [[nodiscard]] syntax_position max_subject_position() { return environment_->position_at(registers_.mr); } + [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_->position_at(index); } + [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_->position_at(range.index); } + [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_->position_at(range.index + range.size); } + [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), position_end(range)}; } + [[nodiscard]] parser_registers& registers() noexcept { return registers_; } + [[nodiscard]] parser_registers const& registers() const noexcept { return registers_; } +}; + +template +class basic_parser : public parser_base +{ + InputSource input_source_; [[nodiscard]] bool available(std::size_t sr, std::size_t sn) { @@ -1583,109 +1729,31 @@ class basic_parser return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : false; } - template - [[nodiscard]] bool commit(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc, int off) + void accept(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) { - if (stack_frames_.empty() || stack_frames_.back() != stack_frame_type::backtrack) - return false; - if constexpr (Opcode == opcode::commit_partial) { - detail::make_tuple_view<0, 1>(backtrack_stack_.back()) = {sr, rc}; - } else { - std::ignore = std::tie(sr, rc); - if constexpr (Opcode == opcode::commit_back) - sr = std::get<0>(backtrack_stack_.back()); - pop_stack_frame(backtrack_stack_); - } - pc += off; - return true; + registers_ = {sr, (std::max)(mr, sr), rc, pc}; + do_accept(match(), subject()); } - void accept(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) + void accept_if_deferred(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) { - registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; - auto const full_match = match(); - auto const prior_call_depth = environment_->start_accept(full_match, subject()); - detail::scope_exit const cleanup{[this, prior_call_depth]{ environment_->end_accept(prior_call_depth); }}; - auto const& actions = grammar_->program().actions; - auto const& captures = grammar_->program().captures; - for (auto& resp : responses_) { - if (environment_->prune_depth() <= resp.call_depth) - continue; - environment_->reset_call_depth(resp.call_depth); - if (resp.range.index < max_size) - captures[resp.action_index](*environment_, syntax{full_match.substr(resp.range.index, resp.range.size), resp.range.index}); - else - actions[resp.action_index](*environment_); + --cut_inhibited_; + if (cut_deferred_ && (cut_inhibited_ > 0)) { + accept(sr, mr, rc, pc); + cut_deferred_ = false; } } [[nodiscard]] auto drain() { - environment_->reset_origin(); input_source_.drain_buffer(registers_.sr); - casefolded_subjects_.clear(); - responses_.clear(); - registers_.mr -= registers_.sr; - registers_.sr = 0, registers_.rc = 0; - cut_deferred_ = false, cut_frame_ = stack_frames_.size(); - return registers_.as_tuple(); - } - - void pop_responses_after(std::size_t n) - { - if (n < responses_.size()) - responses_.resize(n); - } - - [[nodiscard]] auto drop_responses_after(std::size_t n) - { - std::vector dropped; - if (n < responses_.size()) { - dropped.assign(responses_.begin() + static_cast(n), responses_.end()); - responses_.resize(n); - } - return dropped; - } - - [[nodiscard]] auto restore_responses_after(std::size_t n, std::vector const& restore) - { - pop_responses_after(n); - responses_.insert(responses_.end(), restore.begin(), restore.end()); - return responses_.size(); - } - - [[nodiscard]] auto push_response(std::size_t depth, std::size_t action_index, syntax_range range = {max_size, 0}) - { - responses_.push_back({static_cast(depth), static_cast(action_index), range}); - return responses_.size(); - } - - template - void pop_stack_frame(Stack& stack, Args&... args) - { - stack.pop_back(), stack_frames_.pop_back(); - cut_frame_ = (std::min)(cut_frame_, stack_frames_.size()); - if constexpr (std::is_same_v || std::is_same_v) - if (cut_deferred_ && capture_stack_.empty() && lrmemo_stack_.empty()) - accept(args...); + return do_drain(); } public: - basic_parser(lug::grammar const& g, lug::environment& e) : grammar_{&g}, environment_{&e} {} - [[nodiscard]] lug::grammar const& grammar() const noexcept { return *grammar_; } - [[nodiscard]] lug::environment& environment() const noexcept { return *environment_; } + basic_parser(lug::grammar const& g, lug::environment& e) : parser_base{g, e} {} [[nodiscard]] std::string_view match() const noexcept { return input_source_.buffer().substr(0, registers_.sr); } [[nodiscard]] std::string_view subject() const noexcept { return input_source_.buffer().substr(registers_.sr, input_source_.buffer().size() - registers_.sr); } - [[nodiscard]] std::size_t subject_index() const noexcept { return registers_.sr; } - [[nodiscard]] std::size_t max_subject_index() const noexcept { return registers_.mr; } - [[nodiscard]] syntax_position subject_position() { return environment_->position_at(registers_.sr); } - [[nodiscard]] syntax_position max_subject_position() { return environment_->position_at(registers_.mr); } - [[nodiscard]] syntax_position position_at(std::size_t index) { return environment_->position_at(index); } - [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_->position_at(range.index); } - [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_->position_at(range.index + range.size); } - [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), position_end(range)}; } - [[nodiscard]] parser_registers& registers() noexcept { return registers_; } - [[nodiscard]] parser_registers const& registers() const noexcept { return registers_; } [[nodiscard]] bool available(std::size_t sn) { return available(registers_.sr, sn); } template ::value>> @@ -1722,11 +1790,12 @@ class basic_parser program const& prog = grammar_->program(); if (prog.instructions.empty()) throw bad_grammar{}; - auto [sr, mr, rc, pc, fc] = drain(); - bool result = false; - bool done = false; - pc = 0; - fc = 0; + call_depth_ = 0; + cut_inhibited_ = 0; + std::size_t sr = 0, mr = 0, rc = 0; + std::ptrdiff_t pc = 0, fc = 0; + bool result = false, done = false; + std::tie(sr, mr, rc, std::ignore) = drain(); while (!done) { auto [op, imm, off, str] = instruction::decode(prog.instructions, pc); switch (op) { @@ -1775,113 +1844,90 @@ class basic_parser goto failure; } break; case opcode::choice: { - stack_frames_.push_back(stack_frame_type::backtrack); - backtrack_stack_.emplace_back(sr - imm, rc, pc + off); + stack_frames_.emplace_back(std::in_place_type, sr - imm, rc, pc + off); } break; case opcode::commit: { - if (!commit(sr, rc, pc, off)) - goto failure; + commit(sr, rc, pc, off); } break; case opcode::commit_back: { - if (!commit(sr, rc, pc, off)) - goto failure; + commit(sr, rc, pc, off); } break; case opcode::commit_partial: { - if (!commit(sr, rc, pc, off)) - goto failure; + commit(sr, rc, pc, off); } break; case opcode::jump: { pc += off; } break; case opcode::call: { if (imm != 0) { - auto const memo = detail::escaping_find_if(lrmemo_stack_.crbegin(), lrmemo_stack_.crend(), [srr = sr, pca = pc + off](auto const& m) { - if ((m.srr == srr) && (m.pca == pca)) - return 1; - return ((m.srr < srr) ? 0 : -1); + auto const frame_it = detail::escaping_find_if(stack_frames_.crbegin(), stack_frames_.crend(), [srr = sr, pca = pc + off](auto const& frame) { + if (auto const* const memo_ptr = std::get_if(&frame); memo_ptr != nullptr) { + if ((memo_ptr->srr == srr) && (memo_ptr->pca == pca)) + return 1; + if (memo_ptr->srr >= srr) + return -1; + } + return 0; }); - if (memo != lrmemo_stack_.crend()) { - if ((memo->sra == lrfailcode) || (imm < memo->prec)) + if (frame_it != stack_frames_.crend()) { + auto const& memo = std::get(*frame_it); + if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) goto failure; - sr = memo->sra, rc = restore_responses_after(rc, memo->responses); + sr = memo.sra; + rc = restore_responses_after(rc, memo.responses); continue; } - stack_frames_.push_back(stack_frame_type::lrcall); - lrmemo_stack_.push_back({sr, lrfailcode, imm, pc, pc + off, rc, std::vector{}}); + stack_frames_.emplace_back(std::in_place_type, sr, parser_base::lrfailcode, imm, pc, pc + off, rc); + ++cut_inhibited_; } else { - stack_frames_.push_back(stack_frame_type::call); - call_stack_.push_back(pc); + stack_frames_.emplace_back(std::in_place_type, pc); } + ++call_depth_; pc += off; } break; case opcode::ret: { if (stack_frames_.empty()) - goto failure; - switch (stack_frames_.back()) { - case stack_frame_type::call: { - pc = call_stack_.back(); - pop_stack_frame(call_stack_); - } break; - case stack_frame_type::lrcall: { - auto& memo = lrmemo_stack_.back(); - if ((memo.sra == lrfailcode) || (sr > memo.sra)) { - memo.sra = sr, memo.responses = drop_responses_after(memo.rcr); - sr = memo.srr, pc = memo.pca, rc = memo.rcr; - continue; - } - sr = memo.sra, pc = memo.pcr, rc = restore_responses_after(memo.rcr, memo.responses); - pop_stack_frame(lrmemo_stack_, sr, mr, rc, pc); - } break; - default: goto failure; + throw bad_stack{}; + auto& frame = stack_frames_.back(); + if (auto* const call = std::get_if(&frame); call != nullptr) { + --call_depth_; + pc = call->pc; + stack_frames_.pop_back(); + } else if (auto* const memo = std::get_if(&frame); memo != nullptr) { + if ((memo->sra == parser_base::lrfailcode) || (sr > memo->sra)) { + memo->sra = sr, memo->responses = drop_responses_after(memo->rcr); + sr = memo->srr, pc = memo->pca, rc = memo->rcr; + continue; + } + --call_depth_; + sr = memo->sra; + pc = memo->pcr; + rc = restore_responses_after(memo->rcr, memo->responses); + stack_frames_.pop_back(); + accept_if_deferred(sr, mr, rc, pc); + } else { + throw bad_stack{}; } } break; case opcode::fail: { - fc = imm; + fc = static_cast(imm); failure: for (mr = (std::max)(mr, sr), ++fc; fc > 0; --fc) { if (done = (cut_frame_ >= stack_frames_.size()); done) { - registers_ = {sr, mr, rc, pc, 0}; + registers_ = {sr, mr, rc, pc}; break; } - switch (stack_frames_.back()) { - case stack_frame_type::backtrack: { - std::tie(sr, rc, pc) = backtrack_stack_.back(); - pop_stack_frame(backtrack_stack_); - } break; - case stack_frame_type::call: { - pop_stack_frame(call_stack_), ++fc; - } break; - case stack_frame_type::capture: { - pop_stack_frame(capture_stack_, sr, mr, rc, pc), ++fc; - } break; - case stack_frame_type::condition: { - auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_->set_condition(cond_name, cond_value); - pop_stack_frame(condition_stack_), ++fc; - } break; - case stack_frame_type::lrcall: { - if (auto const& memo = lrmemo_stack_.back(); memo.sra != lrfailcode) - sr = memo.sra, pc = memo.pcr, rc = restore_responses_after(memo.rcr, memo.responses); - else - ++fc; - pop_stack_frame(lrmemo_stack_, sr, mr, rc, pc); - } break; - case stack_frame_type::symbol_definition: { - pop_stack_frame(symbol_definition_stack_), ++fc; - } break; - case stack_frame_type::symbol_table: { - environment_->symbols_.swap(symbol_table_stack_.back()); - pop_stack_frame(symbol_table_stack_), ++fc; - } break; - default: break; - } + auto const fail_result = fail_one(sr, rc, pc); + fc += fail_result.first; + if (fail_result.second) + accept_if_deferred(sr, mr, rc, pc); } pop_responses_after(rc); } break; case opcode::accept: { - if (cut_deferred_ = (!capture_stack_.empty() || !lrmemo_stack_.empty()); !cut_deferred_) { + if (cut_deferred_ = (cut_inhibited_ > 0); !cut_deferred_) { accept(sr, mr, rc, pc); - std::tie(sr, mr, rc, pc, std::ignore) = drain(); + std::tie(sr, mr, rc, pc) = drain(); } } break; case opcode::accept_final: { @@ -1889,45 +1935,45 @@ class basic_parser result = done = true; } break; case opcode::action: { - rc = push_response(call_stack_.size() + lrmemo_stack_.size(), imm); + rc = push_response(call_depth_, imm); } break; case opcode::predicate: { - registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; + registers_ = {sr, (std::max)(mr, sr), rc, pc}; environment_->reset_match_and_subject(match(), subject()); bool const accepted = prog.predicates[imm](*environment_); - std::tie(sr, mr, rc, pc, fc) = registers_.as_tuple(); + std::tie(sr, mr, rc, pc) = registers_.as_tuple(); pop_responses_after(rc); if (!accepted) goto failure; } break; case opcode::capture_start: { - stack_frames_.push_back(stack_frame_type::capture); - capture_stack_.push_back(static_cast(sr)); + stack_frames_.emplace_back(std::in_place_type, sr); + ++cut_inhibited_; } break; case opcode::capture_end: { - if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::capture)) - goto failure; - auto const sr0 = static_cast(capture_stack_.back()); + if (stack_frames_.empty()) + throw bad_stack{}; + auto const sr0 = std::get(stack_frames_.back()).sr; auto const sr1 = sr; - pop_stack_frame(capture_stack_, sr, mr, rc, pc); + stack_frames_.pop_back(); + accept_if_deferred(sr, mr, rc, pc); if (sr0 > sr1) goto failure; - rc = push_response(call_stack_.size() + lrmemo_stack_.size(), imm, {sr0, sr1 - sr0}); + rc = push_response(call_depth_, imm, syntax_range{sr0, sr1 - sr0}); } break; case opcode::condition_test: { if (environment_->has_condition(str) != (imm != 0)) goto failure; } break; case opcode::condition_push: { - stack_frames_.push_back(stack_frame_type::condition); - condition_stack_.emplace_back(str, environment_->set_condition(str, imm != 0)); + stack_frames_.emplace_back(std::in_place_type, str, environment_->set_condition(str, imm != 0)); } break; case opcode::condition_pop: { - if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::condition)) - goto failure; - auto const& [cond_name, cond_value] = condition_stack_.back(); - environment_->set_condition(cond_name, cond_value); - pop_stack_frame(condition_stack_); + if (stack_frames_.empty()) + throw bad_stack{}; + auto& condition = std::get(stack_frames_.back()); + environment_->set_condition(condition.name, condition.value); + stack_frames_.pop_back(); } break; case opcode::symbol_exists: { if (environment_->has_symbol(str) != (imm != 0)) @@ -1966,35 +2012,33 @@ class basic_parser goto failure; } break; case opcode::symbol_start: { - stack_frames_.push_back(stack_frame_type::symbol_definition); - symbol_definition_stack_.emplace_back(str, static_cast(sr)); + stack_frames_.emplace_back(std::in_place_type, str, sr); } break; case opcode::symbol_end: { - if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_definition)) - goto failure; - auto const [symbol_name, symbol_sr] = symbol_definition_stack_.back(); - auto const sr0 = static_cast(symbol_sr); + if (stack_frames_.empty()) + throw bad_stack{}; + auto const symbol = std::get(stack_frames_.back()); + auto const sr0 = static_cast(symbol.sr); auto const sr1 = sr; - pop_stack_frame(symbol_definition_stack_); + stack_frames_.pop_back(); if (sr0 > sr1) goto failure; - environment_->add_symbol(symbol_name, std::string{match().substr(sr0, sr1 - sr0)}); + environment_->add_symbol(symbol.name, std::string{input_source_.buffer().substr(sr0, sr1 - sr0)}); } break; case opcode::symbol_push: { - stack_frames_.push_back(stack_frame_type::symbol_table); - symbol_table_stack_.emplace_back(environment_->symbols_); + stack_frames_.emplace_back(std::in_place_type, environment_->symbols_); if (imm == 1) environment_->symbols_.erase(str); else if (imm == 2) environment_->symbols_.clear(); } break; case opcode::symbol_pop: { - if (stack_frames_.empty() || (stack_frames_.back() != stack_frame_type::symbol_table)) - goto failure; - environment_->symbols_.swap(symbol_table_stack_.back()); - pop_stack_frame(symbol_table_stack_); + if (stack_frames_.empty()) + throw bad_stack{}; + environment_->symbols_.swap(std::get(stack_frames_.back())); + stack_frames_.pop_back(); } break; - default: registers_ = {sr, (std::max)(mr, sr), rc, pc, 0}; throw bad_opcode{}; + default: registers_ = {sr, (std::max)(mr, sr), rc, pc}; throw bad_opcode{}; } } return result; From cd53baaee52d040f686e9a0acdbcdf75af85cba4 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sun, 4 Aug 2024 13:06:10 -0700 Subject: [PATCH 04/14] Updated package version and changelog --- CHANGELOG.md | 6 ++++++ CMakeLists.txt | 2 +- Makefile | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52473f5..7f2246f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Release v0.4.0 (Under Development) + +* Extracted common base class of `lug::basic_parser` into `lug::parser_base` to reduce template bloat. +* Reduced template bloat for parser directives (improving compiler error messages) and optimized nested directives. +* Merged parser stack frames together into a single stack using std::variant. + ## Release v0.3.0 (July 4, 2024) * Added list repetition operator `e1 >> e2` to the DSL that is shorthand for `e1 > *(e2 > e1)`. diff --git a/CMakeLists.txt b/CMakeLists.txt index 61d0ace..cf8b00b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.10) project( - lug VERSION 0.3.0 + lug VERSION 0.4.0 DESCRIPTION "Embedded DSL for PE grammar parser combinators in C++" HOMEPAGE_URL "https://github.com/jwtowner/lug" LANGUAGES CXX) diff --git a/Makefile b/Makefile index fc89f24..b498d6f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # See LICENSE file for copyright and license details # distribution version -VERSION = 0.3.0 +VERSION = 0.4.0 # paths PREFIX = /usr/local From e619a1b229709a5963f42803052c135a2577408c Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sun, 4 Aug 2024 13:42:50 -0700 Subject: [PATCH 05/14] Removed goto from parser machine main loop. --- .clang-tidy | 2 - CHANGELOG.md | 3 +- lug/lug.hpp | 159 +++++++++++++++++++++++++-------------------------- 3 files changed, 80 insertions(+), 84 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 9cc6212..a6cb299 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -11,7 +11,6 @@ Checks: - cppcoreguidelines-* - -cppcoreguidelines-avoid-magic-numbers # revisit after new instruction scheme, maybe only disable for unicode tables - -cppcoreguidelines-avoid-do-while # if removing do-while does not cause serious performance issues remove this check - - -cppcoreguidelines-avoid-goto # if removing goto does not cause serious performance issues remove this check - -cppcoreguidelines-pro-bounds-* # requires gsl::at and std::span to suppress, would prefer Standard Library hardening approach - -cppcoreguidelines-pro-type-union-access # remove after developing new instruction encoding scheme that doesn't use union - darwin-* @@ -21,7 +20,6 @@ Checks: - -google-readability-braces-around-statements # adversely affects line count - -google-runtime-int # revisit after new instruction scheme - hicpp-* - - -hicpp-avoid-goto # if removing goto does not cause serious performance issues remove this check - -hicpp-braces-around-statements # adversely affects line count - llvm-namespace-comment - misc-* diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f2246f..a229d65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,8 @@ * Extracted common base class of `lug::basic_parser` into `lug::parser_base` to reduce template bloat. * Reduced template bloat for parser directives (improving compiler error messages) and optimized nested directives. -* Merged parser stack frames together into a single stack using std::variant. +* Merged parser stack frames together into a single stack using `std::variant`. +* Removed use of `goto` in parser machine main loop. ## Release v0.3.0 (July 4, 2024) diff --git a/lug/lug.hpp b/lug/lug.hpp index 9a3b0dd..edf99e5 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -664,7 +664,7 @@ class basic_regular_expression : public terminal_encoder_expression_interface if (classes != unicode::ctype::none) encoder.match_class(classes); if (circumflex) - encoder.encode(opcode::commit, 0).encode(opcode::fail).match_any(); + encoder.encode(opcode::commit, 0).encode(opcode::fail, immediate{1}).match_any(); } runes.clear(), classes = unicode::ctype::none, circumflex = false; } @@ -839,7 +839,7 @@ inline constexpr directive_modifier [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; -struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{1}); return m; } }; +struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{2}); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::accept); return m; } }; @@ -969,7 +969,7 @@ struct negative_lookahead_expression : unary_encoder_expression_interface [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m)).evaluate(this->e1, m); - d.encode(opcode::fail, immediate{1}); + d.encode(opcode::fail, immediate{2}); return m2; } }; @@ -983,7 +983,7 @@ struct positive_lookahead_expression : unary_encoder_expression_interface [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { auto m2 = d.encode(opcode::choice, 2 + d.evaluate_length(this->e1, m)).evaluate(this->e1, m); - d.encode(opcode::commit_back, 1).encode(opcode::fail); + d.encode(opcode::commit_back, 1).encode(opcode::fail, immediate{1}); return m2; } }; @@ -1476,6 +1476,7 @@ class parser_base protected: static constexpr std::size_t lrfailcode = (std::numeric_limits::max)(); static constexpr std::size_t max_size = (std::numeric_limits::max)(); + struct response { std::size_t call_depth{0}; std::size_t action_index{0}; syntax_range range{0, 0}; constexpr response() noexcept = default; constexpr response(std::size_t c, std::size_t a, syntax_range const& r) noexcept : call_depth{c}, action_index{a}, range{r} {} }; struct backtrack_frame { std::size_t sr; std::size_t rc; std::ptrdiff_t pc; constexpr backtrack_frame(std::size_t s, std::size_t r, std::ptrdiff_t p) noexcept : sr{s}, rc{r}, pc{p} {} }; struct call_frame { std::ptrdiff_t pc; constexpr explicit call_frame(std::ptrdiff_t p) noexcept : pc{p} {} }; @@ -1485,6 +1486,7 @@ class parser_base struct symbol_frame { std::string_view name; std::size_t sr; constexpr symbol_frame(std::string_view n, std::size_t s) noexcept : name{n}, sr{s} {} }; using symbol_table_frame = std::unordered_map>; using stack_frame = std::variant; + lug::grammar const* grammar_; lug::environment* environment_; std::vector responses_; @@ -1663,20 +1665,20 @@ class basic_parser : public parser_base } template - [[nodiscard]] bool match_sequence(std::size_t& sr, std::string_view str, Compare const& comp) + [[nodiscard]] std::ptrdiff_t match_sequence(std::size_t& sr, std::string_view str, Compare const& comp) { if (std::size_t const sn = str.size(); !sn || (available(sr, sn) && comp(*this, sr, sn, str))) { sr += sn; - return true; + return 0; } - return false; + return 1; } template - [[nodiscard]] bool match_single(std::size_t& sr, Match const& match) + [[nodiscard]] std::ptrdiff_t match_single(std::size_t& sr, Match const& match) { if (!available(sr, 1)) - return false; + return 1; auto const buffer = input_source_.buffer(); auto const curr = buffer.cbegin() + static_cast(sr); auto const last = buffer.cend(); @@ -1692,41 +1694,43 @@ class basic_parser : public parser_base matched = match(); std::ignore = rune; } - if (matched) + if (matched) { sr += static_cast(std::distance(curr, next)); - return matched; + return 0; + } + return 1; } template - [[nodiscard]] bool match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) + [[nodiscard]] std::ptrdiff_t match_symbol_all(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { auto const& symbols = environment_->get_symbols(symbol_name); - if (std::size_t tsr = sr; std::all_of(symbols.begin(), symbols.end(), [&tsr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(tsr, mod(symbol), comp); })) { + if (std::size_t tsr = sr; std::all_of(symbols.begin(), symbols.end(), [&tsr, &mod, &comp, this](auto const& symbol) { return (this->match_sequence(tsr, mod(symbol), comp) == 0); })) { sr = tsr; - return true; + return 0; } - return false; + return 1; } template - [[nodiscard]] bool match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) + [[nodiscard]] std::ptrdiff_t match_symbol_any(std::size_t& sr, std::string_view symbol_name, Modify const& mod, Compare const& comp) { auto const& symbols = environment_->get_symbols(symbol_name); - return std::any_of(symbols.begin(), symbols.end(), [&sr, &mod, &comp, this](auto const& symbol) { return this->match_sequence(sr, mod(symbol), comp); }); + return std::any_of(symbols.begin(), symbols.end(), [&sr, &mod, &comp, this](auto const& symbol) { return (this->match_sequence(sr, mod(symbol), comp) == 0); }) ? 0 : 1; } template - [[nodiscard]] bool match_symbol_head(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) + [[nodiscard]] std::ptrdiff_t match_symbol_head(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { auto const& symbols = environment_->get_symbols(symbol_name); - return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbol_index]), std::forward(comp)) : false; + return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbol_index]), std::forward(comp)) : 1; } template - [[nodiscard]] bool match_symbol_tail(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) + [[nodiscard]] std::ptrdiff_t match_symbol_tail(std::size_t& sr, std::string_view symbol_name, std::size_t symbol_index, Modify&& mod, Compare&& comp) { auto const& symbols = environment_->get_symbols(symbol_name); - return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : false; + return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : 1; } void accept(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) @@ -1800,48 +1804,42 @@ class basic_parser : public parser_base auto [op, imm, off, str] = instruction::decode(prog.instructions, pc); switch (op) { case opcode::match: { - if (!match_sequence(sr, str, std::mem_fn(&basic_parser::compare))) - goto failure; + fc = match_sequence(sr, str, std::mem_fn(&basic_parser::compare)); } break; case opcode::match_cf: { - if (!match_sequence(sr, str, std::mem_fn(&basic_parser::casefold_compare))) - goto failure; + fc = match_sequence(sr, str, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::match_any: { if constexpr (detail::input_source_has_options::value) { - if (((imm & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) - goto failure; + if (((imm & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) { + fc = 1; + break; + } } - if (!match_single(sr, []{ return true; })) - goto failure; + fc = match_single(sr, []{ return true; }); } break; case opcode::match_any_of: { - if (!match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::any_of(r, pe, s); })) - goto failure; + fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::any_of(r, pe, s); }); } break; case opcode::match_all_of: { - if (!match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::all_of(r, pe, s); })) - goto failure; + fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::all_of(r, pe, s); }); } break; case opcode::match_none_of: { - if (!match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::none_of(r, pe, s); })) - goto failure; + fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::none_of(r, pe, s); }); } break; case opcode::match_set: { - if (!match_single(sr, [&runes = prog.runesets[imm]](char32_t rune) { + fc = match_single(sr, [&runes = prog.runesets[imm]](char32_t rune) { auto const interval = std::lower_bound(runes.begin(), runes.end(), rune, [](auto& x, auto& y) { return x.second < y; }); - return interval != runes.end() && interval->first <= rune && rune <= interval->second; })) - goto failure; + return interval != runes.end() && interval->first <= rune && rune <= interval->second; }); } break; case opcode::match_eol: { - if (!match_single(sr, [](auto curr, auto last, auto& next, char32_t rune) { + fc = match_single(sr, [](auto curr, auto last, auto& next, char32_t rune) { if (curr == next || (unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None) return false; if (U'\r' == rune) if (auto const [next2, rune2] = utf8::decode_rune(next, last); next2 != next && rune2 == U'\n') next = next2; - return true; })) - goto failure; + return true; }); } break; case opcode::choice: { stack_frames_.emplace_back(std::in_place_type, sr - imm, rc, pc + off); @@ -1871,8 +1869,10 @@ class basic_parser : public parser_base }); if (frame_it != stack_frames_.crend()) { auto const& memo = std::get(*frame_it); - if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) - goto failure; + if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) { + fc = 1; + break; + } sr = memo.sra; rc = restore_responses_after(rc, memo.responses); continue; @@ -1911,18 +1911,6 @@ class basic_parser : public parser_base } break; case opcode::fail: { fc = static_cast(imm); - failure: - for (mr = (std::max)(mr, sr), ++fc; fc > 0; --fc) { - if (done = (cut_frame_ >= stack_frames_.size()); done) { - registers_ = {sr, mr, rc, pc}; - break; - } - auto const fail_result = fail_one(sr, rc, pc); - fc += fail_result.first; - if (fail_result.second) - accept_if_deferred(sr, mr, rc, pc); - } - pop_responses_after(rc); } break; case opcode::accept: { if (cut_deferred_ = (cut_inhibited_ > 0); !cut_deferred_) { @@ -1943,8 +1931,7 @@ class basic_parser : public parser_base bool const accepted = prog.predicates[imm](*environment_); std::tie(sr, mr, rc, pc) = registers_.as_tuple(); pop_responses_after(rc); - if (!accepted) - goto failure; + fc = accepted ? 0 : 1; } break; case opcode::capture_start: { stack_frames_.emplace_back(std::in_place_type, sr); @@ -1957,13 +1944,14 @@ class basic_parser : public parser_base auto const sr1 = sr; stack_frames_.pop_back(); accept_if_deferred(sr, mr, rc, pc); - if (sr0 > sr1) - goto failure; + if (sr0 > sr1) { + fc = 1; + break; + } rc = push_response(call_depth_, imm, syntax_range{sr0, sr1 - sr0}); } break; case opcode::condition_test: { - if (environment_->has_condition(str) != (imm != 0)) - goto failure; + fc = (environment_->has_condition(str) == (imm != 0)) ? 0 : 1; } break; case opcode::condition_push: { stack_frames_.emplace_back(std::in_place_type, str, environment_->set_condition(str, imm != 0)); @@ -1976,40 +1964,31 @@ class basic_parser : public parser_base stack_frames_.pop_back(); } break; case opcode::symbol_exists: { - if (environment_->has_symbol(str) != (imm != 0)) - goto failure; + fc = (environment_->has_symbol(str) == (imm != 0)) ? 0 : 1; } break; case opcode::symbol_all: { - if (!match_symbol_all(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare))) - goto failure; + fc = match_symbol_all(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_all_cf: { - if (!match_symbol_all(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare))) - goto failure; + fc = match_symbol_all(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_any: { - if (!match_symbol_any(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare))) - goto failure; + fc = match_symbol_any(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_any_cf: { - if (!match_symbol_any(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare))) - goto failure; + fc = match_symbol_any(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_head: { - if (!match_symbol_head(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare))) - goto failure; + fc = match_symbol_head(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_head_cf: { - if (!match_symbol_head(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare))) - goto failure; + fc = match_symbol_head(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_tail: { - if (!match_symbol_tail(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare))) - goto failure; + fc = match_symbol_tail(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_tail_cf: { - if (!match_symbol_tail(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare))) - goto failure; + fc = match_symbol_tail(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_start: { stack_frames_.emplace_back(std::in_place_type, str, sr); @@ -2021,8 +2000,10 @@ class basic_parser : public parser_base auto const sr0 = static_cast(symbol.sr); auto const sr1 = sr; stack_frames_.pop_back(); - if (sr0 > sr1) - goto failure; + if (sr0 > sr1) { + fc = 1; + break; + } environment_->add_symbol(symbol.name, std::string{input_source_.buffer().substr(sr0, sr1 - sr0)}); } break; case opcode::symbol_push: { @@ -2040,6 +2021,22 @@ class basic_parser : public parser_base } break; default: registers_ = {sr, (std::max)(mr, sr), rc, pc}; throw bad_opcode{}; } + if (fc > 0) { + mr = (std::max)(mr, sr); + do { + if (done = (cut_frame_ >= stack_frames_.size()); done) { + registers_ = {sr, mr, rc, pc}; + fc = 0; + break; + } + auto const fail_result = fail_one(sr, rc, pc); + fc += fail_result.first; + if (fail_result.second) + accept_if_deferred(sr, mr, rc, pc); + --fc; + } while (fc > 0); + pop_responses_after(rc); + } } return result; } From 3b305c0d83ad1cfefcba7d1d18a7397240fd1d2d Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Sun, 4 Aug 2024 14:31:56 -0700 Subject: [PATCH 06/14] Extracted call/return code into parser_base class. --- lug/lug.hpp | 118 +++++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index edf99e5..045d672 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -1545,6 +1545,68 @@ class parser_base return responses_.size(); } + [[nodiscard]] std::ptrdiff_t call_into(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc, unsigned short imm, int off) + { + if (imm == 0) { + stack_frames_.emplace_back(std::in_place_type, pc); + ++call_depth_; + pc += off; + return 0; + } + auto const frame_it = detail::escaping_find_if(stack_frames_.crbegin(), stack_frames_.crend(), [srr = sr, pca = pc + off](auto const& frame) { + if (auto const* const memo_ptr = std::get_if(&frame); memo_ptr != nullptr) { + if ((memo_ptr->srr == srr) && (memo_ptr->pca == pca)) + return 1; + if (memo_ptr->srr >= srr) + return -1; + } + return 0; + }); + if (frame_it != stack_frames_.crend()) { + auto const& memo = std::get(*frame_it); + if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) + return 1; + sr = memo.sra; + rc = restore_responses_after(rc, memo.responses); + return 0; + } + stack_frames_.emplace_back(std::in_place_type, sr, parser_base::lrfailcode, imm, pc, pc + off, rc); + ++cut_inhibited_; + ++call_depth_; + pc += off; + return 0; + } + + [[nodiscard]] bool return_from(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc) + { + if (stack_frames_.empty()) + throw bad_stack{}; + auto& frame = stack_frames_.back(); + if (auto* const call = std::get_if(&frame); call != nullptr) { + --call_depth_; + pc = call->pc; + stack_frames_.pop_back(); + return false; + } + if (auto* const memo = std::get_if(&frame); memo != nullptr) { + if ((memo->sra == parser_base::lrfailcode) || (sr > memo->sra)) { + memo->sra = sr; + memo->responses = drop_responses_after(memo->rcr); + sr = memo->srr; + pc = memo->pca; + rc = memo->rcr; + return false; + } + --call_depth_; + sr = memo->sra; + pc = memo->pcr; + rc = restore_responses_after(memo->rcr, memo->responses); + stack_frames_.pop_back(); + return true; + } + throw bad_stack{}; + } + [[nodiscard]] std::pair fail_one(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc) { auto const fail_result = std::visit([this, &sr, &rc, &pc](auto& frame) -> std::pair { @@ -1830,14 +1892,14 @@ class basic_parser : public parser_base case opcode::match_set: { fc = match_single(sr, [&runes = prog.runesets[imm]](char32_t rune) { auto const interval = std::lower_bound(runes.begin(), runes.end(), rune, [](auto& x, auto& y) { return x.second < y; }); - return interval != runes.end() && interval->first <= rune && rune <= interval->second; }); + return (interval != runes.end()) && (interval->first <= rune) && (rune <= interval->second); }); } break; case opcode::match_eol: { fc = match_single(sr, [](auto curr, auto last, auto& next, char32_t rune) { - if (curr == next || (unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None) + if ((curr == next) || ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None)) return false; if (U'\r' == rune) - if (auto const [next2, rune2] = utf8::decode_rune(next, last); next2 != next && rune2 == U'\n') + if (auto const [next2, rune2] = utf8::decode_rune(next, last); (next2 != next) && (rune2 == U'\n')) next = next2; return true; }); } break; @@ -1857,57 +1919,11 @@ class basic_parser : public parser_base pc += off; } break; case opcode::call: { - if (imm != 0) { - auto const frame_it = detail::escaping_find_if(stack_frames_.crbegin(), stack_frames_.crend(), [srr = sr, pca = pc + off](auto const& frame) { - if (auto const* const memo_ptr = std::get_if(&frame); memo_ptr != nullptr) { - if ((memo_ptr->srr == srr) && (memo_ptr->pca == pca)) - return 1; - if (memo_ptr->srr >= srr) - return -1; - } - return 0; - }); - if (frame_it != stack_frames_.crend()) { - auto const& memo = std::get(*frame_it); - if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) { - fc = 1; - break; - } - sr = memo.sra; - rc = restore_responses_after(rc, memo.responses); - continue; - } - stack_frames_.emplace_back(std::in_place_type, sr, parser_base::lrfailcode, imm, pc, pc + off, rc); - ++cut_inhibited_; - } else { - stack_frames_.emplace_back(std::in_place_type, pc); - } - ++call_depth_; - pc += off; + fc = call_into(sr, rc, pc, imm, off); } break; case opcode::ret: { - if (stack_frames_.empty()) - throw bad_stack{}; - auto& frame = stack_frames_.back(); - if (auto* const call = std::get_if(&frame); call != nullptr) { - --call_depth_; - pc = call->pc; - stack_frames_.pop_back(); - } else if (auto* const memo = std::get_if(&frame); memo != nullptr) { - if ((memo->sra == parser_base::lrfailcode) || (sr > memo->sra)) { - memo->sra = sr, memo->responses = drop_responses_after(memo->rcr); - sr = memo->srr, pc = memo->pca, rc = memo->rcr; - continue; - } - --call_depth_; - sr = memo->sra; - pc = memo->pcr; - rc = restore_responses_after(memo->rcr, memo->responses); - stack_frames_.pop_back(); + if (return_from(sr, rc, pc)) accept_if_deferred(sr, mr, rc, pc); - } else { - throw bad_stack{}; - } } break; case opcode::fail: { fc = static_cast(imm); From b321211d43a7543cccfd1f5ab17e5eedc3052e59 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Mon, 5 Aug 2024 08:36:32 -0700 Subject: [PATCH 07/14] Implemented new register layout. (Fixes #17) --- lug/lug.hpp | 245 ++++++++++++++++++++++++++-------------------------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 045d672..4e9eae6 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -49,12 +49,25 @@ template inline constexpr bool is_capture_target_v = std::is_same_v::digits - 1); + std::size_t ar; // accumulator register + std::size_t sr; // subject register + std::size_t mr; // match register + std::size_t rc; // response counter + std::size_t cd; // call depth + std::size_t cf; // cut frame + std::size_t ci; // cut inhibited + std::ptrdiff_t pc; // program counter +}; + enum class opcode : unsigned char { match, match_cf, match_any, match_any_of, match_all_of, match_none_of, match_set, match_eol, choice, commit, commit_back, commit_partial, jump, call, ret, - fail, accept, accept_final, action, predicate, + fail, cut, halt, action, predicate, capture_start, capture_end, condition_test, condition_push, condition_pop, symbol_exists, symbol_all, symbol_all_cf, symbol_any, symbol_any_cf, symbol_head, symbol_head_cf, symbol_tail, symbol_tail_cf, symbol_start, @@ -841,7 +854,7 @@ struct nop_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{2}); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; -struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::accept); return m; } }; +struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::cut); return m; } }; template struct match_class_combinator @@ -1345,7 +1358,7 @@ class implicit_space_rule std::vector> calls; std::unordered_set left_recursive; std::vector>, program const*>> unprocessed; - program_encoder{grprogram, grcallees, directives::eps | directives::preskip}.call(start_rule, 1, false).encode(opcode::accept_final); + program_encoder{grprogram, grcallees, directives::eps | directives::preskip}.call(start_rule, 1, false).encode(opcode::halt); calls.emplace_back(&start_rule.program_, std::get<2>(grcallees.back())); unprocessed.emplace_back(std::vector>{{&start_rule, false}}, &start_rule.program_); do { @@ -1464,13 +1477,6 @@ class string_view_input_source template > void enqueue(It first, It last) { buffer_ = std::string_view{&(*first), static_cast(last - first)}; } }; -struct parser_registers -{ - std::size_t sr{0}; std::size_t mr{0}; std::size_t rc{0}; std::ptrdiff_t pc{0}; - [[nodiscard]] auto as_tuple() noexcept { return std::forward_as_tuple(sr, mr, rc, pc); } - [[nodiscard]] auto as_tuple() const noexcept { return std::forward_as_tuple(sr, mr, rc, pc); } -}; - class parser_base { protected: @@ -1492,28 +1498,24 @@ class parser_base std::vector responses_; std::vector stack_frames_; std::unordered_map casefolded_subjects_; - parser_registers registers_{0, 0, 0, 0}; - std::size_t call_depth_{0}; - std::size_t cut_frame_{0}; - std::ptrdiff_t cut_inhibited_{0}; - bool cut_deferred_{false}; + lug::registers registers_{0, 0, 0, 0, 0, 0, 0, 0}; bool parsing_{false}; template - void commit([[maybe_unused]] std::size_t& sr, [[maybe_unused]] std::size_t& rc, std::ptrdiff_t& pc, int off) + void commit(int off) { if (stack_frames_.empty()) throw bad_stack{}; auto& backtrack = std::get(stack_frames_.back()); if constexpr (Opcode == opcode::commit_partial) { - backtrack.sr = sr; - backtrack.rc = rc; + backtrack.sr = registers_.sr; + backtrack.rc = registers_.rc; } else { if constexpr (Opcode == opcode::commit_back) - sr = backtrack.sr; + registers_.sr = backtrack.sr; stack_frames_.pop_back(); } - pc += off; + registers_.pc += off; } void pop_responses_after(std::size_t n) @@ -1545,15 +1547,15 @@ class parser_base return responses_.size(); } - [[nodiscard]] std::ptrdiff_t call_into(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc, unsigned short imm, int off) + [[nodiscard]] std::ptrdiff_t call_into(unsigned short imm, int off) { if (imm == 0) { - stack_frames_.emplace_back(std::in_place_type, pc); - ++call_depth_; - pc += off; + stack_frames_.emplace_back(std::in_place_type, registers_.pc); + ++registers_.cd; + registers_.pc += off; return 0; } - auto const frame_it = detail::escaping_find_if(stack_frames_.crbegin(), stack_frames_.crend(), [srr = sr, pca = pc + off](auto const& frame) { + auto const frame_it = detail::escaping_find_if(stack_frames_.crbegin(), stack_frames_.crend(), [srr = registers_.sr, pca = registers_.pc + off](auto const& frame) { if (auto const* const memo_ptr = std::get_if(&frame); memo_ptr != nullptr) { if ((memo_ptr->srr == srr) && (memo_ptr->pca == pca)) return 1; @@ -1566,58 +1568,58 @@ class parser_base auto const& memo = std::get(*frame_it); if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) return 1; - sr = memo.sra; - rc = restore_responses_after(rc, memo.responses); + registers_.sr = memo.sra; + registers_.rc = restore_responses_after(registers_.rc, memo.responses); return 0; } - stack_frames_.emplace_back(std::in_place_type, sr, parser_base::lrfailcode, imm, pc, pc + off, rc); - ++cut_inhibited_; - ++call_depth_; - pc += off; + stack_frames_.emplace_back(std::in_place_type, registers_.sr, parser_base::lrfailcode, imm, registers_.pc, registers_.pc + off, registers_.rc); + ++registers_.cd; + ++registers_.ci; + registers_.pc += off; return 0; } - [[nodiscard]] bool return_from(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc) + [[nodiscard]] bool return_from() { if (stack_frames_.empty()) throw bad_stack{}; auto& frame = stack_frames_.back(); if (auto* const call = std::get_if(&frame); call != nullptr) { - --call_depth_; - pc = call->pc; + --registers_.cd; + registers_.pc = call->pc; stack_frames_.pop_back(); return false; } if (auto* const memo = std::get_if(&frame); memo != nullptr) { - if ((memo->sra == parser_base::lrfailcode) || (sr > memo->sra)) { - memo->sra = sr; + if ((memo->sra == parser_base::lrfailcode) || (registers_.sr > memo->sra)) { + memo->sra = registers_.sr; memo->responses = drop_responses_after(memo->rcr); - sr = memo->srr; - pc = memo->pca; - rc = memo->rcr; + registers_.sr = memo->srr; + registers_.pc = memo->pca; + registers_.rc = memo->rcr; return false; } - --call_depth_; - sr = memo->sra; - pc = memo->pcr; - rc = restore_responses_after(memo->rcr, memo->responses); + --registers_.cd; + registers_.sr = memo->sra; + registers_.pc = memo->pcr; + registers_.rc = restore_responses_after(memo->rcr, memo->responses); stack_frames_.pop_back(); return true; } throw bad_stack{}; } - [[nodiscard]] std::pair fail_one(std::size_t& sr, std::size_t& rc, std::ptrdiff_t& pc) + [[nodiscard]] std::pair fail_one() { - auto const fail_result = std::visit([this, &sr, &rc, &pc](auto& frame) -> std::pair { + auto const fail_result = std::visit([this](auto& frame) -> std::pair { using frame_type = std::decay_t; if constexpr (std::is_same_v) { - sr = frame.sr; - rc = frame.rc; - pc = frame.pc; + registers_.sr = frame.sr; + registers_.rc = frame.rc; + registers_.pc = frame.pc; return {0, false}; } else if constexpr (std::is_same_v) { - --call_depth_; + --registers_.cd; return {1, false}; } else if constexpr (std::is_same_v) { return {1, true}; @@ -1625,11 +1627,11 @@ class parser_base environment_->set_condition(frame.name, frame.value); return {1, false}; } else if constexpr (std::is_same_v) { - --call_depth_; + --registers_.cd; if (frame.sra != parser_base::lrfailcode) { - sr = frame.sra; - rc = restore_responses_after(frame.rcr, frame.responses); - pc = frame.pcr; + registers_.sr = frame.sra; + registers_.rc = restore_responses_after(frame.rcr, frame.responses); + registers_.pc = frame.pcr; return {0, true}; } return {1, true}; @@ -1662,7 +1664,7 @@ class parser_base } } - [[nodiscard]] auto do_drain() + void do_drain() { environment_->reset_origin(); casefolded_subjects_.clear(); @@ -1670,9 +1672,8 @@ class parser_base registers_.mr -= registers_.sr; registers_.sr = 0; registers_.rc = 0; - cut_frame_ = stack_frames_.size(); - cut_deferred_ = false; - return registers_.as_tuple(); + registers_.cf = stack_frames_.size(); + registers_.ci &= ~lug::registers::cut_deferred_flag; } public: @@ -1687,8 +1688,8 @@ class parser_base [[nodiscard]] syntax_position position_begin(syntax_range const& range) { return environment_->position_at(range.index); } [[nodiscard]] syntax_position position_end(syntax_range const& range) { return environment_->position_at(range.index + range.size); } [[nodiscard]] std::pair position_range(syntax_range const& range) { return {position_begin(range), position_end(range)}; } - [[nodiscard]] parser_registers& registers() noexcept { return registers_; } - [[nodiscard]] parser_registers const& registers() const noexcept { return registers_; } + [[nodiscard]] lug::registers& registers() noexcept { return registers_; } + [[nodiscard]] lug::registers const& registers() const noexcept { return registers_; } }; template @@ -1795,25 +1796,25 @@ class basic_parser : public parser_base return (symbol_index < symbols.size()) ? match_sequence(sr, mod(symbols[symbols.size() - symbol_index - 1]), std::forward(comp)) : 1; } - void accept(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) + void accept() { - registers_ = {sr, (std::max)(mr, sr), rc, pc}; + registers_.mr = (std::max)(registers_.mr, registers_.sr); do_accept(match(), subject()); } - void accept_if_deferred(std::size_t sr, std::size_t mr, std::size_t rc, std::ptrdiff_t pc) + void accept_if_deferred() { - --cut_inhibited_; - if (cut_deferred_ && (cut_inhibited_ > 0)) { - accept(sr, mr, rc, pc); - cut_deferred_ = false; + --registers_.ci; + if (registers_.ci == lug::registers::cut_deferred_flag) { + registers_.ci &= ~lug::registers::cut_deferred_flag; + accept(); } } - [[nodiscard]] auto drain() + void drain() { input_source_.drain_buffer(registers_.sr); - return do_drain(); + do_drain(); } public: @@ -1826,7 +1827,7 @@ class basic_parser : public parser_base basic_parser& enqueue(InputIt first, InputIt last) { if constexpr (detail::input_source_enqueue_drains::value) - (void)drain(); + drain(); input_source_.enqueue(std::move(first), std::move(last)); return *this; } @@ -1856,20 +1857,18 @@ class basic_parser : public parser_base program const& prog = grammar_->program(); if (prog.instructions.empty()) throw bad_grammar{}; - call_depth_ = 0; - cut_inhibited_ = 0; - std::size_t sr = 0, mr = 0, rc = 0; - std::ptrdiff_t pc = 0, fc = 0; + drain(); + registers_.pc = 0; + std::ptrdiff_t fc = 0; bool result = false, done = false; - std::tie(sr, mr, rc, std::ignore) = drain(); while (!done) { - auto [op, imm, off, str] = instruction::decode(prog.instructions, pc); + auto [op, imm, off, str] = instruction::decode(prog.instructions, registers_.pc); switch (op) { case opcode::match: { - fc = match_sequence(sr, str, std::mem_fn(&basic_parser::compare)); + fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::compare)); } break; case opcode::match_cf: { - fc = match_sequence(sr, str, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::match_any: { if constexpr (detail::input_source_has_options::value) { @@ -1878,24 +1877,24 @@ class basic_parser : public parser_base break; } } - fc = match_single(sr, []{ return true; }); + fc = match_single(registers_.sr, []{ return true; }); } break; case opcode::match_any_of: { - fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::any_of(r, pe, s); }); + fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::any_of(r, pe, s); }); } break; case opcode::match_all_of: { - fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::all_of(r, pe, s); }); + fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::all_of(r, pe, s); }); } break; case opcode::match_none_of: { - fc = match_single(sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::none_of(r, pe, s); }); + fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::none_of(r, pe, s); }); } break; case opcode::match_set: { - fc = match_single(sr, [&runes = prog.runesets[imm]](char32_t rune) { + fc = match_single(registers_.sr, [&runes = prog.runesets[imm]](char32_t rune) { auto const interval = std::lower_bound(runes.begin(), runes.end(), rune, [](auto& x, auto& y) { return x.second < y; }); return (interval != runes.end()) && (interval->first <= rune) && (rune <= interval->second); }); } break; case opcode::match_eol: { - fc = match_single(sr, [](auto curr, auto last, auto& next, char32_t rune) { + fc = match_single(registers_.sr, [](auto curr, auto last, auto& next, char32_t rune) { if ((curr == next) || ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None)) return false; if (U'\r' == rune) @@ -1904,67 +1903,68 @@ class basic_parser : public parser_base return true; }); } break; case opcode::choice: { - stack_frames_.emplace_back(std::in_place_type, sr - imm, rc, pc + off); + stack_frames_.emplace_back(std::in_place_type, registers_.sr - imm, registers_.rc, registers_.pc + off); } break; case opcode::commit: { - commit(sr, rc, pc, off); + commit(off); } break; case opcode::commit_back: { - commit(sr, rc, pc, off); + commit(off); } break; case opcode::commit_partial: { - commit(sr, rc, pc, off); + commit(off); } break; case opcode::jump: { - pc += off; + registers_.pc += off; } break; case opcode::call: { - fc = call_into(sr, rc, pc, imm, off); + fc = call_into(imm, off); } break; case opcode::ret: { - if (return_from(sr, rc, pc)) - accept_if_deferred(sr, mr, rc, pc); + if (return_from()) + accept_if_deferred(); } break; case opcode::fail: { fc = static_cast(imm); } break; - case opcode::accept: { - if (cut_deferred_ = (cut_inhibited_ > 0); !cut_deferred_) { - accept(sr, mr, rc, pc); - std::tie(sr, mr, rc, pc) = drain(); + case opcode::cut: { + if (registers_.ci == 0) { + accept(); + drain(); + } else { + registers_.ci |= lug::registers::cut_deferred_flag; } } break; - case opcode::accept_final: { - accept(sr, mr, rc, pc); + case opcode::halt: { + accept(); result = done = true; } break; case opcode::action: { - rc = push_response(call_depth_, imm); + registers_.rc = push_response(registers_.cd, imm); } break; case opcode::predicate: { - registers_ = {sr, (std::max)(mr, sr), rc, pc}; + registers_.mr = (std::max)(registers_.mr, registers_.sr); environment_->reset_match_and_subject(match(), subject()); bool const accepted = prog.predicates[imm](*environment_); - std::tie(sr, mr, rc, pc) = registers_.as_tuple(); - pop_responses_after(rc); + pop_responses_after(registers_.rc); fc = accepted ? 0 : 1; } break; case opcode::capture_start: { - stack_frames_.emplace_back(std::in_place_type, sr); - ++cut_inhibited_; + stack_frames_.emplace_back(std::in_place_type, registers_.sr); + ++registers_.ci; } break; case opcode::capture_end: { if (stack_frames_.empty()) throw bad_stack{}; auto const sr0 = std::get(stack_frames_.back()).sr; - auto const sr1 = sr; + auto const sr1 = registers_.sr; stack_frames_.pop_back(); - accept_if_deferred(sr, mr, rc, pc); + accept_if_deferred(); if (sr0 > sr1) { fc = 1; break; } - rc = push_response(call_depth_, imm, syntax_range{sr0, sr1 - sr0}); + registers_.rc = push_response(registers_.cd, imm, syntax_range{sr0, sr1 - sr0}); } break; case opcode::condition_test: { fc = (environment_->has_condition(str) == (imm != 0)) ? 0 : 1; @@ -1983,38 +1983,38 @@ class basic_parser : public parser_base fc = (environment_->has_symbol(str) == (imm != 0)) ? 0 : 1; } break; case opcode::symbol_all: { - fc = match_symbol_all(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_all(registers_.sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_all_cf: { - fc = match_symbol_all(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_all(registers_.sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_any: { - fc = match_symbol_any(sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_any(registers_.sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_any_cf: { - fc = match_symbol_any(sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_any(registers_.sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_head: { - fc = match_symbol_head(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_head(registers_.sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_head_cf: { - fc = match_symbol_head(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_head(registers_.sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_tail: { - fc = match_symbol_tail(sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_tail(registers_.sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_tail_cf: { - fc = match_symbol_tail(sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_tail(registers_.sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_start: { - stack_frames_.emplace_back(std::in_place_type, str, sr); + stack_frames_.emplace_back(std::in_place_type, str, registers_.sr); } break; case opcode::symbol_end: { if (stack_frames_.empty()) throw bad_stack{}; auto const symbol = std::get(stack_frames_.back()); auto const sr0 = static_cast(symbol.sr); - auto const sr1 = sr; + auto const sr1 = registers_.sr; stack_frames_.pop_back(); if (sr0 > sr1) { fc = 1; @@ -2035,23 +2035,22 @@ class basic_parser : public parser_base environment_->symbols_.swap(std::get(stack_frames_.back())); stack_frames_.pop_back(); } break; - default: registers_ = {sr, (std::max)(mr, sr), rc, pc}; throw bad_opcode{}; + default: registers_.mr = (std::max)(registers_.mr, registers_.sr); throw bad_opcode{}; } if (fc > 0) { - mr = (std::max)(mr, sr); + registers_.mr = (std::max)(registers_.mr, registers_.sr); do { - if (done = (cut_frame_ >= stack_frames_.size()); done) { - registers_ = {sr, mr, rc, pc}; + if (done = (registers_.cf >= stack_frames_.size()); done) { fc = 0; break; } - auto const fail_result = fail_one(sr, rc, pc); + auto const fail_result = fail_one(); fc += fail_result.first; if (fail_result.second) - accept_if_deferred(sr, mr, rc, pc); + accept_if_deferred(); --fc; } while (fc > 0); - pop_responses_after(rc); + pop_responses_after(registers_.rc); } } return result; From 67070fadb070c3f606c3fe2b2bf8c3026d6f09e3 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Mon, 5 Aug 2024 16:10:45 -0700 Subject: [PATCH 08/14] Implemented new instruction encoding scheme that is simpler and more efficient. Fixes #17. --- lug/detail.hpp | 17 +- lug/lug.hpp | 451 +++++++++++++++++++++++-------------------------- 2 files changed, 218 insertions(+), 250 deletions(-) diff --git a/lug/detail.hpp b/lug/detail.hpp index 5ef8926..9182402 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -293,14 +293,14 @@ template >> scope_exit(Fn) -> scope_exit>; template -inline void assure_in_range(T x, U minval, V maxval) +constexpr void assure_in_range(T x, U minval, V maxval) { if (!((minval <= x) && (x <= maxval))) throw Error(); } template -[[nodiscard]] inline auto checked_add(T x, U y) +[[nodiscard]] constexpr auto checked_add(T x, U y) { if (((std::numeric_limits::max)() - x) < y) throw Error(); @@ -308,7 +308,7 @@ template } template -[[nodiscard]] inline InputIt escaping_find_if(InputIt first, InputIt last, UnaryPredicate pred) +[[nodiscard]] constexpr InputIt escaping_find_if(InputIt first, InputIt last, UnaryPredicate pred) { for ( ; first != last; ++first) { const int status = pred(*first); @@ -320,17 +320,8 @@ template return last; } -template -inline std::size_t push_back_unique(Sequence& s, T&& x) -{ - if (auto i = std::find(std::cbegin(s), std::cend(s), x); i != std::cend(s)) - return static_cast(std::distance(std::cbegin(s), i)); - s.push_back(std::forward(x)); - return s.size() - 1; -} - template -[[nodiscard]] inline auto pop_back(Sequence& s) -> typename Sequence::value_type +[[nodiscard]] constexpr auto pop_back(Sequence& s) -> typename Sequence::value_type { typename Sequence::value_type result{std::move(s.back())}; // NOLINT(misc-const-correctness) s.pop_back(); diff --git a/lug/lug.hpp b/lug/lug.hpp index 4e9eae6..2a7d179 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,6 @@ template inline constexpr bool is_capture_target_v = std::is_same_v::digits - 1); - std::size_t ar; // accumulator register std::size_t sr; // subject register std::size_t mr; // match register std::size_t rc; // response counter @@ -62,60 +62,29 @@ struct registers std::ptrdiff_t pc; // program counter }; -enum class opcode : unsigned char +enum class opcode : std::uint_least8_t { - match, match_cf, match_any, match_any_of, match_all_of, - match_none_of, match_set, match_eol, choice, commit, - commit_back, commit_partial, jump, call, ret, - fail, cut, halt, action, predicate, - capture_start, capture_end, condition_test, condition_push, condition_pop, - symbol_exists, symbol_all, symbol_all_cf, symbol_any, symbol_any_cf, - symbol_head, symbol_head_cf, symbol_tail, symbol_tail_cf, symbol_start, - symbol_end, symbol_push, symbol_pop + choice, commit, commit_back, commit_partial, jump, + call, ret, fail, cut, halt, + predicate, action, capture_start, capture_end, condition_pop, + symbol_end, symbol_pop, match_any, match_set, match_eol, + match = 32, match_cf, match_any_of, match_all_of, match_none_of, + condition_test, condition_push, symbol_exists, symbol_all, symbol_all_cf, + symbol_any, symbol_any_cf, symbol_head, symbol_head_cf, symbol_tail, + symbol_tail_cf, symbol_start, symbol_push }; -enum class immediate : unsigned short {}; -enum class operands : unsigned char { none = 0, off = 0x40, str = 0x80, is_bitfield_enum }; - -union instruction +struct alignas(std::uint_least64_t) instruction { - static constexpr std::size_t maxstrlen = 256; - struct prefix { opcode op; operands aux; unsigned short val; } pf; - int off; - std::array str; - - instruction(opcode op, operands aux, immediate imm) noexcept : pf{op, aux, static_cast(imm)} {} - explicit instruction(std::ptrdiff_t o) : off{static_cast(o)} { if (off != o) throw program_limit_error{}; } - explicit instruction(std::string_view s) : str{} { std::fill(std::copy_n(s.begin(), (std::min)(s.size(), std::size_t{4}), str.begin()), str.end(), char{0}); } - - [[nodiscard]] static auto decode(std::vector const& code, std::ptrdiff_t& pc) - { - prefix const pf = code[static_cast(pc++)].pf; - int const off = ((pf.aux & operands::off) != operands::none) ? code[static_cast(pc++)].off : 0; - unsigned short imm = pf.val; - std::string_view str; - if ((pf.aux & operands::str) != operands::none) { - auto const strsize = (static_cast(imm) & 0xffU) + 1U; - str = std::string_view{code[static_cast(pc)].str.data(), strsize}; - pc += static_cast((strsize + 3U) >> 2U); - imm = static_cast(static_cast(imm) >> 8U); - } - return std::make_tuple(pf.op, imm, off, str); - } - - [[nodiscard]] static std::ptrdiff_t length(prefix pf) noexcept - { - std::ptrdiff_t len = 1; - len += ((pf.aux & operands::off) != operands::none) ? 1 : 0; - len += ((pf.aux & operands::str) != operands::none) ? static_cast(((static_cast(pf.val) & 0xffU) >> 2U) + 1U) : 0; - return len; - } + opcode op; + std::uint_least8_t immediate8; + std::uint_least16_t immediate16; + std::int_least32_t offset32; + constexpr instruction(opcode o, std::uint_least8_t i8, std::uint_least16_t i16, std::int_least32_t o32) noexcept : op{o}, immediate8{i8}, immediate16{i16}, offset32{o32} {} }; -static_assert(sizeof(unicode::ctype) <= sizeof(immediate), "immediate must be large enough to hold unicode::ctype"); -static_assert(sizeof(unicode::sctype) <= sizeof(immediate), "immediate must be large enough to hold unicode::sctype"); -static_assert(sizeof(instruction) == sizeof(int), "expected instruction to be same size as int"); -static_assert(sizeof(int) <= sizeof(std::ptrdiff_t), "expected int to be no larger than ptrdiff_t"); +static_assert(sizeof(instruction) == sizeof(std::uint_least64_t), "expected instruction size to be same size as 32-bit integer"); +static_assert(alignof(instruction) == alignof(std::uint_least64_t), "expected instruction alignment to be same size as 32-bit integer"); enum class directives : std::uint_least8_t { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; using program_callees = std::vector>; @@ -123,43 +92,59 @@ using program_callees = std::vector instructions; + std::vector data; std::vector runesets; + std::vector predicates; std::vector actions; std::vector captures; - std::vector predicates; directives entry_mode{directives::eps}; void concatenate(program const& src) { + std::size_t const data_offset = data.size(); + std::size_t const runesets_offset = runesets.size(); + std::size_t const predicates_offset = predicates.size(); + std::size_t const actions_offset = actions.size(); + std::size_t const captures_offset = captures.size(); instructions.reserve(detail::checked_add(instructions.size(), src.instructions.size())); - for (auto i = src.instructions.begin(), j = i, e = src.instructions.end(); i != e; i = j) { - instruction instr = *i; - std::size_t val = 0; - switch (instr.pf.op) { - case opcode::match_set: val = detail::push_back_unique(runesets, src.runesets[instr.pf.val]); break; - case opcode::action: val = actions.size(); actions.push_back(src.actions[instr.pf.val]); break; - case opcode::predicate: val = predicates.size(); predicates.push_back(src.predicates[instr.pf.val]); break; - case opcode::capture_end: val = captures.size(); captures.push_back(src.captures[instr.pf.val]); break; - default: val = (std::numeric_limits::max)(); break; - } - if (val != (std::numeric_limits::max)()) { - detail::assure_in_range(val, 0U, (std::numeric_limits::max)()); - instr.pf.val = static_cast(val); + for (auto const& instr : src.instructions) { + instruction new_instr{instr}; + if (new_instr.op < opcode::match) { + std::optional object; + switch (new_instr.op) { + case opcode::match_set: object = instr.immediate16 + runesets_offset; break; + case opcode::predicate: object = instr.immediate16 + predicates_offset; break; + case opcode::action: object = instr.immediate16 + actions_offset; break; + case opcode::capture_end: object = instr.immediate16 + captures_offset; break; + default: break; + } + if (object.has_value()) { + detail::assure_in_range(*object, 0U, (std::numeric_limits::max)()); + new_instr.immediate16 = static_cast(*object); + } + } else { + std::size_t const offset32 = static_cast(new_instr.offset32) + data_offset; + detail::assure_in_range(offset32, 0U, static_cast((std::numeric_limits::max)())); + new_instr.offset32 = static_cast(offset32); } - j = std::next(i, instruction::length(instr.pf)); - instructions.push_back(instr); - instructions.insert(instructions.end(), i + 1, j); + instructions.push_back(new_instr); } + data.insert(data.end(), src.data.begin(), src.data.end()); + runesets.insert(runesets.end(), src.runesets.begin(), src.runesets.end()); + predicates.insert(predicates.end(), src.predicates.begin(), src.predicates.end()); + actions.insert(actions.end(), src.actions.begin(), src.actions.end()); + captures.insert(captures.end(), src.captures.begin(), src.captures.end()); entry_mode = (entry_mode & ~directives::eps) | (entry_mode & src.entry_mode & directives::eps); } void swap(program& p) noexcept { instructions.swap(p.instructions); + data.swap(p.data); runesets.swap(p.runesets); + predicates.swap(p.predicates); actions.swap(p.actions); captures.swap(p.captures); - predicates.swap(p.predicates); std::swap(entry_mode, p.entry_mode); } }; @@ -181,7 +166,7 @@ class rule rule& operator=(rule&& r) noexcept = default; ~rule() = default; void swap(rule& r) noexcept { program_.swap(r.program_); callees_.swap(r.callees_); } - [[nodiscard]] auto operator[](unsigned short precedence) const noexcept; + [[nodiscard]] auto operator[](std::uint_least16_t prec) const noexcept; }; class grammar @@ -388,41 +373,22 @@ class encoder directives entry_mode_{directives::none}; virtual void do_append(instruction instr) = 0; virtual void do_append(program const&) = 0; - [[nodiscard]] virtual immediate do_add_rune_set(unicode::rune_set&& /*r*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual immediate do_add_semantic_action(semantic_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual immediate do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual immediate do_add_syntactic_predicate(syntactic_predicate&& /*p*/) { return immediate{0}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::pair do_add_string(std::string_view /*s*/) { return {std::int_least32_t{0}, std::uint_least16_t{0}}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::uint_least16_t do_add_rune_set(unicode::rune_set&& /*r*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::uint_least16_t do_add_semantic_action(semantic_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::uint_least16_t do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::uint_least16_t do_add_syntactic_predicate(syntactic_predicate&& /*p*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) virtual void do_add_callee(rule const* /*r*/, program const* /*p*/, std::ptrdiff_t /*n*/, directives /*d*/) {} [[nodiscard]] virtual bool do_should_evaluate_length() const noexcept { return true; } [[nodiscard]] virtual std::ptrdiff_t do_length() const noexcept = 0; protected: - encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, unsigned short prec) + encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, std::uint_least16_t prec) { auto callee_mode = mode_.back(); skip(p->entry_mode ^ directives::eps, directives::noskip); do_add_callee(r, p, length(), callee_mode); - return encode(opcode::call, off, immediate{prec}); - } - - encoder& do_match(opcode op, std::string_view sequence) - { - while (sequence.size() > instruction::maxstrlen) { - std::string_view subsequence = sequence.substr(0, instruction::maxstrlen); - while (!subsequence.empty() && !utf8::is_lead(subsequence.back())) - subsequence.remove_suffix(1); - subsequence.remove_suffix(!subsequence.empty() ? 1 : 0); - encode(op, subsequence); - sequence.remove_prefix(subsequence.size()); - } - return encode(op, sequence); - } - - template - encoder& do_match_class(opcode op, T value) - { - constexpr auto penum = immediate{static_cast(unicode::to_property_enum_v>)}; - return encode(op, detail::string_pack(value), penum); + return encode(opcode::call, off, prec, 0); } void do_skip(directives& last_mode) @@ -443,20 +409,26 @@ class encoder template >> [[nodiscard]] std::ptrdiff_t evaluate_length(E const& e, M const& m); encoder& append(instruction instr) { do_append(instr); return *this; } encoder& append(program const& p) { do_append(p); return *this; } - encoder& call(program const& p, unsigned short prec) { return do_call(nullptr, &p, 0, prec); } - encoder& call(grammar const& g, unsigned short prec) { return do_call(nullptr, &g.program(), 3, prec); } - encoder& encode(opcode op, immediate imm = immediate{0}) { return append(instruction{op, operands::none, imm}); } - encoder& encode(opcode op, semantic_action&& a) { return append(instruction{op, operands::none, do_add_semantic_action(std::move(a))}); } - encoder& encode(opcode op, semantic_capture_action&& a) { return append(instruction{op, operands::none, do_add_semantic_capture_action(std::move(a))}); } - encoder& encode(opcode op, syntactic_predicate&& p) { return append(instruction{op, operands::none, do_add_syntactic_predicate(std::move(p))}); } - encoder& encode(opcode op, std::ptrdiff_t off, immediate imm = immediate{0}) { return append(instruction{op, operands::off, imm}).append(instruction{off}); } + encoder& call(program const& p, std::uint_least16_t prec) { return do_call(nullptr, &p, 0, prec); } + encoder& call(grammar const& g, std::uint_least16_t prec) { return do_call(nullptr, &g.program(), 3, prec); } + encoder& encode(opcode op) { return append(instruction{op, 0, 0, 0}); } + encoder& encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } + encoder& encode(opcode op, std::string_view str, std::uint_least8_t imm8 = 0) { auto const range = do_add_string(str); return append(instruction{op, imm8, range.second, range.first}); } + encoder& encode(opcode op, semantic_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_semantic_action(std::move(a)), 0}); } + encoder& encode(opcode op, semantic_capture_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_semantic_capture_action(std::move(a)), 0}); } + encoder& encode(opcode op, syntactic_predicate&& p, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_syntactic_predicate(std::move(p)), 0}); } [[nodiscard]] std::ptrdiff_t length() const noexcept { return do_length(); } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } [[nodiscard]] directives entry_mode() const noexcept { return (entry_mode_ & ~directives::eps) | mode_.back(); } - encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes))); } - encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match); } + encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes)), 0); } + encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match, std::string_view{}); } encoder& match_any() { return skip().encode(opcode::match_any); } - template >> encoder& match_class(T properties) { return skip().do_match_class(Op, properties); } + + encoder& encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) + { + detail::assure_in_range(off, (std::numeric_limits::min)(), (std::numeric_limits::max)()); + return append(instruction{op, imm8, imm16, static_cast(off)}); + } encoder& dpsh(directives enable, directives disable) { @@ -489,7 +461,7 @@ class encoder return *this; } - encoder& call(rule const& r, unsigned short prec, bool allow_inlining = true) + encoder& call(rule const& r, std::uint_least16_t prec, bool allow_inlining = true) { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) @@ -498,7 +470,7 @@ class encoder } template - [[nodiscard]] auto call_with_frame(M const& m, T&& target, unsigned short prec, Args&&... args) -> M const& + [[nodiscard]] auto call_with_frame(M const& m, T&& target, std::uint_least16_t prec, Args&&... args) -> M const& { if constexpr (std::tuple_size_v != 0) encode(opcode::action, semantic_action{[frame = m.attribute_frame](environment& envr) { envr.push_attribute_frame(frame); }}); @@ -508,26 +480,18 @@ class encoder return m; } - encoder& encode(opcode op, std::string_view subsequence, immediate imm = immediate{0}) - { - if (!subsequence.empty()) { - detail::assure_in_range(static_cast(imm), 0U, instruction::maxstrlen - 1); - detail::assure_in_range(subsequence.size(), 1U, instruction::maxstrlen); - do_append(instruction{op, operands::str, static_cast(static_cast((static_cast(imm) << 8U) | static_cast(subsequence.size() - 1)))}); - do { - do_append(instruction{subsequence}); - subsequence.remove_prefix((std::min)(std::size_t{4}, subsequence.size())); - } while (!subsequence.empty()); - } - return *this; - } - encoder& match(std::string_view subject) { skip(!subject.empty() ? directives::eps : directives::none); if ((mode() & directives::caseless) != directives::none) - return do_match(opcode::match_cf, utf8::tocasefold(subject)); - return do_match(opcode::match, subject); + return encode(opcode::match_cf, utf8::tocasefold(subject)); + return encode(opcode::match, subject); + } + + template >> + encoder& match_class(T properties) + { + return skip().encode(Op, detail::string_pack(properties), static_cast(unicode::to_property_enum_v>)); } }; @@ -555,17 +519,27 @@ class program_encoder : public encoder void do_append(instruction instr) final { program_.instructions.push_back(instr); } void do_append(program const& p) final { program_.concatenate(p); } void do_add_callee(rule const* r, program const* p, std::ptrdiff_t n, directives d) final { callees_.emplace_back(r, p, n, d); } - [[nodiscard]] immediate do_add_rune_set(unicode::rune_set&& r) final { return add_item(program_.runesets, std::move(r)); } - [[nodiscard]] immediate do_add_semantic_action(semantic_action&& a) final { return add_item(program_.actions, std::move(a)); } - [[nodiscard]] immediate do_add_semantic_capture_action(semantic_capture_action&& a) final { return add_item(program_.captures, std::move(a)); } - [[nodiscard]] immediate do_add_syntactic_predicate(syntactic_predicate&& p) final { return add_item(program_.predicates, std::move(p)); } + [[nodiscard]] std::uint_least16_t do_add_rune_set(unicode::rune_set&& r) final { return add_item(program_.runesets, std::move(r)); } + [[nodiscard]] std::uint_least16_t do_add_semantic_action(semantic_action&& a) final { return add_item(program_.actions, std::move(a)); } + [[nodiscard]] std::uint_least16_t do_add_semantic_capture_action(semantic_capture_action&& a) final { return add_item(program_.captures, std::move(a)); } + [[nodiscard]] std::uint_least16_t do_add_syntactic_predicate(syntactic_predicate&& p) final { return add_item(program_.predicates, std::move(p)); } + + [[nodiscard]] std::pair do_add_string(std::string_view s) final + { + std::size_t const index = program_.data.size(); + std::size_t const size = s.size(); + detail::assure_in_range(index, 0U, static_cast((std::numeric_limits::max)())); + detail::assure_in_range(size, 0U, (std::numeric_limits::max)()); + program_.data.insert(program_.data.end(), s.begin(), s.end()); + return {static_cast(index), static_cast(size)}; + } template - [[nodiscard]] immediate add_item(std::vector& items, Item&& item) + [[nodiscard]] std::uint_least16_t add_item(std::vector& items, Item&& item) { - detail::assure_in_range(items.size(), 0U, (std::numeric_limits::max)() - 1U); + detail::assure_in_range(items.size(), 0U, (std::numeric_limits::max)() - 1U); items.push_back(std::forward(item)); - return static_cast(items.size() - 1); + return static_cast(items.size() - 1); } public: @@ -671,13 +645,13 @@ class basic_regular_expression : public terminal_encoder_expression_interface encoder.match(std::move(runes)); } else { if (circumflex) - encoder.encode(opcode::choice, 3 + (!runes.empty() ? 1 : 0) + (classes != unicode::ctype::none ? 1 : 0)); + encoder.encode(opcode::choice, 2 + (!runes.empty() ? 1 : 0) + (classes != unicode::ctype::none ? 1 : 0), 0, 0); if (!runes.empty()) encoder.match(std::move(runes)); if (classes != unicode::ctype::none) encoder.match_class(classes); if (circumflex) - encoder.encode(opcode::commit, 0).encode(opcode::fail, immediate{1}).match_any(); + encoder.encode(opcode::commit, 0, 0, 0).encode(opcode::fail, 0, 1).match_any(); } runes.clear(), classes = unicode::ctype::none, circumflex = false; } @@ -798,14 +772,14 @@ inline rule::rule(rule const& r) struct rule_precedence_expression : terminal_encoder_expression_interface { std::reference_wrapper target; - unsigned short precedence; - rule_precedence_expression(rule const& t, unsigned short p) noexcept : target{t}, precedence{p} {} - template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), precedence); } + std::uint_least16_t prec; + rule_precedence_expression(rule const& t, std::uint_least16_t p) noexcept : target{t}, prec{p} {} + template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), prec); } }; -[[nodiscard]] inline auto rule::operator[](unsigned short precedence) const noexcept +[[nodiscard]] inline auto rule::operator[](std::uint_least16_t prec) const noexcept { - return rule_precedence_expression{*this, precedence}; + return rule_precedence_expression{*this, prec}; } template @@ -852,7 +826,7 @@ inline constexpr directive_modifier [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; -struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2).encode(opcode::match_any, immediate{0x8000}).encode(opcode::fail, immediate{2}); return m; } }; +struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2, 0, 0).encode(opcode::match_any, 0x8000, 0).encode(opcode::fail, 0, 2); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::cut); return m; } }; @@ -889,7 +863,7 @@ struct condition_test_combinator { std::string_view name; constexpr explicit condition_test_expression(std::string_view n) noexcept : name{n} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, immediate{Value ? 1 : 0}); return m; } + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, Value ? 1 : 0); return m; } }; [[nodiscard]] constexpr condition_test_expression operator()(std::string_view name) const noexcept { return condition_test_expression{name}; } @@ -907,7 +881,7 @@ struct condition_block_combinator template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - d.encode(opcode::condition_push, name, immediate{Value ? 1 : 0}); + d.encode(opcode::condition_push, name, Value ? 1 : 0); auto m2 = d.evaluate(this->e1, m); d.encode(opcode::condition_pop); return m2; @@ -935,7 +909,7 @@ struct symbol_exists_combinator { std::string_view name; constexpr explicit symbol_exists_expression(std::string_view n) noexcept : name{n} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, immediate{Value ? 1 : 0}); return m; } + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, Value ? 1 : 0); return m; } }; [[nodiscard]] constexpr symbol_exists_expression operator()(std::string_view name) const noexcept { return symbol_exists_expression{name}; } @@ -960,16 +934,15 @@ struct symbol_match_offset_combinator struct symbol_match_offset_expression : terminal_encoder_expression_interface { std::string_view name; - std::size_t offset; - constexpr symbol_match_offset_expression(std::string_view n, std::size_t o) noexcept : name{n}, offset{o} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name, immediate{static_cast(offset)}); return m; } + std::uint_least8_t offset; + constexpr symbol_match_offset_expression(std::string_view n, std::uint_least8_t o) noexcept : name{n}, offset{o} {} + template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name, offset); return m; } }; [[nodiscard]] constexpr symbol_match_offset_expression operator()(std::string_view name, std::size_t offset = 0) const { - if (offset > (std::numeric_limits::max)()) - throw resource_limit_error{}; - return symbol_match_offset_expression{name, offset}; + detail::assure_in_range(offset, 0U, (std::numeric_limits::max)()); + return symbol_match_offset_expression{name, static_cast(offset)}; } }; @@ -981,8 +954,8 @@ struct negative_lookahead_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m)).evaluate(this->e1, m); - d.encode(opcode::fail, immediate{2}); + auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); + d.encode(opcode::fail, 0, 2); return m2; } }; @@ -995,8 +968,8 @@ struct positive_lookahead_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 2 + d.evaluate_length(this->e1, m)).evaluate(this->e1, m); - d.encode(opcode::commit_back, 1).encode(opcode::fail, immediate{1}); + auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); + d.encode(opcode::commit_back, 1, 0, 0).encode(opcode::fail, 0, 1); return m2; } }; @@ -1009,9 +982,9 @@ struct repetition_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - std::ptrdiff_t const n = d.evaluate_length(this->e1, m); - auto m2 = d.encode(opcode::choice, 2 + n).evaluate(this->e1, m); - d.encode(opcode::commit_partial, -(2 + n)); + std::ptrdiff_t const n = 1 + d.evaluate_length(this->e1, m); + auto m2 = d.encode(opcode::choice, n, 0, 0).evaluate(this->e1, m); + d.encode(opcode::commit_partial, -n, 0, 0); return m2; } }; @@ -1024,8 +997,8 @@ struct choice_expression : binary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 2 + d.evaluate_length(this->e1, m)).evaluate(this->e1, m); - return d.encode(opcode::commit, d.evaluate_length(this->e2, m2)).evaluate(this->e2, m2); + auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); + return d.encode(opcode::commit, d.evaluate_length(this->e2, m2), 0, 0).evaluate(this->e2, m2); } }; @@ -1134,7 +1107,7 @@ struct local_block_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - d.skip().encode(opcode::symbol_push, immediate{2}); + d.skip().encode(opcode::symbol_push, 0, 2); auto m2 = d.evaluate(this->e1, m); d.encode(opcode::symbol_pop); return m2; @@ -1150,7 +1123,7 @@ struct local_to_block_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - d.skip().encode(opcode::symbol_push, name, immediate{1}); + d.skip().encode(opcode::symbol_push, name, 1); auto m2 = d.evaluate(this->e1, m); d.encode(opcode::symbol_pop); return m2; @@ -1358,7 +1331,7 @@ class implicit_space_rule std::vector> calls; std::unordered_set left_recursive; std::vector>, program const*>> unprocessed; - program_encoder{grprogram, grcallees, directives::eps | directives::preskip}.call(start_rule, 1, false).encode(opcode::halt); + program_encoder{grprogram, grcallees, directives::eps | directives::preskip}.call(start_rule, std::uint_least16_t{1}, false).encode(opcode::halt); calls.emplace_back(&start_rule.program_, std::get<2>(grcallees.back())); unprocessed.emplace_back(std::vector>{{&start_rule, false}}, &start_rule.program_); do { @@ -1366,7 +1339,7 @@ class implicit_space_rule auto const address = static_cast(grprogram.instructions.size()); if (addresses.emplace(subprogram, address).second) { grprogram.concatenate(*subprogram); - grprogram.instructions.emplace_back(opcode::ret, operands::none, immediate{0}); + grprogram.instructions.emplace_back(opcode::ret, std::uint_least8_t{0}, std::uint_least16_t{0}, std::int_least32_t{0}); if (auto top_rule = callstack.back().first; top_rule) { for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { // NOLINT(performance-for-range-copy) calls.emplace_back(callee_program, address + instr_offset); @@ -1387,13 +1360,15 @@ class implicit_space_rule } } while (!unprocessed.empty()); for (auto [subprogram, instr_addr] : calls) { - if (auto& iprefix = grprogram.instructions[static_cast(instr_addr)]; iprefix.pf.op == opcode::call) - iprefix.pf.val = (left_recursive.count(subprogram) != 0) ? (std::max)(iprefix.pf.val, static_cast(1)) : 0; - auto& ioffset = grprogram.instructions[static_cast(instr_addr + 1)]; - auto const rel_addr = ioffset.off + addresses[subprogram] - (instr_addr + 2); - detail::assure_in_range(rel_addr, std::numeric_limits::lowest(), (std::numeric_limits::max)()); - ioffset.off = static_cast(rel_addr); + if (auto& instr = grprogram.instructions[static_cast(instr_addr)]; instr.op == opcode::call) { + instr.immediate16 = ((left_recursive.count(subprogram) != 0) ? (std::max)(instr.immediate16, std::uint_least16_t{1}) : std::uint_least16_t{0}); + std::ptrdiff_t const call_addr = instr.offset32 + addresses[subprogram] - (instr_addr + 1); + detail::assure_in_range(call_addr, std::numeric_limits::min(), std::numeric_limits::max()); + instr.offset32 = static_cast(call_addr); + } } + if (grprogram.data.empty()) + grprogram.data.emplace_back('\0'); return grammar{std::move(grprogram)}; } @@ -1459,7 +1434,6 @@ class multi_input_source class string_input_source { std::string buffer_; - public: [[nodiscard]] std::string_view buffer() const noexcept { return buffer_; } void drain_buffer(std::size_t sr) { buffer_.erase(0, sr); } @@ -1469,12 +1443,11 @@ class string_input_source class string_view_input_source { std::string_view buffer_; - public: using enqueue_drains = std::true_type; [[nodiscard]] constexpr std::string_view buffer() const noexcept { return buffer_; } constexpr void drain_buffer(std::size_t sr) noexcept { buffer_.remove_prefix(sr); } - template > void enqueue(It first, It last) { buffer_ = std::string_view{&(*first), static_cast(last - first)}; } + template > void enqueue(It first, It last) { buffer_ = (last > first) ? std::string_view{&(*first), static_cast(last - first)} : std::string_view{}; } }; class parser_base @@ -1498,11 +1471,11 @@ class parser_base std::vector responses_; std::vector stack_frames_; std::unordered_map casefolded_subjects_; - lug::registers registers_{0, 0, 0, 0, 0, 0, 0, 0}; + lug::registers registers_{0, 0, 0, 0, 0, 0, 0}; bool parsing_{false}; template - void commit(int off) + void commit(std::ptrdiff_t off) { if (stack_frames_.empty()) throw bad_stack{}; @@ -1547,9 +1520,9 @@ class parser_base return responses_.size(); } - [[nodiscard]] std::ptrdiff_t call_into(unsigned short imm, int off) + [[nodiscard]] std::ptrdiff_t call_into(std::size_t prec, std::ptrdiff_t off) { - if (imm == 0) { + if (prec == 0) { stack_frames_.emplace_back(std::in_place_type, registers_.pc); ++registers_.cd; registers_.pc += off; @@ -1566,13 +1539,13 @@ class parser_base }); if (frame_it != stack_frames_.crend()) { auto const& memo = std::get(*frame_it); - if ((memo.sra == parser_base::lrfailcode) || (imm < memo.prec)) + if ((memo.sra == parser_base::lrfailcode) || (prec < memo.prec)) return 1; registers_.sr = memo.sra; registers_.rc = restore_responses_after(registers_.rc, memo.responses); return 0; } - stack_frames_.emplace_back(std::in_place_type, registers_.sr, parser_base::lrfailcode, imm, registers_.pc, registers_.pc + off, registers_.rc); + stack_frames_.emplace_back(std::in_place_type, registers_.sr, parser_base::lrfailcode, prec, registers_.pc, registers_.pc + off, registers_.rc); ++registers_.cd; ++registers_.ci; registers_.pc += off; @@ -1859,73 +1832,39 @@ class basic_parser : public parser_base throw bad_grammar{}; drain(); registers_.pc = 0; - std::ptrdiff_t fc = 0; - bool result = false, done = false; + instruction const* const instructions{prog.instructions.data()}; + char const* const data{prog.data.data()}; + std::ptrdiff_t fc{0}; + bool result{false}; + bool done{false}; while (!done) { - auto [op, imm, off, str] = instruction::decode(prog.instructions, registers_.pc); - switch (op) { - case opcode::match: { - fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::compare)); - } break; - case opcode::match_cf: { - fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::casefold_compare)); - } break; - case opcode::match_any: { - if constexpr (detail::input_source_has_options::value) { - if (((imm & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) { - fc = 1; - break; - } - } - fc = match_single(registers_.sr, []{ return true; }); - } break; - case opcode::match_any_of: { - fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::any_of(r, pe, s); }); - } break; - case opcode::match_all_of: { - fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::all_of(r, pe, s); }); - } break; - case opcode::match_none_of: { - fc = match_single(registers_.sr, [pe = static_cast(imm), s = str](auto const& r) { return unicode::none_of(r, pe, s); }); - } break; - case opcode::match_set: { - fc = match_single(registers_.sr, [&runes = prog.runesets[imm]](char32_t rune) { - auto const interval = std::lower_bound(runes.begin(), runes.end(), rune, [](auto& x, auto& y) { return x.second < y; }); - return (interval != runes.end()) && (interval->first <= rune) && (rune <= interval->second); }); - } break; - case opcode::match_eol: { - fc = match_single(registers_.sr, [](auto curr, auto last, auto& next, char32_t rune) { - if ((curr == next) || ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None)) - return false; - if (U'\r' == rune) - if (auto const [next2, rune2] = utf8::decode_rune(next, last); (next2 != next) && (rune2 == U'\n')) - next = next2; - return true; }); - } break; + instruction const instr{instructions[registers_.pc++]}; + std::string_view const str{data + instr.offset32, instr.immediate16}; + switch (instr.op) { case opcode::choice: { - stack_frames_.emplace_back(std::in_place_type, registers_.sr - imm, registers_.rc, registers_.pc + off); + stack_frames_.emplace_back(std::in_place_type, registers_.sr - instr.immediate8, registers_.rc, registers_.pc + instr.offset32); } break; case opcode::commit: { - commit(off); + commit(instr.offset32); } break; case opcode::commit_back: { - commit(off); + commit(instr.offset32); } break; case opcode::commit_partial: { - commit(off); + commit(instr.offset32); } break; case opcode::jump: { - registers_.pc += off; + registers_.pc += instr.offset32; } break; case opcode::call: { - fc = call_into(imm, off); + fc = call_into(instr.immediate16, instr.offset32); } break; case opcode::ret: { if (return_from()) accept_if_deferred(); } break; case opcode::fail: { - fc = static_cast(imm); + fc = static_cast(instr.immediate8); } break; case opcode::cut: { if (registers_.ci == 0) { @@ -1939,16 +1878,16 @@ class basic_parser : public parser_base accept(); result = done = true; } break; - case opcode::action: { - registers_.rc = push_response(registers_.cd, imm); - } break; case opcode::predicate: { registers_.mr = (std::max)(registers_.mr, registers_.sr); environment_->reset_match_and_subject(match(), subject()); - bool const accepted = prog.predicates[imm](*environment_); + bool const accepted = prog.predicates[instr.immediate16](*environment_); pop_responses_after(registers_.rc); fc = accepted ? 0 : 1; } break; + case opcode::action: { + registers_.rc = push_response(registers_.cd, instr.immediate16); + } break; case opcode::capture_start: { stack_frames_.emplace_back(std::in_place_type, registers_.sr); ++registers_.ci; @@ -1964,13 +1903,51 @@ class basic_parser : public parser_base fc = 1; break; } - registers_.rc = push_response(registers_.cd, imm, syntax_range{sr0, sr1 - sr0}); + registers_.rc = push_response(registers_.cd, instr.immediate16, syntax_range{sr0, sr1 - sr0}); + } break; + case opcode::match: { + fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::compare)); + } break; + case opcode::match_cf: { + fc = match_sequence(registers_.sr, str, std::mem_fn(&basic_parser::casefold_compare)); + } break; + case opcode::match_any: { + if constexpr (detail::input_source_has_options::value) { + if (((instr.immediate16 & 0x8000U) != 0) && ((input_source_.options() & source_options::interactive) != source_options::none)) { + fc = 1; + break; + } + } + fc = match_single(registers_.sr, []{ return true; }); + } break; + case opcode::match_any_of: { + fc = match_single(registers_.sr, [pe = static_cast(instr.immediate8), s = str](auto const& r) { return unicode::any_of(r, pe, s); }); + } break; + case opcode::match_all_of: { + fc = match_single(registers_.sr, [pe = static_cast(instr.immediate8), s = str](auto const& r) { return unicode::all_of(r, pe, s); }); + } break; + case opcode::match_none_of: { + fc = match_single(registers_.sr, [pe = static_cast(instr.immediate8), s = str](auto const& r) { return unicode::none_of(r, pe, s); }); + } break; + case opcode::match_set: { + fc = match_single(registers_.sr, [&runes = prog.runesets[instr.immediate16]](char32_t rune) { + auto const interval = std::lower_bound(runes.begin(), runes.end(), rune, [](auto& x, auto& y) { return x.second < y; }); + return (interval != runes.end()) && (interval->first <= rune) && (rune <= interval->second); }); + } break; + case opcode::match_eol: { + fc = match_single(registers_.sr, [](auto curr, auto last, auto& next, char32_t rune) { + if ((curr == next) || ((unicode::query(rune).properties() & unicode::ptype::Line_Ending) == unicode::ptype::None)) + return false; + if (U'\r' == rune) + if (auto const [next2, rune2] = utf8::decode_rune(next, last); (next2 != next) && (rune2 == U'\n')) + next = next2; + return true; }); } break; case opcode::condition_test: { - fc = (environment_->has_condition(str) == (imm != 0)) ? 0 : 1; + fc = (environment_->has_condition(str) == (instr.immediate8 != 0)) ? 0 : 1; } break; case opcode::condition_push: { - stack_frames_.emplace_back(std::in_place_type, str, environment_->set_condition(str, imm != 0)); + stack_frames_.emplace_back(std::in_place_type, str, environment_->set_condition(str, instr.immediate8 != 0)); } break; case opcode::condition_pop: { if (stack_frames_.empty()) @@ -1980,7 +1957,7 @@ class basic_parser : public parser_base stack_frames_.pop_back(); } break; case opcode::symbol_exists: { - fc = (environment_->has_symbol(str) == (imm != 0)) ? 0 : 1; + fc = (environment_->has_symbol(str) == (instr.immediate8 != 0)) ? 0 : 1; } break; case opcode::symbol_all: { fc = match_symbol_all(registers_.sr, str, detail::identity{}, std::mem_fn(&basic_parser::compare)); @@ -1995,16 +1972,16 @@ class basic_parser : public parser_base fc = match_symbol_any(registers_.sr, str, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_head: { - fc = match_symbol_head(registers_.sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_head(registers_.sr, str, instr.immediate8, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_head_cf: { - fc = match_symbol_head(registers_.sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_head(registers_.sr, str, instr.immediate8, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_tail: { - fc = match_symbol_tail(registers_.sr, str, imm, detail::identity{}, std::mem_fn(&basic_parser::compare)); + fc = match_symbol_tail(registers_.sr, str, instr.immediate8, detail::identity{}, std::mem_fn(&basic_parser::compare)); } break; case opcode::symbol_tail_cf: { - fc = match_symbol_tail(registers_.sr, str, imm, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); + fc = match_symbol_tail(registers_.sr, str, instr.immediate8, utf8::tocasefold, std::mem_fn(&basic_parser::casefold_compare)); } break; case opcode::symbol_start: { stack_frames_.emplace_back(std::in_place_type, str, registers_.sr); @@ -2024,9 +2001,9 @@ class basic_parser : public parser_base } break; case opcode::symbol_push: { stack_frames_.emplace_back(std::in_place_type, environment_->symbols_); - if (imm == 1) + if (instr.immediate8 == 1) environment_->symbols_.erase(str); - else if (imm == 2) + else if (instr.immediate8 == 2) environment_->symbols_.clear(); } break; case opcode::symbol_pop: { From 8c711594016379431e2c5479d62f34b456fbe76d Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Tue, 6 Aug 2024 18:40:16 -0700 Subject: [PATCH 09/14] Code cleanup --- lug/lug.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 2a7d179..272896f 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -373,7 +373,7 @@ class encoder directives entry_mode_{directives::none}; virtual void do_append(instruction instr) = 0; virtual void do_append(program const&) = 0; - [[nodiscard]] virtual std::pair do_add_string(std::string_view /*s*/) { return {std::int_least32_t{0}, std::uint_least16_t{0}}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) + [[nodiscard]] virtual std::pair do_add_string(std::string_view /*s*/) { return {}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) [[nodiscard]] virtual std::uint_least16_t do_add_rune_set(unicode::rune_set&& /*r*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) [[nodiscard]] virtual std::uint_least16_t do_add_semantic_action(semantic_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) [[nodiscard]] virtual std::uint_least16_t do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) @@ -1367,8 +1367,7 @@ class implicit_space_rule instr.offset32 = static_cast(call_addr); } } - if (grprogram.data.empty()) - grprogram.data.emplace_back('\0'); + grprogram.data.emplace_back('\0'); return grammar{std::move(grprogram)}; } From 5b24fa78cb09913d4adad2924e906f771f2993d7 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 7 Aug 2024 03:52:52 -0700 Subject: [PATCH 10/14] Merged program_encoder and rule_encoder into the encoder class after overhauling how choice/jump offsets are calculated. This essentially fixes #21, since the encoder class is no longer an abstract base class with virtual member functions. --- lug/detail.hpp | 17 ++- lug/lug.hpp | 289 +++++++++++++++++++------------------------------ 2 files changed, 126 insertions(+), 180 deletions(-) diff --git a/lug/detail.hpp b/lug/detail.hpp index 9182402..d78e2f9 100644 --- a/lug/detail.hpp +++ b/lug/detail.hpp @@ -292,14 +292,27 @@ class scope_exit template >> scope_exit(Fn) -> scope_exit>; -template +template && std::is_integral_v && std::is_integral_v>> constexpr void assure_in_range(T x, U minval, V maxval) { if (!((minval <= x) && (x <= maxval))) throw Error(); } -template +template && std::is_integral_v && std::is_integral_v && std::is_integral_v>> +[[nodiscard]] constexpr T checked_cast(S x, U minval, V maxval) +{ + detail::assure_in_range(x, minval, maxval); + return static_cast(x); +} + +template && std::is_integral_v>> +[[nodiscard]] constexpr T checked_cast(S x) +{ + return detail::checked_cast(x, (std::numeric_limits>::min)(), (std::numeric_limits>::max)()); +} + +template && std::is_integral_v>> [[nodiscard]] constexpr auto checked_add(T x, U y) { if (((std::numeric_limits::max)() - x) < y) diff --git a/lug/lug.hpp b/lug/lug.hpp index 272896f..40441b6 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -23,7 +23,6 @@ namespace lug { class rule; class grammar; class encoder; -class rule_encoder; class syntax; class environment; class multi_input_source; @@ -68,7 +67,7 @@ enum class opcode : std::uint_least8_t call, ret, fail, cut, halt, predicate, action, capture_start, capture_end, condition_pop, symbol_end, symbol_pop, match_any, match_set, match_eol, - match = 32, match_cf, match_any_of, match_all_of, match_none_of, + match, match_cf, match_any_of, match_all_of, match_none_of, condition_test, condition_push, symbol_exists, symbol_all, symbol_all_cf, symbol_any, symbol_any_cf, symbol_head, symbol_head_cf, symbol_tail, symbol_tail_cf, symbol_start, symbol_push @@ -86,6 +85,7 @@ struct alignas(std::uint_least64_t) instruction static_assert(sizeof(instruction) == sizeof(std::uint_least64_t), "expected instruction size to be same size as 32-bit integer"); static_assert(alignof(instruction) == alignof(std::uint_least64_t), "expected instruction alignment to be same size as 32-bit integer"); +enum class instruction_address : std::size_t {}; enum class directives : std::uint_least8_t { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; using program_callees = std::vector>; @@ -118,14 +118,11 @@ struct program case opcode::capture_end: object = instr.immediate16 + captures_offset; break; default: break; } - if (object.has_value()) { - detail::assure_in_range(*object, 0U, (std::numeric_limits::max)()); - new_instr.immediate16 = static_cast(*object); - } + if (object.has_value()) + new_instr.immediate16 = detail::checked_cast(*object, 0U, (std::numeric_limits::max)()); } else { std::size_t const offset32 = static_cast(new_instr.offset32) + data_offset; - detail::assure_in_range(offset32, 0U, static_cast((std::numeric_limits::max)())); - new_instr.offset32 = static_cast(offset32); + new_instr.offset32 = detail::checked_cast(offset32, 0U, static_cast((std::numeric_limits::max)())); } instructions.push_back(new_instr); } @@ -152,7 +149,6 @@ struct program class rule { friend class encoder; - friend class rule_encoder; friend grammar start(rule const& start_rule); program program_; program_callees callees_; @@ -369,25 +365,34 @@ template encoder_metadata(Frame&&) -> encoder_metadata mode_; directives entry_mode_{directives::none}; - virtual void do_append(instruction instr) = 0; - virtual void do_append(program const&) = 0; - [[nodiscard]] virtual std::pair do_add_string(std::string_view /*s*/) { return {}; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual std::uint_least16_t do_add_rune_set(unicode::rune_set&& /*r*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual std::uint_least16_t do_add_semantic_action(semantic_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual std::uint_least16_t do_add_semantic_capture_action(semantic_capture_action&& /*a*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - [[nodiscard]] virtual std::uint_least16_t do_add_syntactic_predicate(syntactic_predicate&& /*p*/) { return 0; } // NOLINT(cppcoreguidelines-rvalue-reference-param-not-moved) - virtual void do_add_callee(rule const* /*r*/, program const* /*p*/, std::ptrdiff_t /*n*/, directives /*d*/) {} - [[nodiscard]] virtual bool do_should_evaluate_length() const noexcept { return true; } - [[nodiscard]] virtual std::ptrdiff_t do_length() const noexcept = 0; -protected: - encoder& do_call(rule const* r, program const* p, std::ptrdiff_t off, std::uint_least16_t prec) + template + [[nodiscard]] std::uint_least16_t add_item(std::vector& items, Item&& item) + { + items.push_back(std::forward(item)); + return detail::checked_cast(items.size() - 1); + } + + [[nodiscard]] std::pair add_string(std::string_view str) + { + std::size_t const index = program_->data.size(); + program_->data.insert(program_->data.end(), str.begin(), str.end()); + return { + detail::checked_cast(index, 0U, static_cast((std::numeric_limits::max)())), + detail::checked_cast(str.size(), 0U, (std::numeric_limits::max)()) + }; + } + + instruction_address do_call(rule const* r, program const* p, std::ptrdiff_t off, std::uint_least16_t prec) { auto callee_mode = mode_.back(); skip(p->entry_mode ^ directives::eps, directives::noskip); - do_add_callee(r, p, length(), callee_mode); + callees_->emplace_back(r, p, static_cast(program_->instructions.size()), callee_mode); return encode(opcode::call, off, prec, 0); } @@ -399,69 +404,32 @@ class encoder } public: - explicit encoder(directives initial) : mode_{initial} {} - virtual ~encoder() = default; - encoder(encoder const&) = delete; - encoder(encoder&&) = delete; - encoder& operator=(encoder const&) = delete; - encoder& operator=(encoder&&) = delete; - template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); - template >> [[nodiscard]] std::ptrdiff_t evaluate_length(E const& e, M const& m); - encoder& append(instruction instr) { do_append(instr); return *this; } - encoder& append(program const& p) { do_append(p); return *this; } - encoder& call(program const& p, std::uint_least16_t prec) { return do_call(nullptr, &p, 0, prec); } - encoder& call(grammar const& g, std::uint_least16_t prec) { return do_call(nullptr, &g.program(), 3, prec); } - encoder& encode(opcode op) { return append(instruction{op, 0, 0, 0}); } - encoder& encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } - encoder& encode(opcode op, std::string_view str, std::uint_least8_t imm8 = 0) { auto const range = do_add_string(str); return append(instruction{op, imm8, range.second, range.first}); } - encoder& encode(opcode op, semantic_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_semantic_action(std::move(a)), 0}); } - encoder& encode(opcode op, semantic_capture_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_semantic_capture_action(std::move(a)), 0}); } - encoder& encode(opcode op, syntactic_predicate&& p, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, do_add_syntactic_predicate(std::move(p)), 0}); } - [[nodiscard]] std::ptrdiff_t length() const noexcept { return do_length(); } + explicit encoder(program& p, program_callees& c, directives initial) : program_{&p}, callees_{&c}, mode_{initial} {} + explicit encoder(rule& r) : rule_{&r}, program_{&r.program_}, callees_{&r.callees_}, mode_{directives::eps} { rule_->currently_encoding_ = true; } + ~encoder() { if (rule_) { rule_->currently_encoding_ = false; } if (program_) { program_->entry_mode = entry_mode(); } } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } [[nodiscard]] directives entry_mode() const noexcept { return (entry_mode_ & ~directives::eps) | mode_.back(); } - encoder& match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, do_add_rune_set(std::move(runes)), 0); } - encoder& match_eps() { return skip(directives::lexeme).encode(opcode::match, std::string_view{}); } - encoder& match_any() { return skip().encode(opcode::match_any); } - - encoder& encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) - { - detail::assure_in_range(off, (std::numeric_limits::min)(), (std::numeric_limits::max)()); - return append(instruction{op, imm8, imm16, static_cast(off)}); - } - - encoder& dpsh(directives enable, directives disable) - { - directives const prev = mode_.back(); - mode_.push_back((prev & ~disable) | enable); - return *this; - } - - encoder& dpop(directives relay) - { - directives const prev = detail::pop_back(mode_); - directives& last_mode = mode_.back(); - directives const next = (last_mode & ~relay) | (prev & relay); - if (((next & directives::postskip) == directives::none) && ((prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip)) - do_skip(last_mode); - mode_.back() = next; - return *this; - } - - encoder& skip(directives callee_mode = directives::eps, directives callee_skip = directives::lexeme) - { - directives& last_mode = mode_.back(); - directives const prev = last_mode; - directives const next = last_mode & ~(callee_mode & directives::eps); - if (entry_mode_ == directives::none) - entry_mode_ = (prev & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; - if ((((prev | callee_mode)) & (callee_skip | directives::preskip)) == directives::preskip) - do_skip(last_mode); - mode_.back() = next; - return *this; - } - - encoder& call(rule const& r, std::uint_least16_t prec, bool allow_inlining = true) + [[nodiscard]] instruction_address here() const noexcept { return static_cast(program_->instructions.size()); } + instruction& instruction_at(instruction_address addr) { return program_->instructions[static_cast(addr)]; } + void jump_to_target(instruction_address addr, instruction_address target) { instruction_at(addr).offset32 = detail::checked_cast(static_cast(target) - static_cast(addr) - 1); } + void jump_to_here(instruction_address addr) { jump_to_target(addr, here()); } + template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); + instruction_address append(instruction instr) { instruction_address const result{program_->instructions.size()}; program_->instructions.push_back(instr); return result; } + instruction_address append(program const& p) { instruction_address const result{program_->instructions.size()}; program_->concatenate(p); return result; } + instruction_address encode(opcode op) { return append(instruction{op, 0, 0, 0}); } + instruction_address encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) { return append(instruction{op, imm8, imm16, detail::checked_cast(off)}); } + instruction_address encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } + instruction_address encode(opcode op, std::string_view str, std::uint_least8_t imm8 = 0) { auto const rng = add_string(str); return append(instruction{op, imm8, rng.second, rng.first}); } + instruction_address encode(opcode op, semantic_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->actions, std::move(a)), 0}); } + instruction_address encode(opcode op, semantic_capture_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->captures, std::move(a)), 0}); } + instruction_address encode(opcode op, syntactic_predicate&& p, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->predicates, std::move(p)), 0}); } + instruction_address call(program const& p, std::uint_least16_t prec) { return do_call(nullptr, &p, 0, prec); } + instruction_address call(grammar const& g, std::uint_least16_t prec) { return do_call(nullptr, &g.program(), 3, prec); } + instruction_address match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, add_item(program_->runesets, std::move(runes)), 0); } + instruction_address match_any() { return skip().encode(opcode::match_any); } + instruction_address match_eps() { return skip(directives::lexeme).encode(opcode::match, std::string_view{}); } + + instruction_address call(rule const& r, std::uint_least16_t prec, bool allow_inlining = true) { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) @@ -480,7 +448,7 @@ class encoder return m; } - encoder& match(std::string_view subject) + instruction_address match(std::string_view subject) { skip(!subject.empty() ? directives::eps : directives::none); if ((mode() & directives::caseless) != directives::none) @@ -489,78 +457,39 @@ class encoder } template >> - encoder& match_class(T properties) + instruction_address match_class(T properties) { return skip().encode(Op, detail::string_pack(properties), static_cast(unicode::to_property_enum_v>)); } -}; - -class instruction_length_evaluator final : public encoder -{ - std::ptrdiff_t length_{0}; - void do_append(instruction instr) final { std::ignore = instr; length_ = detail::checked_add(length_, std::ptrdiff_t{1}); } - void do_append(program const& p) final { length_ = detail::checked_add(length_, static_cast(p.instructions.size())); } - [[nodiscard]] bool do_should_evaluate_length() const noexcept final { return false; } - [[nodiscard]] std::ptrdiff_t do_length() const noexcept final { return length_; } -public: - explicit instruction_length_evaluator(directives initial) : encoder{initial} {} - ~instruction_length_evaluator() final = default; - instruction_length_evaluator(instruction_length_evaluator const&) = delete; - instruction_length_evaluator(instruction_length_evaluator&&) = delete; - instruction_length_evaluator& operator=(instruction_length_evaluator const&) = delete; - instruction_length_evaluator& operator=(instruction_length_evaluator&&) = delete; -}; - -class program_encoder : public encoder -{ - program& program_; - program_callees& callees_; - [[nodiscard]] std::ptrdiff_t do_length() const noexcept final { return static_cast(program_.instructions.size()); } - void do_append(instruction instr) final { program_.instructions.push_back(instr); } - void do_append(program const& p) final { program_.concatenate(p); } - void do_add_callee(rule const* r, program const* p, std::ptrdiff_t n, directives d) final { callees_.emplace_back(r, p, n, d); } - [[nodiscard]] std::uint_least16_t do_add_rune_set(unicode::rune_set&& r) final { return add_item(program_.runesets, std::move(r)); } - [[nodiscard]] std::uint_least16_t do_add_semantic_action(semantic_action&& a) final { return add_item(program_.actions, std::move(a)); } - [[nodiscard]] std::uint_least16_t do_add_semantic_capture_action(semantic_capture_action&& a) final { return add_item(program_.captures, std::move(a)); } - [[nodiscard]] std::uint_least16_t do_add_syntactic_predicate(syntactic_predicate&& p) final { return add_item(program_.predicates, std::move(p)); } - [[nodiscard]] std::pair do_add_string(std::string_view s) final + void dpsh(directives enable, directives disable) { - std::size_t const index = program_.data.size(); - std::size_t const size = s.size(); - detail::assure_in_range(index, 0U, static_cast((std::numeric_limits::max)())); - detail::assure_in_range(size, 0U, (std::numeric_limits::max)()); - program_.data.insert(program_.data.end(), s.begin(), s.end()); - return {static_cast(index), static_cast(size)}; + directives const prev = mode_.back(); + mode_.push_back((prev & ~disable) | enable); } - template - [[nodiscard]] std::uint_least16_t add_item(std::vector& items, Item&& item) + void dpop(directives relay) { - detail::assure_in_range(items.size(), 0U, (std::numeric_limits::max)() - 1U); - items.push_back(std::forward(item)); - return static_cast(items.size() - 1); + directives const prev = detail::pop_back(mode_); + directives& last_mode = mode_.back(); + directives const next = (last_mode & ~relay) | (prev & relay); + if (((next & directives::postskip) == directives::none) && ((prev & (directives::lexeme | directives::noskip | directives::postskip)) == directives::postskip)) + do_skip(last_mode); + mode_.back() = next; } -public: - program_encoder(program& p, program_callees& c, directives initial) : encoder{initial}, program_{p}, callees_{c} {} - ~program_encoder() override { program_.entry_mode = entry_mode(); } - program_encoder(program_encoder const&) = delete; - program_encoder(program_encoder&&) = delete; - program_encoder& operator=(program_encoder const&) = delete; - program_encoder& operator=(program_encoder&&) = delete; -}; - -class rule_encoder final : public program_encoder -{ - rule& rule_; -public: - explicit rule_encoder(rule& r) : program_encoder{r.program_, r.callees_, directives::eps}, rule_{r} { rule_.currently_encoding_ = true; } - ~rule_encoder() final { rule_.currently_encoding_ = false; } - rule_encoder(rule_encoder const&) = delete; - rule_encoder(rule_encoder&&) = delete; - rule_encoder& operator=(rule_encoder const&) = delete; - rule_encoder& operator=(rule_encoder&&) = delete; + encoder& skip(directives callee_mode = directives::eps, directives callee_skip = directives::lexeme) + { + directives& last_mode = mode_.back(); + directives const prev = last_mode; + directives const next = last_mode & ~(callee_mode & directives::eps); + if (entry_mode_ == directives::none) + entry_mode_ = (prev & (directives::caseless | directives::lexeme | directives::noskip)) | directives::eps; + if ((((prev | callee_mode)) & (callee_skip | directives::preskip)) == directives::preskip) + do_skip(last_mode); + mode_.back() = next; + return *this; + } }; template @@ -606,8 +535,8 @@ class basic_regular_expression : public terminal_encoder_expression_interface struct generator final : environment { basic_regular_expression const& owner; - program_callees callees; - program_encoder encoder; + lug::program_callees callees; + lug::encoder encoder; unicode::rune_set runes; unicode::ctype classes{unicode::ctype::none}; bool circumflex{false}; @@ -650,8 +579,11 @@ class basic_regular_expression : public terminal_encoder_expression_interface encoder.match(std::move(runes)); if (classes != unicode::ctype::none) encoder.match_class(classes); - if (circumflex) - encoder.encode(opcode::commit, 0, 0, 0).encode(opcode::fail, 0, 1).match_any(); + if (circumflex) { + encoder.encode(opcode::commit); + encoder.encode(opcode::fail, 0, 1); + encoder.match_any(); + } } runes.clear(), classes = unicode::ctype::none, circumflex = false; } @@ -747,26 +679,15 @@ template return make_expression(e)(*this, m); } -template -[[nodiscard]] inline std::ptrdiff_t encoder::evaluate_length(E const& e, M const& m) -{ - if (do_should_evaluate_length()) { - instruction_length_evaluator evaluator{mode()}; - (void)evaluator.evaluate(e, m); - return evaluator.length(); - } - return 0; -} - template inline rule::rule(E const& e) { - (void)rule_encoder{*this}.evaluate(make_expression(e), encoder_metadata{}); + (void)encoder{*this}.evaluate(make_expression(e), encoder_metadata{}); } inline rule::rule(rule const& r) { - rule_encoder{*this}.call(r, 1); + encoder{*this}.call(r, 1); } struct rule_precedence_expression : terminal_encoder_expression_interface @@ -826,7 +747,7 @@ inline constexpr directive_modifier [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; -struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2, 0, 0).encode(opcode::match_any, 0x8000, 0).encode(opcode::fail, 0, 2); return m; } }; +struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2, 0, 0); d.encode(opcode::match_any, 0x8000, 0); d.encode(opcode::fail, 0, 2); return m; } }; struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::cut); return m; } }; @@ -941,8 +862,7 @@ struct symbol_match_offset_combinator [[nodiscard]] constexpr symbol_match_offset_expression operator()(std::string_view name, std::size_t offset = 0) const { - detail::assure_in_range(offset, 0U, (std::numeric_limits::max)()); - return symbol_match_offset_expression{name, static_cast(offset)}; + return symbol_match_offset_expression{name, detail::checked_cast(offset)}; } }; @@ -954,8 +874,10 @@ struct negative_lookahead_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); + auto const choice = d.encode(opcode::choice); + auto m2 = d.evaluate(this->e1, m); d.encode(opcode::fail, 0, 2); + d.jump_to_here(choice); return m2; } }; @@ -968,8 +890,11 @@ struct positive_lookahead_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); - d.encode(opcode::commit_back, 1, 0, 0).encode(opcode::fail, 0, 1); + auto const choice = d.encode(opcode::choice); + auto m2 = d.evaluate(this->e1, m); + d.encode(opcode::commit_back, 1, 0, 0); + d.jump_to_here(choice); + d.encode(opcode::fail, 0, 1); return m2; } }; @@ -982,9 +907,12 @@ struct repetition_expression : unary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - std::ptrdiff_t const n = 1 + d.evaluate_length(this->e1, m); - auto m2 = d.encode(opcode::choice, n, 0, 0).evaluate(this->e1, m); - d.encode(opcode::commit_partial, -n, 0, 0); + auto const choice = d.encode(opcode::choice); + auto const expression = d.here(); + auto m2 = d.evaluate(this->e1, m); + auto const commit = d.encode(opcode::commit_partial); + d.jump_to_here(choice); + d.jump_to_target(commit, expression); return m2; } }; @@ -997,8 +925,13 @@ struct choice_expression : binary_encoder_expression_interface template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { - auto m2 = d.encode(opcode::choice, 1 + d.evaluate_length(this->e1, m), 0, 0).evaluate(this->e1, m); - return d.encode(opcode::commit, d.evaluate_length(this->e2, m2), 0, 0).evaluate(this->e2, m2); + auto const choice = d.encode(opcode::choice); + auto m2 = d.evaluate(this->e1, m); + auto const commit = d.encode(opcode::commit); + d.jump_to_here(choice); + auto m3 = d.evaluate(this->e2, m2); + d.jump_to_here(commit); + return m3; } }; @@ -1331,7 +1264,9 @@ class implicit_space_rule std::vector> calls; std::unordered_set left_recursive; std::vector>, program const*>> unprocessed; - program_encoder{grprogram, grcallees, directives::eps | directives::preskip}.call(start_rule, std::uint_least16_t{1}, false).encode(opcode::halt); + encoder grammar_encoder{grprogram, grcallees, directives::eps | directives::preskip}; // TODO: optimize away + grammar_encoder.call(start_rule, std::uint_least16_t{1}, false); + grammar_encoder.encode(opcode::halt); calls.emplace_back(&start_rule.program_, std::get<2>(grcallees.back())); unprocessed.emplace_back(std::vector>{{&start_rule, false}}, &start_rule.program_); do { @@ -1362,9 +1297,7 @@ class implicit_space_rule for (auto [subprogram, instr_addr] : calls) { if (auto& instr = grprogram.instructions[static_cast(instr_addr)]; instr.op == opcode::call) { instr.immediate16 = ((left_recursive.count(subprogram) != 0) ? (std::max)(instr.immediate16, std::uint_least16_t{1}) : std::uint_least16_t{0}); - std::ptrdiff_t const call_addr = instr.offset32 + addresses[subprogram] - (instr_addr + 1); - detail::assure_in_range(call_addr, std::numeric_limits::min(), std::numeric_limits::max()); - instr.offset32 = static_cast(call_addr); + instr.offset32 = detail::checked_cast(instr.offset32 + addresses[subprogram] - (instr_addr + 1)); } } grprogram.data.emplace_back('\0'); From 649c57a87145139d266ab45bd9e3b28a1a62a481 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 7 Aug 2024 04:00:53 -0700 Subject: [PATCH 11/14] Updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a229d65..d151687 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Release v0.4.0 (Under Development) +* Implemented new 64-bit instruction encoding scheme that is simpler and more efficient. String data is no longer embedded in the instruction stream, ensuring constant instruction length of 8 bytes. +* Merged `lug::program_encoder` and `lug::rule_encoder` into the `lug::encoder` base class after overhauling how choice/jump offsets are calculated, significantly reducing binary size bloat. * Extracted common base class of `lug::basic_parser` into `lug::parser_base` to reduce template bloat. * Reduced template bloat for parser directives (improving compiler error messages) and optimized nested directives. * Merged parser stack frames together into a single stack using `std::variant`. From afef044be395fb52b9708a75643bfa174467077c Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 7 Aug 2024 04:29:18 -0700 Subject: [PATCH 12/14] Cleanup lug::start implementation after changes to the encoder class --- lug/lug.hpp | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 40441b6..52503c7 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -414,8 +414,8 @@ class encoder void jump_to_target(instruction_address addr, instruction_address target) { instruction_at(addr).offset32 = detail::checked_cast(static_cast(target) - static_cast(addr) - 1); } void jump_to_here(instruction_address addr) { jump_to_target(addr, here()); } template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); - instruction_address append(instruction instr) { instruction_address const result{program_->instructions.size()}; program_->instructions.push_back(instr); return result; } - instruction_address append(program const& p) { instruction_address const result{program_->instructions.size()}; program_->concatenate(p); return result; } + instruction_address append(instruction instr) { instruction_address const addr{here()}; program_->instructions.push_back(instr); return addr; } + instruction_address append(program const& p) { instruction_address const addr{here()}; program_->concatenate(p); return addr; } instruction_address encode(opcode op) { return append(instruction{op, 0, 0, 0}); } instruction_address encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) { return append(instruction{op, imm8, imm16, detail::checked_cast(off)}); } instruction_address encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } @@ -1260,23 +1260,20 @@ class implicit_space_rule { program grprogram; program_callees grcallees; - std::unordered_map addresses; - std::vector> calls; + encoder grencoder{grprogram, grcallees, directives::eps | directives::preskip}; + grencoder.call(start_rule, std::uint_least16_t{1}, false); + grencoder.encode(opcode::halt); std::unordered_set left_recursive; - std::vector>, program const*>> unprocessed; - encoder grammar_encoder{grprogram, grcallees, directives::eps | directives::preskip}; // TODO: optimize away - grammar_encoder.call(start_rule, std::uint_least16_t{1}, false); - grammar_encoder.encode(opcode::halt); - calls.emplace_back(&start_rule.program_, std::get<2>(grcallees.back())); - unprocessed.emplace_back(std::vector>{{&start_rule, false}}, &start_rule.program_); + std::unordered_map addresses; + std::vector> calls{{&start_rule.program_, std::get<2>(grcallees.back())}}; + std::vector>, program const*>> unprocessed{{std::vector>{{&start_rule, false}}, &start_rule.program_}}; do { - auto [callstack, subprogram] = detail::pop_back(unprocessed); - auto const address = static_cast(grprogram.instructions.size()); - if (addresses.emplace(subprogram, address).second) { - grprogram.concatenate(*subprogram); - grprogram.instructions.emplace_back(opcode::ret, std::uint_least8_t{0}, std::uint_least16_t{0}, std::int_least32_t{0}); - if (auto top_rule = callstack.back().first; top_rule) { - for (auto [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { // NOLINT(performance-for-range-copy) + auto const&& [callstack, subprogram] = detail::pop_back(unprocessed); + if (auto const address = static_cast(grencoder.here()); addresses.emplace(subprogram, address).second) { + grencoder.append(*subprogram); + grencoder.encode(opcode::ret); + if (auto const top_rule = callstack.back().first; top_rule) { + for (auto&& [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { calls.emplace_back(callee_program, address + instr_offset); if ((callee_rule != nullptr) && ((mode & directives::eps) != directives::none) && detail::escaping_find_if(callstack.crbegin(), callstack.crend(), [callee = callee_rule](auto const& caller) { From 5efa570943efe1cf2f8b1873806f16102551b490 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 7 Aug 2024 11:32:09 -0700 Subject: [PATCH 13/14] Implemented tail recursion optimization and inlined initial rule into grammar program. Fixes #12. --- lug/lug.hpp | 179 +++++++++++++++++++++++++--------------------------- 1 file changed, 87 insertions(+), 92 deletions(-) diff --git a/lug/lug.hpp b/lug/lug.hpp index 52503c7..b5b2a6d 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -82,10 +82,9 @@ struct alignas(std::uint_least64_t) instruction constexpr instruction(opcode o, std::uint_least8_t i8, std::uint_least16_t i16, std::int_least32_t o32) noexcept : op{o}, immediate8{i8}, immediate16{i16}, offset32{o32} {} }; -static_assert(sizeof(instruction) == sizeof(std::uint_least64_t), "expected instruction size to be same size as 32-bit integer"); -static_assert(alignof(instruction) == alignof(std::uint_least64_t), "expected instruction alignment to be same size as 32-bit integer"); +static_assert(sizeof(instruction) == sizeof(std::uint_least64_t), "expected instruction size to be same size as std::uint_least64_t"); +static_assert(alignof(instruction) == alignof(std::uint_least64_t), "expected instruction alignment to be same size as std::uint_least64_t"); -enum class instruction_address : std::size_t {}; enum class directives : std::uint_least8_t { none = 0, caseless = 1, eps = 2, lexeme = 4, noskip = 8, preskip = 16, postskip = 32, is_bitfield_enum }; using program_callees = std::vector>; @@ -388,11 +387,11 @@ class encoder }; } - instruction_address do_call(rule const* r, program const* p, std::ptrdiff_t off, std::uint_least16_t prec) + std::ptrdiff_t do_call(rule const* r, program const* p, std::ptrdiff_t off, std::uint_least16_t prec) { auto callee_mode = mode_.back(); skip(p->entry_mode ^ directives::eps, directives::noskip); - callees_->emplace_back(r, p, static_cast(program_->instructions.size()), callee_mode); + callees_->emplace_back(r, p, here(), callee_mode); return encode(opcode::call, off, prec, 0); } @@ -409,27 +408,26 @@ class encoder ~encoder() { if (rule_) { rule_->currently_encoding_ = false; } if (program_) { program_->entry_mode = entry_mode(); } } [[nodiscard]] directives mode() const noexcept { return mode_.back(); } [[nodiscard]] directives entry_mode() const noexcept { return (entry_mode_ & ~directives::eps) | mode_.back(); } - [[nodiscard]] instruction_address here() const noexcept { return static_cast(program_->instructions.size()); } - instruction& instruction_at(instruction_address addr) { return program_->instructions[static_cast(addr)]; } - void jump_to_target(instruction_address addr, instruction_address target) { instruction_at(addr).offset32 = detail::checked_cast(static_cast(target) - static_cast(addr) - 1); } - void jump_to_here(instruction_address addr) { jump_to_target(addr, here()); } - template >> [[nodiscard]] decltype(auto) evaluate(E const& e, M const& m); - instruction_address append(instruction instr) { instruction_address const addr{here()}; program_->instructions.push_back(instr); return addr; } - instruction_address append(program const& p) { instruction_address const addr{here()}; program_->concatenate(p); return addr; } - instruction_address encode(opcode op) { return append(instruction{op, 0, 0, 0}); } - instruction_address encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) { return append(instruction{op, imm8, imm16, detail::checked_cast(off)}); } - instruction_address encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } - instruction_address encode(opcode op, std::string_view str, std::uint_least8_t imm8 = 0) { auto const rng = add_string(str); return append(instruction{op, imm8, rng.second, rng.first}); } - instruction_address encode(opcode op, semantic_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->actions, std::move(a)), 0}); } - instruction_address encode(opcode op, semantic_capture_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->captures, std::move(a)), 0}); } - instruction_address encode(opcode op, syntactic_predicate&& p, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->predicates, std::move(p)), 0}); } - instruction_address call(program const& p, std::uint_least16_t prec) { return do_call(nullptr, &p, 0, prec); } - instruction_address call(grammar const& g, std::uint_least16_t prec) { return do_call(nullptr, &g.program(), 3, prec); } - instruction_address match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, add_item(program_->runesets, std::move(runes)), 0); } - instruction_address match_any() { return skip().encode(opcode::match_any); } - instruction_address match_eps() { return skip(directives::lexeme).encode(opcode::match, std::string_view{}); } - - instruction_address call(rule const& r, std::uint_least16_t prec, bool allow_inlining = true) + [[nodiscard]] std::ptrdiff_t here() const noexcept { return static_cast(program_->instructions.size()); } + instruction& instruction_at(std::ptrdiff_t addr) { return program_->instructions[static_cast(addr)]; } + void jump_to_target(std::ptrdiff_t addr, std::ptrdiff_t target) { instruction_at(addr).offset32 = detail::checked_cast(static_cast(target) - static_cast(addr) - 1); } + void jump_to_here(std::ptrdiff_t addr) { jump_to_target(addr, here()); } + std::ptrdiff_t append(instruction instr) { std::ptrdiff_t const addr{here()}; program_->instructions.push_back(instr); return addr; } + std::ptrdiff_t append(program const& p) { std::ptrdiff_t const addr{here()}; program_->concatenate(p); return addr; } + std::ptrdiff_t encode(opcode op) { return append(instruction{op, 0, 0, 0}); } + std::ptrdiff_t encode(opcode op, std::ptrdiff_t off, std::uint_least16_t imm16, std::uint_least8_t imm8) { return append(instruction{op, imm8, imm16, detail::checked_cast(off)}); } + std::ptrdiff_t encode(opcode op, std::uint_least16_t imm16, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, imm16, 0}); } + std::ptrdiff_t encode(opcode op, std::string_view str, std::uint_least8_t imm8 = 0) { auto const rng = add_string(str); return append(instruction{op, imm8, rng.second, rng.first}); } + std::ptrdiff_t encode(opcode op, semantic_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->actions, std::move(a)), 0}); } + std::ptrdiff_t encode(opcode op, semantic_capture_action&& a, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->captures, std::move(a)), 0}); } + std::ptrdiff_t encode(opcode op, syntactic_predicate&& p, std::uint_least8_t imm8 = 0) { return append(instruction{op, imm8, add_item(program_->predicates, std::move(p)), 0}); } + std::ptrdiff_t call(program const& p, std::uint_least16_t prec) { return do_call(nullptr, &p, 0, prec); } + std::ptrdiff_t call(grammar const& g, std::uint_least16_t prec) { return do_call(nullptr, &g.program(), 3, prec); } + std::ptrdiff_t match(unicode::rune_set&& runes) { return skip().encode(opcode::match_set, add_item(program_->runesets, std::move(runes)), 0); } + std::ptrdiff_t match_any() { return skip().encode(opcode::match_any); } + std::ptrdiff_t match_eps() { return skip(directives::lexeme).encode(opcode::match, std::string_view{}); } + + std::ptrdiff_t call(rule const& r, std::uint_least16_t prec, bool allow_inlining = true) { if (auto const& p = r.program_; allow_inlining && prec <= 0 && !r.currently_encoding_ && r.callees_.empty() && !p.instructions.empty() && (p.instructions.size() <= 8) && (p.actions.size() <= 1) && (p.captures.size() <= 1) && (p.predicates.size() <= 1)) @@ -448,7 +446,7 @@ class encoder return m; } - instruction_address match(std::string_view subject) + std::ptrdiff_t match(std::string_view subject) { skip(!subject.empty() ? directives::eps : directives::none); if ((mode() & directives::caseless) != directives::none) @@ -457,7 +455,7 @@ class encoder } template >> - instruction_address match_class(T properties) + std::ptrdiff_t match_class(T properties) { return skip().encode(Op, detail::string_pack(properties), static_cast(unicode::to_property_enum_v>)); } @@ -591,28 +589,28 @@ class basic_regular_expression : public terminal_encoder_expression_interface public: explicit basic_regular_expression(std::string_view e) : expression_{e}, program_{std::make_shared()} {} - template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const&; + template [[nodiscard]] auto evaluate(encoder& d, M const& m) const -> M const&; }; struct string_expression : terminal_encoder_expression_interface { std::string_view text; constexpr explicit string_expression(std::string_view t) noexcept : text{t} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(text); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match(text); return m; } }; struct char_expression : terminal_encoder_expression_interface { char c; constexpr explicit char_expression(char x) noexcept : c{x} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(std::string_view{&c, 1}); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match(std::string_view{&c, 1}); return m; } }; struct char32_expression : terminal_encoder_expression_interface { char32_t c; constexpr explicit char32_expression(char32_t x) noexcept : c{x} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(utf8::encode_rune(c)); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match(utf8::encode_rune(c)); return m; } }; struct char32_range_expression : terminal_encoder_expression_interface @@ -620,7 +618,7 @@ struct char32_range_expression : terminal_encoder_expression_interface char32_t start; char32_t end; constexpr char32_range_expression(char32_t first, char32_t last) noexcept : start{first}, end{last} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match(unicode::sort_and_optimize(add_rune_range(unicode::rune_set{}, d.mode(), start, end))); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match(unicode::sort_and_optimize(add_rune_range(unicode::rune_set{}, d.mode(), start, end))); return m; } }; template @@ -628,7 +626,7 @@ struct callable_expression : terminal_encoder_expression_interface { std::reference_wrapper target; constexpr explicit callable_expression(Target& t) noexcept : target{t} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), 0); } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), 0); } }; template struct is_callable_encoder_expression : std::false_type {}; @@ -640,7 +638,7 @@ struct predicate_expression : terminal_encoder_expression_interface { Pred pred; template >> constexpr explicit predicate_expression(P&& p) noexcept(std::is_nothrow_constructible_v) : pred(std::forward

(p)) {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::predicate, syntactic_predicate{pred}); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::predicate, syntactic_predicate{pred}); return m; } }; template unary_encoder_expression_interface(X1&&) -> unary_encoder_expression_interface>; @@ -670,24 +668,20 @@ template && is_ex template >> [[nodiscard]] constexpr auto make_space_expression(E const& e) { - return [x = make_expression(e)](encoder& d) { (void)x(d, encoder_metadata{}); }; -} - -template -[[nodiscard]] inline decltype(auto) encoder::evaluate(E const& e, M const& m) -{ - return make_expression(e)(*this, m); + return [x = make_expression(e)](encoder& d) { (void)x.evaluate(d, encoder_metadata{}); }; } template inline rule::rule(E const& e) { - (void)encoder{*this}.evaluate(make_expression(e), encoder_metadata{}); + encoder rule_encoder{*this}; + (void)make_expression(e).evaluate(rule_encoder, encoder_metadata{}); } inline rule::rule(rule const& r) { - encoder{*this}.call(r, 1); + encoder rule_encoder{*this}; + rule_encoder.call(r, 1); } struct rule_precedence_expression : terminal_encoder_expression_interface @@ -695,7 +689,7 @@ struct rule_precedence_expression : terminal_encoder_expression_interface std::reference_wrapper target; std::uint_least16_t prec; rule_precedence_expression(rule const& t, std::uint_least16_t p) noexcept : target{t}, prec{p} {} - template [[nodiscard]] auto operator()(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), prec); } + template [[nodiscard]] auto evaluate(encoder& d, M const& m) const -> M const& { return d.call_with_frame(m, target.get(), prec); } }; [[nodiscard]] inline auto rule::operator[](std::uint_least16_t prec) const noexcept @@ -715,10 +709,10 @@ struct directive_expression : unary_encoder_expression_interface : unary_encoder_expression_interface{std::forward(x1)}, enable_mask{enable}, disable_mask{disable}, relay_mask{relay} {} template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.dpsh(enable_mask, disable_mask); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.dpop(relay_mask); return m2; } @@ -745,11 +739,11 @@ inline constexpr directive_modifier skip_after{}; inline constexpr directive_modifier skip_before{}; -struct nop_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& /*d*/, M const& m) const -> M const& { return m; } }; -struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; -struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2, 0, 0); d.encode(opcode::match_any, 0x8000, 0); d.encode(opcode::fail, 0, 2); return m; } }; -struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; -struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::cut); return m; } }; +struct nop_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto evaluate(encoder& /*d*/, M const& m) const -> M const& { return m; } }; +struct eps_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match_eps(); return m; } }; +struct eoi_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::choice, 2, 0, 0); d.encode(opcode::match_any, 0x8000, 0); d.encode(opcode::fail, 0, 2); return m; } }; +struct eol_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::match_eol); return m; } }; +struct cut_expression : terminal_encoder_expression_interface { template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::cut); return m; } }; template struct match_class_combinator @@ -759,7 +753,7 @@ struct match_class_combinator { Property property; constexpr explicit match_class_expression(Property p) noexcept : property{p} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_class(property); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match_class(property); return m; } }; template >> @@ -768,13 +762,13 @@ struct match_class_combinator struct match_any_expression : terminal_encoder_expression_interface, match_class_combinator { - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_any(); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match_any(); return m; } }; template struct ctype_expression : terminal_encoder_expression_interface { - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.match_class(Property); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.match_class(Property); return m; } }; template @@ -784,7 +778,7 @@ struct condition_test_combinator { std::string_view name; constexpr explicit condition_test_expression(std::string_view n) noexcept : name{n} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, Value ? 1 : 0); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::condition_test, name, Value ? 1 : 0); return m; } }; [[nodiscard]] constexpr condition_test_expression operator()(std::string_view name) const noexcept { return condition_test_expression{name}; } @@ -800,10 +794,10 @@ struct condition_block_combinator constexpr condition_block_expression(E1 const& x1, std::string_view n) noexcept : unary_encoder_expression_interface{x1}, name{n} {} template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.encode(opcode::condition_push, name, Value ? 1 : 0); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::condition_pop); return m2; } @@ -830,7 +824,7 @@ struct symbol_exists_combinator { std::string_view name; constexpr explicit symbol_exists_expression(std::string_view n) noexcept : name{n} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, Value ? 1 : 0); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.encode(opcode::symbol_exists, name, Value ? 1 : 0); return m; } }; [[nodiscard]] constexpr symbol_exists_expression operator()(std::string_view name) const noexcept { return symbol_exists_expression{name}; } @@ -843,7 +837,7 @@ struct symbol_match_combinator { std::string_view name; constexpr explicit symbol_match_expression(std::string_view n) noexcept : name{n} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name); return m; } }; [[nodiscard]] constexpr symbol_match_expression operator()(std::string_view name) const noexcept { return symbol_match_expression{name}; } @@ -857,7 +851,7 @@ struct symbol_match_offset_combinator std::string_view name; std::uint_least8_t offset; constexpr symbol_match_offset_expression(std::string_view n, std::uint_least8_t o) noexcept : name{n}, offset{o} {} - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name, offset); return m; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const -> M const& { d.skip(directives::eps).encode(((d.mode() & directives::caseless) != directives::none) ? OpCf : Op, name, offset); return m; } }; [[nodiscard]] constexpr symbol_match_offset_expression operator()(std::string_view name, std::size_t offset = 0) const @@ -872,10 +866,10 @@ struct negative_lookahead_expression : unary_encoder_expression_interface using unary_encoder_expression_interface::unary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { auto const choice = d.encode(opcode::choice); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::fail, 0, 2); d.jump_to_here(choice); return m2; @@ -888,10 +882,10 @@ struct positive_lookahead_expression : unary_encoder_expression_interface using unary_encoder_expression_interface::unary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { auto const choice = d.encode(opcode::choice); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::commit_back, 1, 0, 0); d.jump_to_here(choice); d.encode(opcode::fail, 0, 1); @@ -905,11 +899,11 @@ struct repetition_expression : unary_encoder_expression_interface using unary_encoder_expression_interface::unary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { auto const choice = d.encode(opcode::choice); auto const expression = d.here(); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); auto const commit = d.encode(opcode::commit_partial); d.jump_to_here(choice); d.jump_to_target(commit, expression); @@ -923,13 +917,13 @@ struct choice_expression : binary_encoder_expression_interface using binary_encoder_expression_interface::binary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { auto const choice = d.encode(opcode::choice); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); auto const commit = d.encode(opcode::commit); d.jump_to_here(choice); - auto m3 = d.evaluate(this->e2, m2); + auto m3 = this->e2.evaluate(d, m2); d.jump_to_here(commit); return m3; } @@ -941,10 +935,9 @@ struct sequence_expression : binary_encoder_expression_interface using binary_encoder_expression_interface::binary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { - auto m2 = d.evaluate(this->e1, m); - return d.evaluate(this->e2, m2); + return this->e2.evaluate(d, this->e1.evaluate(d, m)); } }; @@ -955,13 +948,13 @@ struct attribute_action_expression : unary_encoder_expression_interface template constexpr attribute_action_expression(X1&& x1, O&& o) : unary_encoder_expression_interface{std::forward(x1)}, operand(std::forward(o)) {} template - [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const { if constexpr (is_callable_encoder_expression_v> && (std::tuple_size_v != 0)) { d.encode(opcode::action, semantic_action{[frame = m.attribute_frame](environment& envr) { envr.push_attribute_frame(frame); }}); static_cast(*this).do_prologue(d); d.call(this->e1.target, 0); static_cast(*this).do_epilogue_inlined(d, m); return m; } else { - static_cast(*this).do_prologue(d); auto m2 = d.evaluate(this->e1, m); static_cast(*this).do_epilogue(d); return m2; + static_cast(*this).do_prologue(d); auto m2 = this->e1.evaluate(d, m); static_cast(*this).do_epilogue(d); return m2; } } }; @@ -970,7 +963,7 @@ template struct attribute_bind_to_expression : attribute_action_expression { using attribute_action_expression::attribute_action_expression; - template [[nodiscard]] constexpr auto operator()(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::operator()(d, m)).attribute_frame, std::forward_as_tuple(*(this->operand)))}; } + template [[nodiscard]] constexpr auto evaluate(encoder& d, M const& m) const { return encoder_metadata{std::tuple_cat((attribute_action_expression::evaluate(d, m)).attribute_frame, std::forward_as_tuple(*(this->operand)))}; } }; template @@ -1014,7 +1007,7 @@ struct symbol_assign_expression : unary_encoder_expression_interface { std::string_view name; template constexpr symbol_assign_expression(X1&& x1, std::string_view n) : unary_encoder_expression_interface{std::forward(x1)}, name{n} {} - template [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const { d.skip().encode(opcode::symbol_start, name); auto m2 = d.evaluate(this->e1, m); d.encode(opcode::symbol_end); return m2; } + template [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.skip().encode(opcode::symbol_start, name); auto m2 = this->e1.evaluate(d, m); d.encode(opcode::symbol_end); return m2; } }; template @@ -1023,10 +1016,10 @@ struct symbol_block_expression : unary_encoder_expression_interface using unary_encoder_expression_interface::unary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.skip().encode(opcode::symbol_push); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::symbol_pop); return m2; } @@ -1038,10 +1031,10 @@ struct local_block_expression : unary_encoder_expression_interface using unary_encoder_expression_interface::unary_encoder_expression_interface; template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.skip().encode(opcode::symbol_push, 0, 2); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::symbol_pop); return m2; } @@ -1054,10 +1047,10 @@ struct local_to_block_expression : unary_encoder_expression_interface constexpr local_to_block_expression(E1 const& x1, std::string_view n) noexcept : unary_encoder_expression_interface{x1}, name{n} {} template - [[nodiscard]] constexpr decltype(auto) operator()(encoder& d, M const& m) const + [[nodiscard]] constexpr decltype(auto) evaluate(encoder& d, M const& m) const { d.skip().encode(opcode::symbol_push, name, 1); - auto m2 = d.evaluate(this->e1, m); + auto m2 = this->e1.evaluate(d, m); d.encode(opcode::symbol_pop); return m2; } @@ -1261,17 +1254,17 @@ class implicit_space_rule program grprogram; program_callees grcallees; encoder grencoder{grprogram, grcallees, directives::eps | directives::preskip}; - grencoder.call(start_rule, std::uint_least16_t{1}, false); - grencoder.encode(opcode::halt); - std::unordered_set left_recursive; - std::unordered_map addresses; - std::vector> calls{{&start_rule.program_, std::get<2>(grcallees.back())}}; + grencoder.skip(start_rule.program_.entry_mode, directives::noskip); std::vector>, program const*>> unprocessed{{std::vector>{{&start_rule, false}}, &start_rule.program_}}; + std::vector> calls; + std::unordered_map addresses; + std::unordered_set left_recursive; + opcode prologue{opcode::halt}; do { auto const&& [callstack, subprogram] = detail::pop_back(unprocessed); - if (auto const address = static_cast(grencoder.here()); addresses.emplace(subprogram, address).second) { + if (auto const address = grencoder.here(); addresses.emplace(subprogram, address).second) { grencoder.append(*subprogram); - grencoder.encode(opcode::ret); + grencoder.encode(std::exchange(prologue, opcode::ret)); if (auto const top_rule = callstack.back().first; top_rule) { for (auto&& [callee_rule, callee_program, instr_offset, mode] : top_rule->callees_) { calls.emplace_back(callee_program, address + instr_offset); @@ -1292,9 +1285,11 @@ class implicit_space_rule } } while (!unprocessed.empty()); for (auto [subprogram, instr_addr] : calls) { - if (auto& instr = grprogram.instructions[static_cast(instr_addr)]; instr.op == opcode::call) { + if (auto& instr = grencoder.instruction_at(instr_addr); instr.op == opcode::call) { instr.immediate16 = ((left_recursive.count(subprogram) != 0) ? (std::max)(instr.immediate16, std::uint_least16_t{1}) : std::uint_least16_t{0}); instr.offset32 = detail::checked_cast(instr.offset32 + addresses[subprogram] - (instr_addr + 1)); + if ((instr.immediate16 == 0) && (grencoder.instruction_at(instr_addr + 1).op == opcode::ret)) + instr.op = opcode::jump; } } grprogram.data.emplace_back('\0'); @@ -2038,7 +2033,7 @@ LUG_DIAGNOSTIC_PUSH_AND_IGNORE LUG_DIAGNOSTIC_POP template -inline auto basic_regular_expression::operator()(encoder& d, M const& m) const -> M const& +inline auto basic_regular_expression::evaluate(encoder& d, M const& m) const -> M const& { if (program_->instructions.empty()) { static grammar const grmr = make_grammar(); From fc93632b73bba700f6dd0c5a3d7a9389bdaac161 Mon Sep 17 00:00:00 2001 From: Jesse Towner Date: Wed, 7 Aug 2024 12:05:25 -0700 Subject: [PATCH 14/14] Fix clang-tidy warnings introduced from recent changes. --- .clang-tidy | 1 + CHANGELOG.md | 4 +++- lug/lug.hpp | 17 +++++++++++------ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index a6cb299..4599919 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -5,6 +5,7 @@ Checks: - -clang-analyzer-optin.core.EnumCastOutOfRange # interferes with enum bitfield flags - android-* - bugprone-* + - -bugprone-easily-swappable-parameters # many functions have easily swappable parameters - cert-* - -cert-dcl21-cpp # this check is deprecated, it is no longer part of the CERT standard - concurrency-* diff --git a/CHANGELOG.md b/CHANGELOG.md index d151687..6d18ed7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,9 @@ * Extracted common base class of `lug::basic_parser` into `lug::parser_base` to reduce template bloat. * Reduced template bloat for parser directives (improving compiler error messages) and optimized nested directives. * Merged parser stack frames together into a single stack using `std::variant`. -* Removed use of `goto` in parser machine main loop. +* Removed use of `goto` in parsing machine main loop. +* Changed grammar program generation to force inline the start rule. +* Implemented tail call optimization. ## Release v0.3.0 (July 4, 2024) diff --git a/lug/lug.hpp b/lug/lug.hpp index b5b2a6d..6a3d29b 100644 --- a/lug/lug.hpp +++ b/lug/lug.hpp @@ -51,7 +51,7 @@ template inline constexpr bool is_capture_target_v = std::is_same_v::digits - 1); + static constexpr std::size_t cut_deferred_flag = std::size_t{1} << static_cast(std::numeric_limits::digits - 1); std::size_t sr; // subject register std::size_t mr; // match register std::size_t rc; // response counter @@ -405,12 +405,16 @@ class encoder public: explicit encoder(program& p, program_callees& c, directives initial) : program_{&p}, callees_{&c}, mode_{initial} {} explicit encoder(rule& r) : rule_{&r}, program_{&r.program_}, callees_{&r.callees_}, mode_{directives::eps} { rule_->currently_encoding_ = true; } - ~encoder() { if (rule_) { rule_->currently_encoding_ = false; } if (program_) { program_->entry_mode = entry_mode(); } } + ~encoder() { if (rule_ != nullptr) { rule_->currently_encoding_ = false; } if (program_ != nullptr) { program_->entry_mode = entry_mode(); } } + encoder(encoder const&) = delete; + encoder(encoder&&) = default; + encoder& operator=(encoder const&) = delete; + encoder& operator=(encoder&&) = default; [[nodiscard]] directives mode() const noexcept { return mode_.back(); } [[nodiscard]] directives entry_mode() const noexcept { return (entry_mode_ & ~directives::eps) | mode_.back(); } [[nodiscard]] std::ptrdiff_t here() const noexcept { return static_cast(program_->instructions.size()); } instruction& instruction_at(std::ptrdiff_t addr) { return program_->instructions[static_cast(addr)]; } - void jump_to_target(std::ptrdiff_t addr, std::ptrdiff_t target) { instruction_at(addr).offset32 = detail::checked_cast(static_cast(target) - static_cast(addr) - 1); } + void jump_to_target(std::ptrdiff_t addr, std::ptrdiff_t target) { instruction_at(addr).offset32 = detail::checked_cast(target - addr - 1); } void jump_to_here(std::ptrdiff_t addr) { jump_to_target(addr, here()); } std::ptrdiff_t append(instruction instr) { std::ptrdiff_t const addr{here()}; program_->instructions.push_back(instr); return addr; } std::ptrdiff_t append(program const& p) { std::ptrdiff_t const addr{here()}; program_->concatenate(p); return addr; } @@ -532,7 +536,6 @@ class basic_regular_expression : public terminal_encoder_expression_interface struct generator final : environment { - basic_regular_expression const& owner; lug::program_callees callees; lug::encoder encoder; unicode::rune_set runes; @@ -540,7 +543,7 @@ class basic_regular_expression : public terminal_encoder_expression_interface bool circumflex{false}; generator(basic_regular_expression const& se, directives mode) - : owner{se}, encoder{*se.program_, callees, mode | directives::eps | directives::lexeme} + : encoder{*se.program_, callees, mode | directives::eps | directives::lexeme} {} void bracket_class(std::string_view s) @@ -1390,13 +1393,15 @@ class parser_base using symbol_table_frame = std::unordered_map>; using stack_frame = std::variant; + // NOLINTBEGIN(cppcoreguidelines-non-private-member-variables-in-classes,misc-non-private-member-variables-in-classes) lug::grammar const* grammar_; lug::environment* environment_; std::vector responses_; - std::vector stack_frames_; + std::vector stack_frames_; std::unordered_map casefolded_subjects_; lug::registers registers_{0, 0, 0, 0, 0, 0, 0}; bool parsing_{false}; + // NOLINTEND(cppcoreguidelines-non-private-member-variables-in-classes,misc-non-private-member-variables-in-classes) template void commit(std::ptrdiff_t off)