Skip to content
This repository
Browse code

Can we make parsing substantially faster?

This experiment implements a micro-optimisation approach that blows
parsing speed off the chart for a simple ERB parser. It implements an idea
that I had after reading a mailing-list post by Eric Sagnes.
  • Loading branch information...
commit 872611321c4af390c8b89e5d54b24613b4280fba 1 parent e3c600a
Kaspar Schiess authored

Showing 1 changed file with 271 additions and 0 deletions. Show diff stats Hide diff stats

  1. +271 0 experiments/optimizer.rb
271 experiments/optimizer.rb
... ... @@ -0,0 +1,271 @@
  1 +# Example that demonstrates how a simple erb-like parser could be constructed.
  2 +
  3 +$:.unshift File.dirname(__FILE__) + "/../lib"
  4 +
  5 +require 'parslet'
  6 +require 'parslet/atoms/visitor'
  7 +require 'parslet/convenience'
  8 +require 'blankslate'
  9 +
  10 +class ErbParser < Parslet::Parser
  11 + rule(:ruby) { (str('%>').absent? >> any).repeat.as(:ruby) }
  12 +
  13 + rule(:expression) { (str('=') >> ruby).as(:expression) }
  14 + rule(:comment) { (str('#') >> ruby).as(:comment) }
  15 + rule(:code) { ruby.as(:code) }
  16 + rule(:erb) { expression | comment | code }
  17 +
  18 + rule(:erb_with_tags) { str('<%') >> erb >> str('%>') }
  19 + rule(:text) { (str('<%').absent? >> any).repeat(1) }
  20 +
  21 + rule(:text_with_ruby) { (text.as(:text) | erb_with_tags).repeat.as(:text) }
  22 + root(:text_with_ruby)
  23 +end
  24 +
  25 +class Parslet::Source
  26 + def match_excluding str
  27 + slice_str = @str.check_until(Regexp.new(Regexp.escape(str)))
  28 + return @str.rest_size unless slice_str
  29 + return slice_str.size - str.size
  30 + end
  31 +end
  32 +
  33 +class AbsentParser < Parslet::Atoms::Base
  34 + def initialize absent
  35 + @absent = absent
  36 + end
  37 +
  38 + def try(source, context, consume_all)
  39 + excluding_length = source.match_excluding(@absent)
  40 +
  41 + if excluding_length >= 1
  42 + return succ(source.consume(excluding_length))
  43 + else
  44 + return context.err(self, source, "Failed absence #{@absent.inspect}.")
  45 + end
  46 + end
  47 +end
  48 +
  49 +class Parslet::Optimizer
  50 + module DSL
  51 + def >> other
  52 + Match::Sequence.new(self, other)
  53 + end
  54 + def absent?
  55 + Match::Lookahead.new(false, self)
  56 + end
  57 + def repeat(min=0, max=nil)
  58 + Match::Repetition.new(self, min, max)
  59 + end
  60 + end
  61 + module Match
  62 + class Base
  63 + include DSL
  64 +
  65 + def visit_parser(root)
  66 + false
  67 + end
  68 + def visit_entity(name, block)
  69 + false
  70 + end
  71 + def visit_named(name, atom)
  72 + false
  73 + end
  74 + def visit_repetition(tag, min, max, atom)
  75 + false
  76 + end
  77 + def visit_alternative(alternatives)
  78 + false
  79 + end
  80 + def visit_sequence(sequence)
  81 + false
  82 + end
  83 + def visit_lookahead(positive, atom)
  84 + false
  85 + end
  86 + def visit_re(regexp)
  87 + false
  88 + end
  89 + def visit_str(str)
  90 + false
  91 + end
  92 + def match(other, bindings)
  93 + @bindings = bindings
  94 + other.accept(self)
  95 + end
  96 + end
  97 + class Str < Base
  98 + def initialize(variable)
  99 + @variable = variable
  100 + end
  101 + def visit_str(str)
  102 + if bound_value=@bindings[@variable]
  103 + return bound_value == str
  104 + else
  105 + @bindings[@variable] = str
  106 + return true
  107 + end
  108 + end
  109 + end
  110 + class Lookahead < Base
  111 + def initialize(positive, expression)
  112 + @positive, @expression = positive, expression
  113 + end
  114 + def visit_lookahead(positive, atom)
  115 + positive == @positive &&
  116 + @expression.match(atom, @bindings)
  117 + end
  118 + end
  119 + class Sequence < Base
  120 + def initialize(*parslets)
  121 + @parslets = parslets
  122 + end
  123 + def visit_sequence(sequence)
  124 + sequence.zip(@parslets).all? { |atom, expr| expr.match(atom, @bindings) }
  125 + end
  126 + end
  127 + class Repetition < Base
  128 + def initialize(expression, min, max)
  129 + @min, @max, @expression = min, max, expression
  130 + end
  131 + def visit_repetition(tag, min, max, atom)
  132 + @min == min && @max == max && @expression.match(atom, @bindings)
  133 + end
  134 + end
  135 + class Re < Base
  136 + def initialize(variable)
  137 + @variable = variable
  138 + end
  139 + def visit_re(regexp)
  140 + case @variable
  141 + when Symbol
  142 + p [@variable, regexp]
  143 + fail
  144 + else
  145 + @variable == regexp
  146 + end
  147 + end
  148 + end
  149 + end
  150 +
  151 + def self.str(var)
  152 + Match::Str.new(var)
  153 + end
  154 + def self.any
  155 + Match::Re.new('.')
  156 + end
  157 +
  158 + class Rule
  159 + def initialize(expression, replacement)
  160 + @expression, @replacement = expression, replacement
  161 + end
  162 +
  163 + class Context < BlankSlate
  164 + def initialize(bindings)
  165 + @bindings = bindings
  166 + end
  167 + def method_missing(sym, *args, &block)
  168 + if args.size == 0 && !block && @bindings.has_key?(sym)
  169 + return @bindings[sym]
  170 + end
  171 +
  172 + super
  173 + end
  174 + def call(callable)
  175 + instance_eval(&callable)
  176 + end
  177 + end
  178 +
  179 + def match other
  180 + bindings = {}
  181 + if @expression.match(other, bindings)
  182 + return bindings
  183 + end
  184 + end
  185 + def call(bindings)
  186 + context = Context.new(bindings)
  187 + context.call(@replacement)
  188 + end
  189 + end
  190 + def self.rule(expression, &replacement)
  191 + rules << Rule.new(expression, replacement)
  192 + end
  193 + def self.rules
  194 + @rules ||= []
  195 + end
  196 + def rules
  197 + self.class.rules
  198 + end
  199 +
  200 + class Transform
  201 + def initialize(rules)
  202 + @rules = rules
  203 + @candidates = []
  204 + end
  205 +
  206 + def default_parser(root)
  207 + root.accept(self)
  208 + end
  209 + def default_entity(name, block)
  210 + Parslet::Atoms::Entity.new(name) { block.call.accept(self) }
  211 + end
  212 + def default_named(name, atom)
  213 + Parslet::Atoms::Named.new(atom.accept(self), name)
  214 + end
  215 + def default_repetition(tag, min, max, atom)
  216 + Parslet::Atoms::Repetition.new(atom.accept(self), min, max, tag)
  217 + end
  218 + def default_alternative(alternatives)
  219 + Parslet::Atoms::Alternative.new(
  220 + *alternatives.map { |atom| atom.accept(self) })
  221 + end
  222 + def default_sequence(sequence)
  223 + Parslet::Atoms::Sequence.new(
  224 + *sequence.map { |atom| atom.accept(self) })
  225 + end
  226 + def default_lookahead(positive, atom)
  227 + Parslet::Atoms::Lookahead.new(atom, positive)
  228 + end
  229 + def default_re(regexp)
  230 + Parslet::Atoms::Re.new(regexp)
  231 + end
  232 + def default_str(str)
  233 + Parslet::Atoms::Str.new(str)
  234 + end
  235 +
  236 + def method_missing(sym, *args, &block)
  237 + if (md=sym.to_s.match(/visit_([a-z]+)/)) && !block
  238 + # Obtain the default, which is a completely transformed new parser
  239 + default = self.send("default_#{md[1]}", *args)
  240 + # Try transforming this parser again at the current level
  241 + return transform(default)
  242 + end
  243 +
  244 + super
  245 + end
  246 + def transform(atom)
  247 + # Try to match one of the rules against the newly constructed tree.
  248 + @rules.each do |rule|
  249 + if bindings=rule.match(atom)
  250 + return rule.call(bindings)
  251 + end
  252 + end
  253 +
  254 + # No match, returning new atom.
  255 + return atom
  256 + end
  257 + end
  258 +
  259 + def apply(parser)
  260 + parser.accept(Transform.new(rules))
  261 + end
  262 +end
  263 +class Optimizer < Parslet::Optimizer
  264 + rule((str(:x).absent? >> any).repeat(1)) {
  265 + AbsentParser.new(x) }
  266 +end
  267 +
  268 +parser = ErbParser.new
  269 +optimized_parser = Optimizer.new.apply(parser)
  270 +# p optimized_parser.parse(File.read(ARGV.first))
  271 +p parser.parse_with_debug(File.read(ARGV.first))

0 comments on commit 8726113

Please sign in to comment.
Something went wrong with that request. Please try again.