First commit

jarrett · Feb 26, 2014 · 4e60f7e · 4e60f7e
commit 4e60f7e
Show file tree

Hide file tree

Showing 15 changed files with 556 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,56 @@
+
+#********** osx template********** 
+
+.DS_Store
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+
+#********** linux template********** 
+
+.*
+!.gitignore
+*~
+
+# KDE
+.directory
+
+
+#********** windows template********** 
+
+# Windows image file caches
+Thumbs.db 
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+
+#********** ruby template********** 
+
+*.gem
+*.rbc
+.bundle
+.config
+coverage
+InstalledFiles
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+
+# YARD artifacts
+.yardoc
+_yardoc
+doc/
+
diff --git a/Rakefile b/Rakefile
@@ -0,0 +1,24 @@
+require 'rake/testtask'
+
+# Defines the `test` task. To run one test: rake test TEST=just_one_file.rb
+Rake::TestTask.new do |test_task|
+  test_task.libs.push('test')
+  test_task.test_files = FileList['test/*_test.rb']
+end
+
+# Returns the filename of the most recently modified erb_parser gem in the
+# current directory, or nil when none is present.
+#
+# Choosing by modification time keeps the result deterministic when several
+# built versions sit side by side; Dir.glob on its own does not promise which
+# match comes first on Rubies older than 3.0.
+def built_gem_name
+  Dir.glob('erb_parser-*.*.*.gem').max_by { |path| File.mtime(path) }
+end
+
+# Deletes any gem files in the current directory, then builds a fresh gem from
+# erb_parser.gemspec.
+task :build do
+  # rm_f is a no-op when nothing matches, unlike shelling out to `rm *.gem`.
+  rm_f Dir.glob('*.gem')
+  # sh fails the task when `gem build` exits non-zero instead of carrying on.
+  sh 'gem build erb_parser.gemspec'
+end
+
+# Installs the gem file reported by built_gem_name using `gem install`.
+task :install do
+  gem_file = built_gem_name
+  # Without this guard the command would run as a bare `gem install`.
+  abort 'No built gem found. Run `rake build` first.' unless gem_file
+  # The multi-argument form bypasses the shell; sh fails the task on a non-zero exit.
+  sh 'gem', 'install', gem_file
+end
+
+# Publishes the gem file reported by built_gem_name using `gem push`.
+task :release do
+  gem_file = built_gem_name
+  # Without this guard the command would run as a bare `gem push`.
+  abort 'No built gem found. Run `rake build` first.' unless gem_file
+  # The multi-argument form bypasses the shell; sh fails the task on a non-zero exit.
+  sh 'gem', 'push', gem_file
+end
diff --git a/erb_parser.gemspec b/erb_parser.gemspec
@@ -0,0 +1,16 @@
+# Packaging metadata for the erb_parser gem.
+Gem::Specification.new do |spec|
+  spec.name        = 'erb_parser'
+  spec.version     = '0.0.0'
+  spec.date        = '2014-02-26'
+  spec.authors     = ['Jarrett Colby']
+  spec.email       = 'jarrett@madebyhq.com'
+  spec.homepage    = 'http://madebyhq.com/'
+  spec.summary     = 'Parser for ERB templates'
+  spec.description = 'Parses ERB templates into two types of tokens: Plain text and ERB tags. Special support for HTML/XML.'
+
+  # Package everything found under lib/.
+  spec.files = Dir.glob('lib/**/*')
+
+  spec.add_runtime_dependency 'treetop'
+
+  %w[minitest turn].each do |dev_gem|
+    spec.add_development_dependency dev_gem
+  end
+end
diff --git a/lib/erb_parser.rb b/lib/erb_parser.rb
@@ -0,0 +1,33 @@
+require 'treetop'
+require 'erb_parser/nodes'
+require 'erb_parser/treetop_runner'
+require 'erb_parser/parsed_erb'
+require 'erb_parser/erb_tag'
+require 'erb_parser/xml_transformer'
+
+module ErbParser
+  class << self
+    # Runs the string through TreetopRunner and wraps the result in a ParsedErb.
+    def parse(str)
+      ast = TreetopRunner.run(str)
+      ParsedErb.new(ast)
+    end
+
+    # Takes a string representing an XML document or fragment, replaces every ERB
+    # tag in it with an XML element, and returns the transformed string. The
+    # element is named +erb+ by default and contains the tag's inner Ruby code,
+    # escaped for XML. To use a different element name:
+    #
+    #    ErbParser.transform_xml str, :tag => 'tag-name'
+    #
+    # An ERB tag of the form +<%=+ gets the attribute +interpolated="true"+.
+    # Otherwise, an ERB tag of the form +<%#+ gets the attribute +comment="true"+.
+    # Either attribute can be replaced or switched off:
+    #
+    #    ErbParser.transform_xml str, :interp_attr => {'attr-name' => 'attr-value'}
+    #    ErbParser.transform_xml str, :interp_attr => false
+    #
+    #    ErbParser.transform_xml str, :comment_attr => {'attr-name' => 'attr-value'}
+    #    ErbParser.transform_xml str, :comment_attr => false
+    def transform_xml(str, options = {})
+      parsed = parse(str)
+      XmlTransformer.transform(parsed, options)
+    end
+  end
+end
diff --git a/lib/erb_parser/erb_grammar.treetop b/lib/erb_parser/erb_grammar.treetop
@@ -0,0 +1,47 @@
+module ErbParser
+  grammar ErbGrammar
+    # A template is any sequence of ERB tags and runs of plain text.
+    rule document
+      (erb_tag / text)*
+    end
+
+    # One or more characters, stopping before the next '<%'.
+    rule text
+      (!'<%' .)+
+      <Text>
+    end
+
+    # '<%', an optional '#', an optional '=', the Ruby code, then '%>'.
+    rule erb_tag
+      '<%'
+      number_sign:'#'? equal_sign:'='?
+      _ruby_code:ruby_code
+      '%>'
+      <ErbTag>
+    end
+
+    # Everything up to the closing '%>'. String literals are consumed as a unit
+    # so that a '%>' inside a string does not end the tag.
+    rule ruby_code
+      (string_literal / (!'%>' .))*
+    end
+
+    # Matches the following quote styles:
+    # "string"
+    # 'string'
+    # %q(string (string) string)
+    # %Q(string (string) string)
+    # %(string (string) string)
+    # %q{string {string} string}
+    # %Q{string {string} string}
+    # %{string {string} string}
+    #
+    # Inside any of them a backslash escapes whatever character follows it, so a
+    # string ending in an escaped backslash closes at its final delimiter and an
+    # escaped delimiter does not close the string.
+    rule string_literal
+      ('"' (escaped_char / !'"' .)* '"') /
+      ('\'' (escaped_char / !'\'' .)* '\'') /
+      ('%' ('q' / 'Q')? (curly_brackets / parens))
+    end
+
+    # A backslash together with the single character after it.
+    rule escaped_char
+      '\\' .
+    end
+
+    # A balanced {...} group. The escape is tried first so that an escaped brace
+    # neither opens nor closes a group.
+    rule curly_brackets
+      '{' (escaped_char / curly_brackets / !'}' .)* '}'
+    end
+
+    # A balanced (...) group, with the same escape handling as curly_brackets.
+    rule parens
+      '(' (escaped_char / parens / !')' .)* ')'
+    end
+  end
+end
diff --git a/lib/erb_parser/erb_tag.rb b/lib/erb_parser/erb_tag.rb
@@ -0,0 +1,23 @@
+module ErbParser
+  # Wraps the Treetop node for one ERB tag and forwards every query to it.
+  class ErbTag
+    # treetop_node is the node that the methods below delegate to.
+    def initialize(treetop_node)
+      @treetop_node = treetop_node
+    end
+
+    # The node's own answer to comment?.
+    def comment?
+      @treetop_node.comment?
+    end
+
+    # The node's own answer to interpolated?.
+    def interpolated?
+      @treetop_node.interpolated?
+    end
+
+    # The Ruby code reported by the node.
+    def ruby_code
+      @treetop_node.ruby_code
+    end
+
+    # The node's full matched text.
+    def to_s
+      @treetop_node.text_value
+    end
+  end
+end
diff --git a/lib/erb_parser/nodes.rb b/lib/erb_parser/nodes.rb
@@ -0,0 +1,27 @@
+module ErbParser
+  module ErbGrammar
+    # Mixed into the nodes produced by the grammar's text rule.
+    module Text
+      def type
+        :text
+      end
+    end
+
+    # Mixed into the nodes produced by the grammar's erb_tag rule.
+    module ErbTag
+      # True when the tag opens with '<%#'.
+      def comment?
+        !number_sign.empty?
+      end
+
+      # True when the tag opens with '<%='. A tag that opens with '<%#=' is a
+      # commented-out output tag, so it is reported as a comment only.
+      def interpolated?
+        !comment? && !equal_sign.empty?
+      end
+
+      # The text matched between the opening markers and the closing '%>'.
+      def ruby_code
+        _ruby_code.text_value
+      end
+
+      def type
+        :erb_tag
+      end
+    end
+  end
+end
diff --git a/lib/erb_parser/parsed_erb.rb b/lib/erb_parser/parsed_erb.rb
@@ -0,0 +1,29 @@
+module ErbParser
+  # The result of parsing a template: a flat list of tokens plus the AST they
+  # were built from.
+  class ParsedErb
+    # Returns the array of parsed tokens. Each element is either a String,
+    # representing plain text, or an ErbTag.
+    attr_reader :tokens
+
+    # Returns the raw Treetop AST.
+    attr_reader :treetop_ast
+
+    # Builds one token for each element of the AST.
+    def initialize(treetop_ast)
+      @treetop_ast = treetop_ast
+      @tokens = treetop_ast.elements.map { |node| token_for(node) }
+    end
+
+    # Returns the token at the given position, exactly as tokens[index] would.
+    def [](index)
+      @tokens[index]
+    end
+
+    private
+
+    # Text nodes become their matched string; ERB tag nodes are wrapped in an
+    # ErbTag. Any other node type is an error.
+    def token_for(node)
+      case node.type
+      when :text    then node.text_value
+      when :erb_tag then ErbTag.new(node)
+      else raise "Unexpected type: #{node.type}"
+      end
+    end
+  end
+end
diff --git a/lib/erb_parser/treetop_runner.rb b/lib/erb_parser/treetop_runner.rb
@@ -0,0 +1,18 @@
+Treetop.load File.join(File.dirname(__FILE__), 'erb_grammar')
+
+module ErbParser
+  # Boilerplate for invoking Treetop. A successful run returns whatever Treetop
+  # returns.
+  module TreetopRunner
+    # Raised when the parser returns nothing; the message is the parser's
+    # failure_reason.
+    class ParseError < RuntimeError; end
+
+    # Parses str with a fresh ErbGrammarParser, forwarding options to it.
+    def self.run(str, options = {})
+      parser = ErbGrammarParser.new
+      ast = parser.parse(str, options)
+      raise ParseError, parser.failure_reason unless ast
+      ast
+    end
+  end
+end
diff --git a/lib/erb_parser/xml_transformer.rb b/lib/erb_parser/xml_transformer.rb
@@ -0,0 +1,52 @@
+require 'cgi'
+
+module ErbParser
+  # Renders a ParsedErb as a string in which every ERB tag has become an XML
+  # element.
+  module XmlTransformer
+    # Joins the tokens of parsed_erb into one string. Strings pass through
+    # unchanged; each ErbTag becomes an element named options[:tag] whose body is
+    # the tag's Ruby code, HTML-escaped. Missing options fall back to the defaults
+    # below. Any other kind of token raises.
+    def self.transform(parsed_erb, options)
+      defaults = {
+        :tag          => 'erb',
+        :interp_attr  => {'interpolated' => 'true'},
+        :comment_attr => {'comment'      => 'true'}
+      }
+      options = defaults.merge(options)
+
+      pieces = parsed_erb.tokens.map do |token|
+        case token
+        when String
+          token
+        when ErbTag
+          attrs = attrs_for(token, options)
+          content_tag options[:tag], CGI.escape_html(token.ruby_code), attrs
+        else
+          raise "Unexpected element: #{token.class.name}"
+        end
+      end
+      pieces.join
+    end
+
+    # Builds <name attrs>contents</name>. Attribute names and values are
+    # HTML-escaped; name and contents are inserted as given.
+    def self.content_tag(name, contents, attrs = {})
+      rendered_attrs = attrs.map do |key, val|
+        %Q( #{CGI.escape_html(key.to_s)}="#{CGI.escape_html(val.to_s)}")
+      end.join
+      "<#{name}#{rendered_attrs}>#{contents}</#{name}>"
+    end
+
+    # Chooses the attribute hash for a tag: :interp_attr for interpolated tags,
+    # otherwise :comment_attr for comment tags. The result is empty when neither
+    # applies or when the chosen option is not a Hash.
+    def self.attrs_for(tag, options)
+      option_key =
+        if tag.interpolated?
+          :interp_attr
+        elsif tag.comment?
+          :comment_attr
+        end
+      chosen = option_key && options[option_key]
+      chosen.is_a?(Hash) ? chosen : {}
+    end
+    private_class_method :attrs_for
+  end
+end
diff --git a/readme.md b/readme.md
@@ -0,0 +1,37 @@
+## Can ErbParser handle all valid Ruby code?
+
+No, it cannot. Ruby has a very complex syntax. In a library like this, it would be a fool's
+errand to try to handle every weird syntactic construct that could technically be
+considered valid Ruby. Instead, this library is designed to handle only the constructs
+that would commonly appear inside ERB tags. In other words, the basics of the language.
+
+Just avoid exotic syntactic constructs, and you should be fine. (You shouldn't do anything
+syntactically fancy in an ERB template anyway--it's bad coding style.) In particular, you
+must avoid Ruby's weirder string literals, such as the following:
+
+    %q!This is a valid string literal, but you must not use this syntax.!
+
+Also be wary of tricky escape sequences. If you absolutely must use unusual syntax, and it
+breaks ErbParser, consider moving the offending code into a class or module external to
+the ERB template.
+
+Nonetheless, the library *does* account for and allow the following string literal
+formats:
+
+    "string"
+    'string'
+    %q(string (string) string)
+    %Q(string (string) string)
+    %(string (string) string)
+    %q{string {string} string}
+    %Q{string {string} string}
+    %{string {string} string}
+
+This parser is *not* hardened against malicious input. But then, you shouldn't be
+accepting ERB as untrusted input anyway, because ERB allows arbitrary code execution.
+
+## What does ErbParser do with invalid ERB or Ruby code?
+
+If you pass code containing a syntax error, the parsing behavior is undefined. You may get
+an exception, or you may just get nonsensical results. It depends on the type of the
+syntax error.