More robust meta tag detection.

Meta tag detection used REXML. While this gives all the power of parsing and working with XML, it is very strict about the input it accepts. On bad markup it simply threw in the towel and gave up on parsing. As the detection of meta tags should not depend on bad (HTML) markup much later in the file, the parsing has now been ported to use regular expressions.
juggy · Jan 10, 2011 · c7149a2 · c7149a2
1 parent 5cf15d6
commit c7149a2
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 14 deletions.
diff --git a/lib/pdfkit/pdfkit.rb b/lib/pdfkit/pdfkit.rb
@@ -79,23 +79,18 @@ def to_file(path)
 
   protected
 
-    def find_options_in_meta(body)
-      pdfkit_meta_tags(body).inject({}) do |found, tag|
-        name = tag.attributes["name"].sub(/^#{PDFKit.configuration.meta_tag_prefix}/, '').to_sym
-        found.merge(name => tag.attributes["content"])
+    def find_options_in_meta(content)
+      # Read file if content is a File
+      content = content.read if content.is_a?(File)
+
+      found = {}
+      content.scan(/<meta [^>]*>/) do |meta|
+        puts PDFKit.configuration.meta_tag_prefix
+        name = meta.scan(/name=["']#{PDFKit.configuration.meta_tag_prefix}([^"']*)/)[0][0]
+        found[name] = meta.scan(/content=["']([^"']*)/)[0][0]
       end
-    end
 
-    def pdfkit_meta_tags(body)
-      require 'rexml/document'
-      xml_body = REXML::Document.new(body)
-      found = []
-      xml_body.elements.each("html/head/meta") do |tag|
-        found << tag if tag.attributes['name'].to_s =~ /^#{PDFKit.configuration.meta_tag_prefix}/
-      end
       found
-    rescue # rexml random crash on invalid xml
-      []
     end
 
     def style_tag_for(stylesheet)

diff --git a/spec/pdfkit_spec.rb b/spec/pdfkit_spec.rb
@@ -105,6 +105,22 @@
       pdfkit.command[pdfkit.command.index('"--page-size"') + 1].should == '"Legal"'
       pdfkit.command[pdfkit.command.index('"--orientation"') + 1].should == '"Landscape"'
     end
+
+    it "should detect special pdfkit meta tags despite bad markup" do
+      body = %{
+        <html>
+          <head>
+            <meta name="pdfkit-page_size" content="Legal"/>
+            <meta name="pdfkit-orientation" content="Landscape"/>
+          </head>
+          <br>
+        </html>
+      }
+      pdfkit = PDFKit.new(body)
+      pdfkit.command[pdfkit.command.index('"--page-size"') + 1].should == '"Legal"'
+      pdfkit.command[pdfkit.command.index('"--orientation"') + 1].should == '"Landscape"'
+    end
+
   end
 
   context "#to_pdf" do