Skip to content

Commit

Permalink
Index IO and ObjectHash to prevent excessive pdf sizes
Browse files Browse the repository at this point in the history
- use object_id for filename or streams as index keys
  • Loading branch information
jonathangreenberg committed Oct 31, 2012
1 parent 1f38108 commit 419eca8
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 40 deletions.
67 changes: 57 additions & 10 deletions lib/prawn/core/object_store.rb
Original file line number Diff line number Diff line change
Expand Up @@ -143,23 +143,30 @@ def object_id_for_page(k)
#
def import_page(input, page_num)
@loaded_objects = {}

io = if input.respond_to?(:seek) && input.respond_to?(:read)
input
elsif File.file?(input.to_s)
StringIO.new(File.binread(input.to_s))
else
raise ArgumentError, "input must be an IO-like object or a filename"
if template_id = indexed_template(input, page_num)
return template_id
end

# unless File.file?(filename)
io = if input.respond_to?(:seek) && input.respond_to?(:read)
input
elsif File.file?(input.to_s)
StringIO.new(File.binread(input.to_s))
else
raise ArgumentError, "input must be an IO-like object or a filename"
end

# unless File.file?(filename)
# raise ArgumentError, "#{filename} does not exist"
# end

hash = PDF::Reader::ObjectHash.new(io)
hash = indexed_hash(input, io)
ref = hash.page_references[page_num - 1]

ref.nil? ? nil : load_object_graph(hash, ref).identifier
if ref.nil?
nil
else
index_template(input, page_num, load_object_graph(hash, ref).identifier)
end

rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError
msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug."
Expand All @@ -171,6 +178,46 @@ def import_page(input, page_num)

private

# Lazily created lookup of template pages that were already imported.
# Entries are keyed by the input's indexing key and hold a hash of
# page number => object graph identifier, so a page that was loaded
# once can be reused instead of being loaded again.
def template_index
  return @template_index if defined?(@template_index) && @template_index

  @template_index = {}
end

# Lazily created lookup of parsed template object hashes, keyed by the
# input's indexing key. Keeping the parsed hash around means a pdf does
# not have to be parsed again when several of its pages are used as
# page templates.
def hash_index
  return @hash_index if defined?(@hash_index) && @hash_index

  @hash_index = {}
end

# Looks up the object graph identifier stored for +page_number+ of the
# template +input+.
#
# Returns the stored identifier, or nil when either the input or the
# page has not been indexed yet.
def indexed_template(input, page_number)
  pages = template_index[indexing_key(input)]
  # nothing indexed for this input: hand back the falsy lookup result
  return pages unless pages

  pages[page_number]
end

# Records +id+ as the object graph identifier for +page_number+ of the
# template +input+.
#
# An identifier that is already stored for that page is kept; the
# method returns whichever identifier ends up in the index.
def index_template(input, page_number, id)
  key = indexing_key(input)
  template_index[key] = {} unless template_index[key]

  pages = template_index[key]
  pages[page_number] = id unless pages[page_number]
  pages[page_number]
end

# Returns the parsed object hash for the template +input+.
#
# When the input has been indexed before, the stored object hash is
# returned and +io+ is not read. Otherwise +io+ is parsed into a new
# PDF::Reader::ObjectHash, which is stored in the index and returned.
def indexed_hash(input, io)
  key = indexing_key(input)
  cached = hash_index[key]
  return cached if cached

  hash_index[key] = PDF::Reader::ObjectHash.new(io)
end

# Computes the key under which +input+ is stored in the indexes.
#
# The key is the input's object_id, which works for a filename string
# as well as for an IO stream. As a consequence an indexed entry is
# only found again when the very same object is passed to later page
# template calls.
def indexing_key(input)
  key = input.object_id
  key
end

# returns a nested array of object IDs for all pages in this object store.
#
def get_page_objects(obj)
Expand Down
25 changes: 14 additions & 11 deletions lib/prawn/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ def page
#
# pdf.start_new_page(:template => multipage_template.pdf, :template_page => 2)
#
# Note: templates are indexed by the object_id of the filename or stream that
# is passed in. If you reuse the same template multiple times, be sure to pass
# the same instance each time; this makes more efficient use of resources and
# produces smaller rendered pdfs.
def start_new_page(options = {})
if last_page = state.page
last_page_size = last_page.size
Expand Down Expand Up @@ -539,15 +542,15 @@ def @bounding_box.move_past_bottom
# through existing pages after they are created.
#
# Parameters are:
#
# <tt>string</tt>:: Template string for page number wording.
#
# <tt>string</tt>:: Template string for page number wording.
# Should include '<page>' and, optionally, '<total>'.
# <tt>options</tt>:: A hash for page numbering and text box options.
# <tt>:page_filter</tt>:: A filter to specify which pages to place page numbers on.
# <tt>:page_filter</tt>:: A filter to specify which pages to place page numbers on.
# Refer to the method 'page_match?'
# <tt>:start_count_at</tt>:: The starting count to increment pages from.
# <tt>:total_pages</tt>:: If provided, will replace <total> with the value given.
# Useful to override the total number of pages when using
# Useful to override the total number of pages when using
# the start_count_at option.
# <tt>:color</tt>:: Text fill color.
#
Expand All @@ -558,7 +561,7 @@ def @bounding_box.move_past_bottom
# five.
#
# Prawn::Document.generate("page_with_numbering.pdf") do
# number_pages "<page> in a total of <total>",
# number_pages "<page> in a total of <total>",
# {:start_count_at => 5,
# :page_filter => lambda{ |pg| pg != 1 },
# :at => [bounds.right - 50, 0],
Expand All @@ -578,7 +581,7 @@ def number_pages(string, options={})
txtcolor = opts.delete(:color)
# An explicit height so that we can draw page numbers in the margins
opts[:height] = 50 unless opts.has_key?(:height)

start_count = false
pseudopage = 0
(1..page_count).each do |p|
Expand All @@ -589,7 +592,7 @@ def number_pages(string, options={})
else
start_count_at.to_i
end
end
end
if page_match?(page_filter, p)
go_to_page(p)
# have to use fill_color here otherwise text reverts back to default fill color
Expand All @@ -598,21 +601,21 @@ def number_pages(string, options={})
str = string.gsub("<page>","#{pseudopage}").gsub("<total>","#{total_pages}")
text_box str, opts
start_count = true # increment page count as soon as first match found
end
end
pseudopage += 1 if start_count
end
end

# Provides a way to execute a block of code repeatedly based on a
# page_filter.
# page_filter.
#
# Available page filters are:
# :all repeats on every page
# :odd repeats on odd pages
# :even repeats on even pages
# some_array repeats on every page listed in the array
# some_range repeats on every page included in the range
# some_lambda yields page number and repeats for true return values
# some_lambda yields page number and repeats for true return values
def page_match?(page_filter, page_number)
case page_filter
when :all
Expand All @@ -626,7 +629,7 @@ def page_match?(page_filter, page_number)
when Proc
page_filter.call(page_number)
end
end
end


# Returns true if content streams will be compressed before rendering,
Expand Down
49 changes: 30 additions & 19 deletions spec/template_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
:bottom => 36 }



end

it "should not add an extra restore_graphics_state operator to the end of any content stream" do
Expand All @@ -57,7 +57,7 @@
data.include?("QQ").should == false
end
end

it "should have a single page object if importing a single page template" do
filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"

Expand Down Expand Up @@ -161,34 +161,34 @@
str = @pdf.render
str[0,4].should == "%PDF"
end

context "with the template as a stream" do
  it "should correctly import a template file from a stream" do
    filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
    # Read the template once and hand it over as an IO-like object
    # instead of a filename.
    io = StringIO.new(File.read(filename))
    @pdf = Prawn::Document.new(:template => io)
    str = @pdf.render
    # The rendered output is expected to begin with the "%PDF" marker.
    str[0,4].should == "%PDF"
  end
end

end

describe "Document#start_new_page with :template option" do
filename = "#{Prawn::BASEDIR}/spec/data/curves.pdf"

it "should set the imported page's parent to the document pages catalog" do
  @pdf = Prawn::Document.new
  @pdf.start_new_page(:template => filename)

  # The :Parent entry of the imported page's dictionary is expected to
  # be the pages object of this document's store.
  imported_parent = @pdf.state.page.dictionary.data[:Parent]
  imported_parent.should == @pdf.state.store.pages
end

it "should set start the Y cursor at the top of the page" do
  @pdf = Prawn::Document.new
  @pdf.start_new_page(:template => filename)

  # Only checks that a y position has been set after the template page
  # was started.
  cursor_missing = (@pdf.y == nil)
  cursor_missing.should == false
end

it "should respect margins set by Prawn" do
@pdf = Prawn::Document.new(:margin => 0)
@pdf.start_new_page(:template => filename)
Expand All @@ -209,7 +209,7 @@
:top => 36,
:bottom => 36 }
end

it "should not add an extra restore_graphics_state operator to the end of any content stream" do
@pdf = Prawn::Document.new
@pdf.start_new_page(:template => filename)
Expand All @@ -223,7 +223,7 @@
data.include?("QQ").should == false
end
end

it "should have two content streams if importing a single page template" do
filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
@pdf = Prawn::Document.new()
Expand All @@ -234,7 +234,7 @@
template_page = hash[pages[1]]
template_page[:Contents].size.should == 2
end

it "should have balance q/Q operators on all content streams" do
filename = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"

Expand All @@ -251,7 +251,7 @@
data.scan("Q").size.should == 1
end
end

it "should allow text to be added to a single page template" do

@pdf = Prawn::Document.new()
Expand All @@ -262,7 +262,7 @@
text = PDF::Inspector::Text.analyze(@pdf.render)
text.strings.first.should == "Adding some text"
end

it "should allow PDFs with page resources behind an indirect object to be used as templates" do
filename = "#{Prawn::DATADIR}/pdfs/resources_as_indirect_object.pdf"

Expand All @@ -275,7 +275,7 @@
all_text = text.strings.join
all_text.include?("Adding some text").should == true
end

it "should correctly add a TTF font to a template that has existing fonts" do
filename = "#{Prawn::DATADIR}/pdfs/contains_ttf_font.pdf"
@pdf = Prawn::Document.new()
Expand All @@ -293,20 +293,31 @@
fonts = resources[:Font]
fonts.size.should == 2
end


it "indexes template pages when used multiple times" do
  # Use a dedicated local name here: assigning to +filename+ inside this
  # block would overwrite the describe-level +filename+ that the other
  # examples share through the closure.
  multipage_template = "#{Prawn::DATADIR}/pdfs/multipage_template.pdf"

  # The default template page imported three times, always passing the
  # same filename instance.
  @repeated_pdf = Prawn::Document.new()
  3.times { @repeated_pdf.start_new_page(:template => multipage_template) }
  repeated_hash = PDF::Reader::ObjectHash.new(StringIO.new(@repeated_pdf.render))

  # Three different pages of the same template.
  @sequential_pdf = Prawn::Document.new()
  (1..3).each { |p| @sequential_pdf.start_new_page(:template => multipage_template, :template_page => p ) }
  sequential_hash = PDF::Reader::ObjectHash.new(StringIO.new(@sequential_pdf.render))

  # Reusing one page is expected to render fewer objects than importing
  # three distinct pages.
  (repeated_hash.size < sequential_hash.size).should == true
end

context "with the template as a stream" do
  it "should correctly import a template file from a stream" do
    # Use a dedicated local name here: assigning to +filename+ inside this
    # block would overwrite the describe-level +filename+ that the other
    # examples share through the closure.
    stream_source = "#{Prawn::DATADIR}/pdfs/hexagon.pdf"
    io = StringIO.new(File.read(stream_source))

    @pdf = Prawn::Document.new()
    @pdf.start_new_page(:template => io)

    str = @pdf.render
    # The rendered output is expected to begin with the "%PDF" marker.
    str[0,4].should == "%PDF"
  end
end

context "using template_page option" do
it "uses the specified page option" do
filename = "#{Prawn::DATADIR}/pdfs/multipage_template.pdf"
Expand Down

0 comments on commit 419eca8

Please sign in to comment.