Skip to content

Commit

Permalink
Add streaming of response bodies to disk and Mechanize::Download to s…
Browse files Browse the repository at this point in the history
…ave content to disk. Issue sparklemotion#62
  • Loading branch information
drbrain committed Oct 26, 2011
1 parent 491b2f5 commit 98b2f51
Show file tree
Hide file tree
Showing 13 changed files with 508 additions and 205 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rdoc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
* Added Mechanize#retry_change_requests to allow mechanize to retry POST and
other non-idempotent requests when you know it is safe to do so. Issue
#123
* Mechanize can now stream files directly to disk, without first loading them
into memory, by using Mechanize::Download, a pluggable parser for
downloading files.

All responses larger than Mechanize#max_file_buffer are downloaded to a
Tempfile. For backwards compatibility Mechanize::File subclasses still
load the response body into memory.
* Added Mechanize#content_encoding_hooks which allow handling of
non-standard content encodings like "agzip". Patch #125 by kitamomonga
* Added dom_class to elements and the element matcher like dom_id. Patch
Expand Down
5 changes: 4 additions & 1 deletion Manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ lib/mechanize.rb
lib/mechanize/content_type_error.rb
lib/mechanize/cookie.rb
lib/mechanize/cookie_jar.rb
lib/mechanize/download.rb
lib/mechanize/element_matcher.rb
lib/mechanize/file.rb
lib/mechanize/file_connection.rb
Expand Down Expand Up @@ -46,6 +47,7 @@ lib/mechanize/page/image.rb
lib/mechanize/page/label.rb
lib/mechanize/page/link.rb
lib/mechanize/page/meta_refresh.rb
lib/mechanize/parser.rb
lib/mechanize/pluggable_parsers.rb
lib/mechanize/redirect_limit_reached_error.rb
lib/mechanize/redirect_not_get_or_head_error.rb
Expand Down Expand Up @@ -136,6 +138,7 @@ test/test_images.rb
test/test_mechanize.rb
test/test_mechanize_cookie.rb
test/test_mechanize_cookie_jar.rb
test/test_mechanize_download.rb
test/test_mechanize_file.rb
test/test_mechanize_file_request.rb
test/test_mechanize_file_response.rb
Expand All @@ -151,6 +154,7 @@ test/test_mechanize_link.rb
test/test_mechanize_page_encoding.rb
test/test_mechanize_page_link.rb
test/test_mechanize_page_meta_refresh.rb
test/test_mechanize_parser.rb
test/test_mechanize_redirect_not_get_or_head_error.rb
test/test_mechanize_subclass.rb
test/test_mechanize_util.rb
Expand All @@ -168,7 +172,6 @@ test/test_relative_links.rb
test/test_request.rb
test/test_response_code.rb
test/test_robots.rb
test/test_save_file.rb
test/test_scheme.rb
test/test_select.rb
test/test_select_all.rb
Expand Down
25 changes: 23 additions & 2 deletions lib/mechanize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,16 @@ def max_history= length
@agent.history.max_size = length
end

# Size threshold, in bytes, read from the underlying agent. Responses larger
# than this are written to a Tempfile instead of being kept in memory. The
# default is 10240 bytes.
def max_file_buffer; @agent.max_file_buffer end

# Sets the agent's max_file_buffer to +bytes+.
def max_file_buffer=(bytes); @agent.max_file_buffer = bytes end

def log=(l); Mechanize.log = l end
def log; Mechanize.log end

Expand Down Expand Up @@ -605,11 +615,20 @@ def parse uri, response, body
# Find our pluggable parser
parser_klass = @pluggable_parser.parser content_type

# parser_klass is a class, not an instance, so compare with Module#<=
# (true for Mechanize::Download itself and its subclasses).  Using
# Mechanize::Download === parser_klass would ask whether the class object
# is an *instance* of Download, which is never true, and would force every
# body to be read into memory.
unless parser_klass <= Mechanize::Download then
  # Every other parser gets the body as a String, so drain IO-like bodies
  # here.
  body = case body
         when IO, Tempfile, StringIO then
           body.read
         else
           body
         end
end

parser_klass.new uri, response, body, response.code do |parser|
parser.mech = self if parser.respond_to? :mech=

parser.watch_for_set = @watch_for_set if
@watch_for_set and parser.respond_to?(:watch_for_set=)
parser.watch_for_set = @watch_for_set if
@watch_for_set and parser.respond_to?(:watch_for_set=)
end
end

Expand Down Expand Up @@ -678,6 +697,8 @@ def add_to_history(page)
require 'mechanize/content_type_error'
require 'mechanize/cookie'
require 'mechanize/cookie_jar'
require 'mechanize/parser'
require 'mechanize/download'
require 'mechanize/file'
require 'mechanize/file_connection'
require 'mechanize/file_request'
Expand Down
53 changes: 53 additions & 0 deletions lib/mechanize/download.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
##
# Download is a pluggable parser for downloading files without loading them
# into memory first. You may subclass this class to handle content types you
# do not wish to load into memory first.
#
# See Mechanize::PluggableParser for instructions on using this class.

class Mechanize::Download

  # NOTE(review): fill_header, extract_filename and find_free_name are not
  # defined in this class; presumably they come from Mechanize::Parser --
  # confirm against that module.
  include Mechanize::Parser

  ##
  # Accessor for the IO-like that contains the body

  attr_reader :body_io

  alias content body_io

  ##
  # Creates a new download retrieved from the given +uri+ and +response+
  # object.  The +body_io+ is an IO-like containing the HTTP response body and
  # +code+ is the HTTP status.
  #
  # Yields the new download to the block, if one is given, after the headers
  # and filename have been set up.

  def initialize uri = nil, response = nil, body_io = nil, code = nil
    @uri     = uri
    @body_io = body_io
    @code    = code

    fill_header response
    extract_filename

    yield self if block_given?
  end

  ##
  # Saves a copy of the body_io to +filename+.  The name actually written to
  # is whatever find_free_name returns for +filename+.
  #
  # When the body_io responds to +path+ the file at that path is copied;
  # otherwise the body_io is read from its current position to EOF in 16384
  # byte chunks.

  def save filename = nil
    filename = find_free_name filename

    if @body_io.respond_to? :path then
      FileUtils.cp @body_io.path, filename
    else
      # Use ::File.open rather than Kernel#open: Kernel#open treats a name
      # starting with "|" as a command to run, and the filename may be
      # derived from the response.  The leading :: keeps the constant from
      # ever resolving to Mechanize::File.
      ::File.open filename, 'wb' do |io|
        until @body_io.eof? do
          io.write @body_io.read 16384
        end
      end
    end
  end

end

102 changes: 10 additions & 92 deletions lib/mechanize/file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,130 +16,48 @@

class Mechanize::File

extend Forwardable

##
# The URI this file was retrieved from

attr_accessor :uri

##
# The Net::HTTPResponse for this file

attr_accessor :response
include Mechanize::Parser

##
# The HTTP response body, the raw file contents

attr_accessor :body

##
# The HTTP response code

attr_accessor :code

##
# The filename for this file based on the content-disposition of the
# response or the basename of the URL

attr_accessor :filename

##
# Alias for the HTTP response object

alias :header :response

##
# :method: [](header)
#
# Access HTTP +header+ by name

def_delegator :header, :[], :[]

##
# :method: []=(header, value)
#
# Set HTTP +header+ to +value+

def_delegator :header, :[]=, :[]=

##
# :method: key?(header)
#
# Is the named +header+ present?

def_delegator :header, :key?, :key?

##
# :method: each
#
# Enumerate HTTP headers

def_delegator :header, :each, :each

##
# :method: each
#
# Enumerate HTTP headers in capitalized (canonical) form

def_delegator :header, :canonical_each, :canonical_each

alias :content :body
alias content body

##
# Creates a new file retrieved from the given +uri+ and +response+ object.
# The +body+ is the HTTP response body and +code+ is the HTTP status.

def initialize(uri=nil, response=nil, body=nil, code=nil)
@uri = uri
def initialize uri = nil, response = nil, body = nil, code = nil
@uri = uri
@body = body
@code = code
@response = Mechanize::Headers.new

# Copy the headers in to a hash to prevent memory leaks
if response
response.each { |k,v|
@response[k] = v
}
end

@filename = 'index.html'

# Set the filename
if disposition = @response['content-disposition']
disposition.split(/;\s*/).each do |pair|
k,v = pair.split(/=/, 2)
@filename = v if k && k.downcase == 'filename'
end
else
if @uri
@filename = @uri.path.split(/\//).last || 'index.html'
@filename << ".html" unless @filename =~ /\./
end
end
fill_header response
extract_filename

yield self if block_given?
end

##
# Use this method to save the content of this object to +filename+

def save_as(filename = nil)
if filename.nil?
filename = @filename
number = 1
while(File.exists?(filename))
filename = "#{@filename}.#{number}"
number += 1
end
end
def save filename = nil
filename = find_free_name filename

open filename, "wb" do |f|
open filename, 'wb' do |f|
f.write body
end
end

alias :save :save_as
alias save_as save

end

Loading

0 comments on commit 98b2f51

Please sign in to comment.