diff --git a/README.md b/README.md index b9d267d4..ecdb4f5e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ page.links.external # Returns all external HTTP links found * Now `page.image` will return the first image in `page.images` if no OG or Twitter image found, instead of returning `nil`. +* You can now specify 2 different timeouts, `connection_timeout` and `read_timeout`, instead of the previous single `timeout`. + ## Changes in 3.0 * The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`. @@ -213,28 +215,36 @@ And the full scraped document is accessible from: ### Timeout & Retries -By default, MetaInspector times out after 20 seconds of waiting for a page to respond, -and it will retry fetching the page 3 times. -You can specify different values for both of these, like this: +You can specify 2 different timeouts when requesting a page: + +* `connection_timeout` sets the maximum number of seconds to wait to get a connection to the page. +* `read_timeout` sets the maximum number of seconds to wait to read the page, once connected. + +Both timeouts default to 20 seconds each. - # timeout after 5 seconds, retry 4 times - page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4) +You can also specify the number of `retries`, which defaults to 3. + +For example, this will time out after 10 seconds waiting for a connection, or after 5 seconds waiting +to read its contents, and will retry 4 times: + +```ruby +page = MetaInspector.new('www.google', :connection_timeout => 10, :read_timeout => 5, :retries => 4) +``` If MetaInspector fails to fetch the page after it has exhausted its retries, -it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your +it will raise `Faraday::TimeoutError`, which you can rescue in your application code. 
- begin - data = MetaInspector.new(url) - rescue MetaInspector::Request::TimeoutError - enqueue_for_future_fetch_attempt(url) - render_simple(url) - rescue - log_fetch_error($!) - render_simple(url) - else - render_rich(data) - end +```ruby +begin + page = MetaInspector.new(url) +rescue Faraday::TimeoutError + enqueue_for_future_fetch_attempt(url) + render_simple(url) +else + render_rich(page) +end +``` ### Redirections diff --git a/lib/meta_inspector/document.rb b/lib/meta_inspector/document.rb index b82d5797..fca059c8 100644 --- a/lib/meta_inspector/document.rb +++ b/lib/meta_inspector/document.rb @@ -1,13 +1,15 @@ module MetaInspector # A MetaInspector::Document knows about its URL and its contents class Document - attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers + attr_reader :html_content_only, :allow_redirections, :warn_level, :headers include MetaInspector::Exceptionable # Initializes a new instance of MetaInspector::Document, setting the URL to the one given # Options: - # => timeout: defaults to 20 seconds + # => connection_timeout: defaults to 20 seconds + # => read_timeout: defaults to 20 seconds + # => retries: defaults to 3 times # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false # => allow_redirections: when true, follow HTTP redirects. 
Defaults to true # => document: the html of the url as a string @@ -15,7 +17,9 @@ class Document # => headers: object containing custom headers for the request def initialize(initial_url, options = {}) options = defaults.merge(options) - @timeout = options[:timeout] + @connection_timeout = options[:connection_timeout] + @read_timeout = options[:read_timeout] + @retries = options[:retries] @html_content_only = options[:html_content_only] @allow_redirections = options[:allow_redirections] @document = options[:document] @@ -24,7 +28,9 @@ def initialize(initial_url, options = {}) @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level) @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log) @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections, - timeout: @timeout, + connection_timeout: @connection_timeout, + read_timeout: @read_timeout, + retries: @retries, exception_log: @exception_log, headers: @headers) unless @document @parser = MetaInspector::Parser.new(self, exception_log: @exception_log) diff --git a/lib/meta_inspector/request.rb b/lib/meta_inspector/request.rb index 3a079822..25edd1b1 100644 --- a/lib/meta_inspector/request.rb +++ b/lib/meta_inspector/request.rb @@ -1,7 +1,6 @@ require 'faraday' require 'faraday_middleware' require 'faraday-cookie_jar' -require 'timeout' module MetaInspector @@ -13,7 +12,8 @@ def initialize(initial_url, options = {}) @url = initial_url @allow_redirections = options[:allow_redirections] - @timeout = options[:timeout] + @connection_timeout = options[:connection_timeout] + @read_timeout = options[:read_timeout] @retries = options[:retries] @exception_log = options[:exception_log] @headers = options[:headers] @@ -35,11 +35,8 @@ def content_type def response request_count ||= 0 request_count += 1 - Timeout::timeout(@timeout) { @response ||= fetch } - rescue Timeout::Error - retry unless @retries == request_count - @exception_log << 
TimeoutError.new("Attempt to fetch #{url} timed out 3 times.") - rescue Faraday::Error::ConnectionFailed, RuntimeError => e + @response ||= fetch + rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError => e @exception_log << e nil end @@ -48,21 +45,25 @@ def response def fetch session = Faraday.new(:url => url) do |faraday| + faraday.request :retry, max: @retries + if @allow_redirections faraday.use FaradayMiddleware::FollowRedirects, limit: 10 faraday.use :cookie_jar end + faraday.headers.merge!(@headers || {}) faraday.adapter :net_http end - response = session.get + + response = session.get do |req| + req.options.timeout = @read_timeout + req.options.open_timeout = @connection_timeout + end @url.url = response.env.url.to_s response end - - class TimeoutError < StandardError - end end end diff --git a/lib/meta_inspector/version.rb b/lib/meta_inspector/version.rb index f7007198..275604e6 100644 --- a/lib/meta_inspector/version.rb +++ b/lib/meta_inspector/version.rb @@ -1,3 +1,3 @@ module MetaInspector - VERSION = "4.0.0.rc2" + VERSION = "4.0.0.rc3" end diff --git a/spec/request_spec.rb b/spec/request_spec.rb index 3eaa7973..a92bf0e1 100644 --- a/spec/request_spec.rb +++ b/spec/request_spec.rb @@ -60,50 +60,6 @@ end end - describe "retrying on timeouts" do - let(:logger) { MetaInspector::ExceptionLog.new } - subject do - MetaInspector::Request.new(url('http://pagerankalert.com'), - exception_log: logger, retries: 3) - end - - context "when request never succeeds" do - before{ Timeout.stub(:timeout).and_raise(Timeout::Error) } - it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do - logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError)) - subject - end - end - - context "when request succeeds on third try" do - before do - Timeout.stub(:timeout).and_raise(Timeout::Error) - Timeout.stub(:timeout).and_raise(Timeout::Error) - Timeout.stub(:timeout).and_call_original - end - it 
"doesn't raise an exception" do - logger.should_not receive(:<<) - subject - end - it "succeeds as normal" do - subject.content_type.should == "text/html" - end - end - - context "when request succeeds on fourth try" do - before do - Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error) - # if it were called a fourth time, rspec would raise an error - # so this implicitely tests the correct behavior - end - it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do - logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError)) - subject - end - end - - end - private def url(initial_url)