Refactors timeouts and retries
Instead of implementing our own timeout and retry mechanism, we rely
on Faraday for that.

We just pass `connection_timeout`, `read_timeout`, and `retries` to Faraday.
jaimeiniesta committed Nov 20, 2014
1 parent dcccc82 commit 5d44433
Showing 5 changed files with 50 additions and 77 deletions.
44 changes: 27 additions & 17 deletions README.md
@@ -25,6 +25,8 @@ page.links.external # Returns all external HTTP links found

* Now `page.image` will return the first image in `page.images` if no OG or Twitter image is found, instead of returning `nil`.

* You can now specify 2 different timeouts, `connection_timeout` and `read_timeout`, instead of the previous single `timeout`.

## Changes in 3.0

* The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
@@ -213,28 +215,36 @@ And the full scraped document is accessible from:

### Timeout & Retries

By default, MetaInspector times out after 20 seconds of waiting for a page to respond,
and it will retry fetching the page 3 times.
You can specify different values for both of these, like this:
You can specify 2 different timeouts when requesting a page:

* `connection_timeout` sets the maximum number of seconds to wait to get a connection to the page.
* `read_timeout` sets the maximum number of seconds to wait to read the page, once connected.

Both timeouts default to 20 seconds.

# timeout after 5 seconds, retry 4 times
page = MetaInspector.new('sitevalidator.com', :timeout => 5, :retries => 4)
You can also specify the number of `retries`, which defaults to 3.

For example, this will time out after 10 seconds waiting for a connection, or after 5 seconds waiting
to read its contents, and will retry 4 times:

```ruby
page = MetaInspector.new('www.google.com', :connection_timeout => 10, :read_timeout => 5, :retries => 4)
```

If MetaInspector fails to fetch the page after it has exhausted its retries,
it will raise `MetaInspector::Request::TimeoutError`, which you can rescue in your
it will raise `Faraday::TimeoutError`, which you can rescue in your
application code.

begin
data = MetaInspector.new(url)
rescue MetaInspector::Request::TimeoutError
enqueue_for_future_fetch_attempt(url)
render_simple(url)
rescue
log_fetch_error($!)
render_simple(url)
else
render_rich(data)
end
```ruby
begin
page = MetaInspector.new(url)
rescue Faraday::TimeoutError
enqueue_for_future_fetch_attempt(url)
render_simple(url)
else
render_rich(page)
end
```
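The request layer (shown in `lib/meta_inspector/request.rb` below) rescues `Faraday::Error::ConnectionFailed` alongside `Faraday::TimeoutError`, so application code can handle both failure modes in one place. A sketch extending the example above, using the same illustrative helper methods:

```ruby
begin
  page = MetaInspector.new(url)
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed
  # Timeouts and refused/dropped connections are usually transient,
  # so queue the URL for another attempt later.
  enqueue_for_future_fetch_attempt(url)
  render_simple(url)
else
  render_rich(page)
end
```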

### Redirections

Expand Down
14 changes: 10 additions & 4 deletions lib/meta_inspector/document.rb
@@ -1,21 +1,25 @@
module MetaInspector
# A MetaInspector::Document knows about its URL and its contents
class Document
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
attr_reader :html_content_only, :allow_redirections, :warn_level, :headers

include MetaInspector::Exceptionable

# Initializes a new instance of MetaInspector::Document, setting the URL to the one given
# Options:
# => timeout: defaults to 20 seconds
# => connection_timeout: defaults to 20 seconds
# => read_timeout: defaults to 20 seconds
# => retries: defaults to 3 times
# => html_content_only: whether to raise an exception if the request content-type is not text/html. Defaults to false
# => allow_redirections: when true, follow HTTP redirects. Defaults to true
# => document: the html of the url as a string
# => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
# => headers: object containing custom headers for the request
def initialize(initial_url, options = {})
options = defaults.merge(options)
@timeout = options[:timeout]
@connection_timeout = options[:connection_timeout]
@read_timeout = options[:read_timeout]
@retries = options[:retries]
@html_content_only = options[:html_content_only]
@allow_redirections = options[:allow_redirections]
@document = options[:document]
@@ -24,7 +28,9 @@ def initialize(initial_url, options = {})
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
timeout: @timeout,
connection_timeout: @connection_timeout,
read_timeout: @read_timeout,
retries: @retries,
exception_log: @exception_log,
headers: @headers) unless @document
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
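As a usage sketch (URL and values illustrative, not part of the diff), the new options pass straight through `MetaInspector::Document`:

```ruby
require 'meta_inspector'

# connection_timeout, read_timeout and retries are the options this
# commit introduces; warn_level and headers are pre-existing options
# documented in the comment block above.
doc = MetaInspector::Document.new('http://example.com',
                                  connection_timeout: 10,
                                  read_timeout:       5,
                                  retries:            4,
                                  warn_level:         :raise,
                                  headers:            { 'User-Agent' => 'MetaInspector' })
```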
23 changes: 12 additions & 11 deletions lib/meta_inspector/request.rb
@@ -1,7 +1,6 @@
require 'faraday'
require 'faraday_middleware'
require 'faraday-cookie_jar'
require 'timeout'

module MetaInspector

@@ -13,7 +12,8 @@ def initialize(initial_url, options = {})
@url = initial_url

@allow_redirections = options[:allow_redirections]
@timeout = options[:timeout]
@connection_timeout = options[:connection_timeout]
@read_timeout = options[:read_timeout]
@retries = options[:retries]
@exception_log = options[:exception_log]
@headers = options[:headers]
@@ -35,11 +35,8 @@ def content_type
def response
request_count ||= 0
request_count += 1
Timeout::timeout(@timeout) { @response ||= fetch }
rescue Timeout::Error
retry unless @retries == request_count
@exception_log << TimeoutError.new("Attempt to fetch #{url} timed out 3 times.")
rescue Faraday::Error::ConnectionFailed, RuntimeError => e
@response ||= fetch
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed, RuntimeError => e
@exception_log << e
nil
end
@@ -48,21 +45,25 @@ def response

def fetch
session = Faraday.new(:url => url) do |faraday|
faraday.request :retry, max: @retries

if @allow_redirections
faraday.use FaradayMiddleware::FollowRedirects, limit: 10
faraday.use :cookie_jar
end

faraday.headers.merge!(@headers || {})
faraday.adapter :net_http
end
response = session.get

response = session.get do |req|
req.options.open_timeout = @connection_timeout
req.options.timeout = @read_timeout
end

@url.url = response.env.url.to_s

response
end

class TimeoutError < StandardError
end
end
end
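In Faraday, `open_timeout` bounds how long to wait while opening the connection, while `timeout` bounds the request as a whole, including reading the body; so `connection_timeout` should map to `open_timeout` and `read_timeout` to `timeout` (corrected above). A standalone sketch of the Faraday behaviour `fetch` now relies on, with illustrative URL and values:

```ruby
require 'faraday'

# The :retry middleware re-issues the request up to `max` additional
# times on low-level failures, replacing the hand-rolled retry loop
# that used Timeout::timeout.
conn = Faraday.new(url: 'http://example.com') do |faraday|
  faraday.request :retry, max: 3
  faraday.adapter :net_http
end

response = conn.get do |req|
  req.options.open_timeout = 20 # seconds allowed to establish the connection
  req.options.timeout      = 20 # seconds allowed for the whole request
end

puts response.status
```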
2 changes: 1 addition & 1 deletion lib/meta_inspector/version.rb
@@ -1,3 +1,3 @@
module MetaInspector
VERSION = "4.0.0.rc2"
VERSION = "4.0.0.rc3"
end
44 changes: 0 additions & 44 deletions spec/request_spec.rb
@@ -60,50 +60,6 @@
end
end

describe "retrying on timeouts" do
let(:logger) { MetaInspector::ExceptionLog.new }
subject do
MetaInspector::Request.new(url('http://pagerankalert.com'),
exception_log: logger, retries: 3)
end

context "when request never succeeds" do
before{ Timeout.stub(:timeout).and_raise(Timeout::Error) }
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
subject
end
end

context "when request succeeds on third try" do
before do
Timeout.stub(:timeout).and_raise(Timeout::Error)
Timeout.stub(:timeout).and_raise(Timeout::Error)
Timeout.stub(:timeout).and_call_original
end
it "doesn't raise an exception" do
logger.should_not receive(:<<)
subject
end
it "succeeds as normal" do
subject.content_type.should == "text/html"
end
end

context "when request succeeds on fourth try" do
before do
Timeout.stub(:timeout).exactly(3).times.and_raise(Timeout::Error)
# if it were called a fourth time, rspec would raise an error
# so this implicitly tests the correct behavior
end
it "swallows all the timeout errors and raises MetaInspector::Request::TimeoutError" do
logger.should receive(:<<).with(an_instance_of(MetaInspector::Request::TimeoutError))
subject
end
end

end
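With retries and timeouts now delegated to Faraday, an equivalent spec could stub Faraday instead of `Timeout`. A sketch in the same old-style RSpec syntax this suite uses, not part of the commit:

```ruby
describe "timing out via Faraday" do
  let(:logger) { MetaInspector::ExceptionLog.new }

  it "logs a Faraday::TimeoutError when the request keeps timing out" do
    Faraday::Connection.any_instance.stub(:get)
                       .and_raise(Faraday::TimeoutError.new("timeout"))
    logger.should receive(:<<).with(an_instance_of(Faraday::TimeoutError))

    MetaInspector::Request.new(url('http://pagerankalert.com'),
                               exception_log: logger, retries: 3)
  end
end
```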

private

def url(initial_url)
