Permalink
Browse files

Merge pull request #164 from cantino/Alex-Ikanow-website_deduplicatio…

…n_improvement

Alex ikanow website deduplication improvement
  • Loading branch information...
2 parents d2d36a3 + 0ad347c commit 5b54cded24ad5c6f96eec026655b5f63a01d6231 @cantino cantino committed Feb 13, 2014
Showing with 116 additions and 25 deletions.
  1. +72 −20 app/models/agents/website_agent.rb
  2. +44 −5 spec/models/agents/website_agent_spec.rb
@@ -6,6 +6,11 @@ module Agents
class WebsiteAgent < Agent
cannot_receive_events!
+ default_schedule "every_12h"
+
+ UNIQUENESS_LOOK_BACK = 200
+ UNIQUENESS_FACTOR = 3
+
description <<-MD
The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.
@@ -34,17 +39,15 @@ class WebsiteAgent < Agent
Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
- Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
+ Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.
+
+ Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
MD
event_description do
"Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options['extract']}"
end
- default_schedule "every_12h"
-
- UNIQUENESS_LOOK_BACK = 30
-
def working?
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
end
@@ -54,7 +57,7 @@ def default_options
'expected_update_period_in_days' => "2",
'url' => "http://xkcd.com",
'type' => "html",
- 'mode' => :on_change,
+ 'mode' => "on_change",
'extract' => {
'url' => {'css' => "#comic img", 'attr' => "src"},
'title' => {'css' => "#comic img", 'attr' => "title"}
@@ -63,31 +66,44 @@ def default_options
end
def validate_options
+ # Check for required fields
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
if !options['extract'].present? && extraction_type != "json"
errors.add(:base, "extract is required for all types except json")
end
+
+ # Check for optional fields
+ if options['mode'].present?
+ errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
+ end
+
+ if options['expected_update_period_in_days'].present?
+ errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
+ end
+
+ if options['uniqueness_look_back'].present?
+ errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
+ end
end
def check
hydra = Typhoeus::Hydra.new
log "Fetching #{options['url']}"
- request_opts = {:followlocation => true}
- if options['basic_auth'].present?
- request_opts[:userpwd] = options['basic_auth']
- end
+ request_opts = { :followlocation => true }
+ request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
request = Typhoeus::Request.new(options['url'], request_opts)
+
request.on_failure do |response|
error "Failed: #{response.inspect}"
end
+
request.on_success do |response|
doc = parse(response.body)
if extract_full_json?
- result = doc
- if store_payload? result
- log "Storing new result for '#{name}': #{result.inspect}"
- create_event :payload => result
+ if store_payload!(previous_payloads(1), doc)
+ log "Storing new result for '#{name}': #{doc.inspect}"
+ create_event :payload => doc
end
else
output = {}
@@ -116,6 +132,7 @@ def check
return
end
+ old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
options['extract'].keys.each do |name|
@@ -125,7 +142,7 @@ def check
end
end
- if store_payload? result
+ if store_payload!(old_events, result)
log "Storing new parsed result for '#{name}': #{result.inspect}"
create_event :payload => result
end
@@ -138,16 +155,43 @@ def check
private
- def store_payload? result
- !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json))
+ # This method returns true if the result should be stored as a new event.
+ # If mode is set to 'on_change', this method may return false and update an existing
+ # event to expire further in the future.
+ def store_payload!(old_events, result)
+ if !options['mode'].present?
+ return true
+ elsif options['mode'].to_s == "all"
+ return true
+ elsif options['mode'].to_s == "on_change"
+ result_json = result.to_json
+ old_events.each do |old_event|
+ if old_event.payload.to_json == result_json
+ old_event.expires_at = new_event_expiration_date
+ old_event.save!
+ return false
+ end
+ end
+ return true
+ end
+ raise "Illegal options[mode]: " + options['mode'].to_s
end
- def previous_payloads
- events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
+ def previous_payloads(num_events)
+ if options['uniqueness_look_back'].present?
+ look_back = options['uniqueness_look_back'].to_i
+ else
+ # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
+ look_back = UNIQUENESS_FACTOR * num_events
+ if look_back < UNIQUENESS_LOOK_BACK
+ look_back = UNIQUENESS_LOOK_BACK
+ end
+ end
+ events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
end
def extract_full_json?
- (!options['extract'].present? && extraction_type == "json")
+ !options['extract'].present? && extraction_type == "json"
end
def extraction_type
@@ -174,5 +218,13 @@ def parse(data)
raise "Unknown extraction type #{extraction_type}"
end
end
+
+ def is_positive_integer?(value)
+ begin
+ Integer(value) >= 0
+ rescue
+ false
+ end
+ end
end
end
@@ -15,15 +15,31 @@
'title' => {'css' => "#comic img", 'attr' => "title"}
}
}
- @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site)
+ @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2)
@checker.user = users(:bob)
@checker.save!
end
describe "#check" do
- it "should check for changes" do
+
+ it "should validate the integer fields" do
+ @checker.options['expected_update_period_in_days'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options['expected_update_period_in_days'] = "2"
+ @checker.options['uniqueness_look_back'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options['mode'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options = @site
+ end
+
+ it "should check for changes (and update Event.expires_at)" do
lambda { @checker.check }.should change { Event.count }.by(1)
+ event = Event.last
+ sleep 2
lambda { @checker.check }.should_not change { Event.count }
+ update_event = Event.last
+ update_event.expires_at.should_not == event.expires_at
end
it "should always save events when in :all mode" do
@@ -35,6 +51,30 @@
}.should change { Event.count }.by(2)
end
+ it "should take uniqueness_look_back into account during deduplication" do
+ @site['mode'] = 'all'
+ @checker.options = @site
+ @checker.check
+ @checker.check
+ event = Event.last
+ event.payload = "{}"
+ event.save
+
+ lambda {
+ @site['mode'] = 'on_change'
+ @site['uniqueness_look_back'] = 2
+ @checker.options = @site
+ @checker.check
+ }.should_not change { Event.count }
+
+ lambda {
+ @site['mode'] = 'on_change'
+ @site['uniqueness_look_back'] = 1
+ @checker.options = @site
+ @checker.check
+ }.should change { Event.count }.by(1)
+ end
+
it "should log an error if the number of results for a set of extraction patterns differs" do
@site['extract']['url']['css'] = "div"
@checker.options = @site
@@ -79,7 +119,7 @@
'expected_update_period_in_days' => 2,
'type' => "html",
'url' => "http://xkcd.com",
- 'mode' => :on_change,
+ 'mode' => "on_change",
'extract' => {
'url' => {'css' => "#topLeft a", 'attr' => "href"},
'title' => {'css' => "#topLeft a", 'text' => "true"}
@@ -216,5 +256,4 @@
end
end
end
-
-end
+end

0 comments on commit 5b54cde

Please sign in to comment.