Skip to content

Commit

Permalink
Add a new option interpolate to WebsiteAgent
Browse files Browse the repository at this point in the history
It provides a way to format extracted data and to add meta information to each
resulting payload.
  • Loading branch information
knu committed Oct 17, 2016
1 parent 7e8931b commit 3099405
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 10 deletions.
51 changes: 41 additions & 10 deletions app/models/agents/website_agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ class WebsiteAgent < Agent
Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.
The `interpolate` option provides a way to format extracted data and to add meta information to each resulting payload. Its value must be a hash, whose key-value pairs are interpolated after extraction and added to each event payload. e.g.:
"interpolate": {
"formatted_date": "{{ extracted_date | date: '%Y-%m-%d' }}",
"site_url": "{{ _url_ }}",
"status": "{{ _response_.status }}"
}
# Liquid Templating
In Liquid templating, the following variable is available except when invoked by `data_from_event`:
Expand All @@ -126,8 +134,13 @@ class WebsiteAgent < Agent
MD

event_description do
keys = options['extract'].keys
if interpolate_hash = options['interpolate'].presence
keys |= interpolate_hash.keys
end

"Events will have the following fields:\n\n %s" % [
Utils.pretty_print(Hash[options['extract'].keys.map { |key|
Utils.pretty_print(Hash[keys.map { |key|
[key, "..."]
}])
]
Expand Down Expand Up @@ -156,6 +169,7 @@ def validate_options
errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
validate_extract_options!
validate_interpolate_options!
validate_http_success_codes!

# Check for optional fields
Expand Down Expand Up @@ -280,6 +294,15 @@ def validate_extract_options!
end
end

# Validates the optional `interpolate` option.
#
# Nothing is checked when the option is blank. Otherwise the value has to be
# a Hash whose values are all Strings; if it is not, a validation error is
# recorded on :base.
def validate_interpolate_options!
  candidate = options['interpolate'].presence
  return unless candidate

  # A well-formed value needs no further action.
  return if candidate.is_a?(Hash) && candidate.each_value.all? { |template| template.is_a?(String) }

  errors.add(:base, 'interpolate must be a hash of strings.')
end

# Entry point for a check run: hands the interpolated `url` option to
# check_urls and returns whatever check_urls returns.
def check
  target = interpolated['url']
  check_urls(target)
end
Expand Down Expand Up @@ -333,20 +356,28 @@ def handle_data(body, url, existing_payload)
extract_xml(doc)
end

num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq

if num_unique_lengths.length != 1
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end

old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
num_tuples = output.each_value.first.size

old_events = previous_payloads num_tuples

interpolate = options['interpolate'].presence

num_tuples.times do |index|
result = {}
interpolated['extract'].keys.each do |name|
interpolated['extract'].each_key do |name|
result[name] = output[name][index]
if name.to_s == 'url' && url.present?
result[name] = (url + Utils.normalize_uri(result[name])).to_s
end
end

if interpolate
result.update(interpolate_options(interpolate, result))
end

if payload_url = result['url'].presence
result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
end

if store_payload!(old_events, result)
Expand Down
16 changes: 16 additions & 0 deletions spec/models/agents/website_agent_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,22 @@
expect(event.payload['original_url']).to eq('http://xkcd.com/index')
end

# The `interpolate` templates are rendered against the extracted values, so
# they can both override an extracted field (title) and add a new one
# (summary), while untouched fields (url, hovertext) are left as extracted.
it "should interpolate after extraction" do
  templates = {
    'title' => '{{title | upcase}}',
    'summary' => '{{title}}: {{hovertext | truncate: 20}}',
  }
  @valid_options['interpolate'] = templates
  @checker.options = @valid_options
  @checker.check

  expected_fields = {
    'title' => 'EVOLVING',
    'url' => 'http://imgs.xkcd.com/comics/evolving.png',
    'hovertext' => 'Biologists play reverse Pokémon, trying to avoid putting any one team member on the front lines long enough for the experience to cause evolution.',
    'summary' => 'Evolving: Biologists play r...',
  }
  payload = Event.last.payload
  expect(payload).to include(expected_fields)
end

describe "XML" do
before do
stub_request(:any, /github_rss/).to_return(
Expand Down

0 comments on commit 3099405

Please sign in to comment.