Skip to content

Commit

Permalink
Merge pull request #1564 from cantino/rss_agent-feedjira
Browse files Browse the repository at this point in the history
RssAgent: Migrate from FeedNormalizer to Feedjira
  • Loading branch information
knu committed Oct 6, 2016
2 parents cd5b209 + d63f714 commit 5596dde
Show file tree
Hide file tree
Showing 6 changed files with 491 additions and 40 deletions.
3 changes: 2 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ gem 'devise','~> 4.2.0'
gem 'em-http-request', '~> 1.1.2'
gem 'faraday', '~> 0.9.0'
gem 'faraday_middleware', github: 'lostisland/faraday_middleware', branch: 'master' # '>= 0.10.1'
gem 'feed-normalizer'
gem 'feedjira', '~> 2.0'
gem 'font-awesome-sass', '~> 4.3.2'
gem 'foreman', '~> 0.63.0'
gem 'geokit', '~> 1.8.4'
Expand All @@ -106,6 +106,7 @@ gem 'jsonpathv2', '~> 0.0.8'
gem 'kaminari', github: "amatsuda/kaminari", branch: '0-17-stable'
gem 'kramdown', '~> 1.3.3'
gem 'liquid', '~> 3.0.3'
gem 'loofah', '~> 2.0'
gem 'mini_magick'
gem 'multi_xml'
gem 'nokogiri', '1.6.8'
Expand Down
14 changes: 8 additions & 6 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,11 @@ GEM
extlib (0.9.16)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
feed-normalizer (1.5.2)
hpricot (>= 0.6)
simple-rss (>= 1.1)
feedjira (2.0.0)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
loofah (~> 2.0)
sax-machine (~> 1.0)
ffi (1.9.10)
font-awesome-sass (4.3.2.1)
sass (~> 3.2)
Expand Down Expand Up @@ -269,7 +271,6 @@ GEM
haversine (0.3.0)
hipchat (1.2.0)
httparty
hpricot (0.8.6)
httmultiparty (0.3.16)
httparty (>= 0.7.3)
mimemagic
Expand Down Expand Up @@ -486,6 +487,7 @@ GEM
sprockets (>= 2.8, < 4.0)
sprockets-rails (>= 2.0, < 4.0)
tilt (>= 1.1, < 3)
sax-machine (1.3.2)
select2-rails (3.5.9.3)
thor (~> 0.14)
shellany (0.0.1)
Expand All @@ -496,7 +498,6 @@ GEM
faraday (>= 0.9.0.rc5)
jwt (>= 0.1.5)
multi_json (>= 1.0.0)
simple-rss (1.3.1)
simple_oauth (0.3.1)
simplecov (0.9.2)
docile (~> 1.1.0)
Expand Down Expand Up @@ -619,7 +620,7 @@ DEPENDENCIES
evernote_oauth
faraday (~> 0.9.0)
faraday_middleware!
feed-normalizer
feedjira (~> 2.0)
ffi (>= 1.9.4)
font-awesome-sass (~> 4.3.2)
forecast_io (~> 2.0.0)
Expand All @@ -644,6 +645,7 @@ DEPENDENCIES
letter_opener_web (~> 1.3.0)
liquid (~> 3.0.3)
listen (~> 3.0.5)
loofah (~> 2.0)
mini_magick
mqtt
multi_xml
Expand Down
130 changes: 101 additions & 29 deletions app/models/agents/rss_agent.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
require 'rss'
require 'feed-normalizer'

module Agents
class RssAgent < Agent
include WebRequestConcern
Expand All @@ -9,21 +6,23 @@ class RssAgent < Agent
can_dry_run!
default_schedule "every_1d"

gem_dependency_check { defined?(Feedjira::Feed) }

DEFAULT_EVENTS_ORDER = [['{{date_published}}', 'time'], ['{{last_updated}}', 'time']]

description do
<<-MD
The RSS Agent consumes RSS feeds and emits events when they change.
This Agent is fairly simple, using [feed-normalizer](https://github.com/aasmith/feed-normalizer) as a base. For complex feeds
with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).
This agent, using [Feedjira](https://github.com/feedjira/feedjira) as a base, can parse various types of RSS and Atom feeds and has some special handlers for FeedBurner, iTunes RSS, and so on. However, supported fields are limited by its general and abstract nature. For complex feeds with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).
If you want to *output* an RSS feed, use the DataOutputAgent.
Options:
* `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates).
* `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s `clean!` method to clean up HTML in the feed. Set to `true` to use.
* `include_feed_info` - Set to `true` to include feed information in each event.
* `clean` - Set to `true` to sanitize `description` and `content` as HTML fragments, removing unknown/unsafe elements and attributes.
* `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
* `headers` - When present, it should be a hash of headers to send with the request.
* `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
Expand Down Expand Up @@ -53,18 +52,46 @@ def default_options
Events look like:
{
"feed": {
"id": "...",
"type": "atom",
"generator": "...",
"url": "http://example.com/",
"links": [
{ "href": "http://example.com/", "rel": "alternate", "type": "text/html" },
{ "href": "http://example.com/index.atom", "rel": "self", "type": "application/atom+xml" }
],
"title": "Some site title",
"description": "Some site description",
"copyright": "...",
"icon": "http://example.com/icon.png",
"authors": [ "..." ],
"date_published": "2014-09-11T01:30:00-07:00",
"last_updated": "2014-09-11T01:30:00-07:00"
},
"id": "829f845279611d7925146725317b868d",
"date_published": "2014-09-11 01:30:00 -0700",
"last_updated": "Thu, 11 Sep 2014 01:30:00 -0700",
"url": "http://example.com/...",
"urls": [ "http://example.com/..." ],
"links": [
{ "href": "http://example.com/...", "rel": "alternate" },
],
"title": "Some title",
"description": "Some description",
"content": "Some content",
"title": "Some title",
"authors": [ ... ],
"categories": [ ... ]
"authors": [ "Some Author <email@address>" ],
"categories": [ "..." ],
"enclosure": {
"url" => "http://example.com/file.mp3", "type" => "audio/mpeg", "length" => "123456789"
},
"date_published": "2014-09-11T01:30:00-0700",
"last_updated": "2014-09-11T01:30:00-0700"
}
Some notes:
- The `feed` key is present only if `include_feed_info` is set to true.
- Each element in `authors` is a string normalized in the format "*name* <*email*> (*url*)", where each space-separated part is optional.
- Timestamps are converted to the ISO 8601 format.
MD

def working?
Expand Down Expand Up @@ -104,8 +131,7 @@ def check_urls(urls)
begin
response = faraday.get(url)
if response.success?
feed = FeedNormalizer::FeedNormalizer.parse(response.body, loose: true)
feed.clean! if boolify(interpolated['clean'])
feed = Feedjira::Feed.parse(response.body)
new_events.concat feed_to_events(feed)
else
error "Failed to fetch #{url}: #{response.inspect}"
Expand All @@ -128,10 +154,6 @@ def check_urls(urls)
log "Fetched #{urls.to_sentence} and created #{created_event_count} event(s)."
end

def get_entry_id(entry)
  # Prefer the identifier supplied by the feed itself; when the entry
  # carries no usable id, derive a stable one by hashing its content.
  explicit_id = entry.id.presence
  explicit_id || Digest::MD5.hexdigest(entry.content)
end

def check_and_track(entry_id)
memory['seen_ids'] ||= []
if memory['seen_ids'].include?(entry_id)
Expand All @@ -143,21 +165,71 @@ def check_and_track(entry_id)
end
end

unless dependencies_missing?
require 'feedjira_extension'
end

def feed_data(feed)
  # Flatten feed-level metadata into a plain hash used as the `feed`
  # key of emitted events. The feed type is inferred from the parser
  # class Feedjira selected: "atom" for Atom feeds, "rss" otherwise.
  {
    id: feed.feed_id,
    type: feed.class.name =~ /Atom/ ? 'atom' : 'rss',
    url: feed.url,
    links: feed.links,
    title: feed.title,
    description: feed.description,
    copyright: feed.copyright,
    generator: feed.generator,
    icon: feed.icon,
    authors: feed.authors,
    date_published: feed.date_published,
    last_updated: feed.last_updated,
  }
end

def entry_data(entry)
  # Normalize a single Feedjira entry into the flat event payload hash.
  # `description` is the entry summary; `content` falls back to the
  # summary when the feed provides no separate content element. Both
  # pass through clean_fragment, which sanitizes only when the agent's
  # `clean` option is enabled.
  entry_links = entry.links
  body = entry.content || entry.summary
  {
    id: entry.id,
    url: entry.url,
    urls: entry_links.map(&:href),
    links: entry_links,
    title: entry.title,
    description: clean_fragment(entry.summary),
    content: clean_fragment(body),
    # `try` guards fields that only some parser classes expose.
    image: entry.try(:image),
    enclosure: entry.enclosure,
    authors: entry.authors,
    categories: Array(entry.try(:categories)),
    date_published: entry.date_published,
    last_updated: entry.last_updated,
  }
end

def feed_to_events(feed)
  # Convert a parsed Feedjira feed into an array of Events, one per
  # entry. When the `include_feed_info` option is true, feed-level
  # metadata is merged into every event payload under the `feed` key.
  payload_base = {}

  if boolify(interpolated['include_feed_info'])
    payload_base[:feed] = feed_data(feed)
  end

  feed.entries.map { |entry|
    Event.new(payload: payload_base.merge(entry_data(entry)))
  }
end

def clean_fragment(fragment)
  # Sanitize an HTML fragment with Loofah's :prune scrubber (removes
  # unknown/unsafe elements and attributes) when the agent's `clean`
  # option is enabled; otherwise return the fragment untouched.
  return fragment unless boolify(interpolated['clean']) && fragment.present?
  Loofah.scrub_fragment(fragment, :prune).to_s
end
end
end
Loading

0 comments on commit 5596dde

Please sign in to comment.