Skip to content

Commit

Permalink
Refactor feed fetching to make it easier to test and fixes feed-fetch…
Browse files Browse the repository at this point in the history
…ing-related bug
  • Loading branch information
mawise committed Apr 28, 2023
1 parent 113f3a8 commit 70e8dca
Show file tree
Hide file tree
Showing 6 changed files with 543 additions and 114 deletions.
124 changes: 27 additions & 97 deletions app/jobs/update_feed_job.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
require File.join(Rails.root, "lib","haven_feed_entry.rb")

class UpdateFeedJob < ApplicationJob
queue_as :default

# Constants for feed entry keys
ENTRY_TITLE = "title"
FEED_TITLE = "feed"
ENTRY_LINK = "link"
ENTRY_DATE = "date"
ENTRY_CONTENT = "content"
ENTRY_GUID = "guid"
ENTRY_AUDIO = "audio"

ERROR_UNKNOWN = "Unknown Feed Type (not RSS or Atom)"
ERROR_INVALID = "Invalid Feed"

Expand Down Expand Up @@ -47,39 +41,41 @@ def truncate_feed(feed, max_count)
end

def update_feed(feed, earliest_time, latest_time)
# update feed title if not yet set
if feed.name.nil?
begin
feed.name = fetch_feed_title(feed.url)
feed.save
rescue
# TODO: retry with RSS autodiscovery?
feed.name = ERROR_INVALID
feed.save
feed.with_lock do
# update feed title if not yet set
if feed.name.nil?
begin
feed.name = fetch_feed_title(feed.url)
feed.save
rescue
# TODO: retry with RSS autodiscovery?
feed.name = ERROR_INVALID
feed.save
return
end
end
if ([ERROR_UNKNOWN, ERROR_INVALID].include? feed.name)
feed.feed_invalid!
return
end
end
if ([ERROR_UNKNOWN, ERROR_INVALID].include? feed.name)
feed.feed_invalid!
return
end
end # release lock

# fetch feed content
return unless feed.last_update.nil? or feed.last_update < 10.minutes.ago
feed.with_lock do
entries = []
begin
entries = fetch_feed_content(feed.url)
entries = HavenFeedEntry.fetch_feed_content(feed.url)
feed.fetch_succeeded!
feed.last_update = DateTime.now
feed.save
rescue
feed.fetch_failed!
end
entries.each do |entry|
title = entry[ENTRY_TITLE]
link = entry[ENTRY_LINK]
published = entry[ENTRY_DATE]
title = entry.title
link = entry.link
published = entry.date
if published.nil?
published = Time.zone.now
end
Expand All @@ -94,14 +90,14 @@ def update_feed(feed, earliest_time, latest_time)
sort_date = latest_time
end
end
content = entry[ENTRY_CONTENT]
guid = entry[ENTRY_GUID]
audio = entry[ENTRY_AUDIO]
content = entry.content
guid = entry.guid
audio = entry.audio
matching_entry = feed.feed_entries.find_by(guid: guid)
record_data = {title: title, link: link, published: published, sort_date: sort_date, content: content, audio: audio, guid: guid}
update_data = {title: title, link: link, audio: audio, content: content}
if matching_entry.nil?
feed.feed_entries.create(record_data)
feed.feed_entries.create!(record_data)
else
matching_entry.update(update_data)
end
Expand All @@ -114,7 +110,7 @@ def update_feed(feed, earliest_time, latest_time)
end

def fetch_feed_title(feed_url)
cleanurl, auth_opts = parse_auth(feed_url)
cleanurl, auth_opts = HavenFeedEntry.parse_auth(feed_url)
URI.open(cleanurl, auth_opts) do |rss|
feed = RSS::Parser.parse(rss, validate: false)
if (feed.feed_type == "rss")
Expand All @@ -130,70 +126,4 @@ def fetch_feed_title(feed_url)
return "Invalid Feed"
end

# Downloads the feed at feed_url and converts every item into a Hash keyed by
# the FEED_TITLE / ENTRY_* constants.
#
# feed_url may embed basic-auth credentials; parse_auth splits them out and
# supplies the options handed to URI.open.
# Returns an Array of entry hashes (empty when the feed is neither "rss" nor
# "atom"). Errors raised by URI.open or RSS::Parser.parse are not rescued here.
def fetch_feed_content(feed_url)
  collected = []
  cleanurl, auth_opts = parse_auth(feed_url)
  URI.open(cleanurl, auth_opts) do |rss|
    feed = RSS::Parser.parse(rss, validate: false)
    case feed.feed_type
    when "rss"
      feed.items.each do |item|
        # content:encoded, when present, wins over the plain description
        entry = {
          FEED_TITLE => feed.channel.title,
          ENTRY_TITLE => item.title,
          ENTRY_LINK => item.link,
          ENTRY_DATE => item.date,
          ENTRY_CONTENT => (item.content_encoded || item.description),
          ENTRY_GUID => item.guid.content
        }
        enclosure = item.enclosure
        if !enclosure
          entry[ENTRY_AUDIO] = nil
        elsif enclosure.type == "audio/mpeg"
          entry[ENTRY_AUDIO] = enclosure.url
        elsif enclosure.type.start_with?("image/") && !entry[ENTRY_CONTENT].include?("<img ")
          # image enclosure and no image in the content: prepend the enclosure image
          entry[ENTRY_CONTENT] = "<img src=\"#{enclosure.url}\" /><br/>" + entry[ENTRY_CONTENT]
        end
        collected << entry
      end
    when "atom"
      feed.entries.each do |item|
        # fall back to <updated> / <summary> when <published> / <content> are absent
        timestamp = item.published.nil? ? item.updated : item.published
        markup = item.content.nil? ? item.summary : item.content
        collected << {
          FEED_TITLE => feed.title.content,
          ENTRY_TITLE => item.title.content,
          ENTRY_LINK => item.link.href,
          ENTRY_DATE => timestamp.content,
          ENTRY_CONTENT => CGI.unescapeHTML(markup.to_s),
          ENTRY_GUID => item.id.to_s,
          ENTRY_AUDIO => nil # TODO podcast support for Atom feeds
        }
      end
    end
  end
  collected
end

# Separates inline basic-auth credentials from a feed URL.
#
# full_url - String such as "scheme://user:pass@host/path" or a plain URL.
# Returns a two-element Array [url, opts]: opts always carries the "haven"
# User-Agent header, and additionally :http_basic_authentication => [user, pass]
# when the part after "://" contains both ":" and "@" (in which case url has
# the credentials stripped).
def parse_auth(full_url)
  scheme, remainder = full_url.split("://", 2)
  open_opts = { "User-Agent" => "haven" }
  has_credentials = remainder.include?(":") && remainder.include?("@")
  return [full_url, open_opts] unless has_credentials

  # scheme://user:pass@url...
  user, after_user = remainder.split(":", 2)
  pass, location = after_user.split("@", 2)
  open_opts[:http_basic_authentication] = [user, pass]
  ["#{scheme}://#{location}", open_opts]
end
end
105 changes: 105 additions & 0 deletions lib/haven_feed_entry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
require 'cgi'

# One entry of an RSS or Atom feed, normalized to a common set of readers,
# plus class-level helpers that download and parse a whole feed.
class HavenFeedEntry
  attr_reader :feed_title, :title, :link, :date, :content, :guid, :audio

  # feed - the object returned by RSS::Parser.parse()
  # item - an element of feed.items (rss) or feed.entries (atom)
  #
  # For any other feed_type every attribute is left nil.
  def initialize(feed, item)
    case feed.feed_type
    when "rss" then load_rss(feed, item)
    when "atom" then load_atom(feed, item)
    end
  end

  # Downloads and parses the feed at feed_url.
  #
  # feed_url - URL of a feed, eg: "https://example.com/rss.xml"; may embed
  #            basic-auth credentials (see parse_auth).
  # Returns an Array of HavenFeedEntry objects. Errors from URI.open or the
  # parser are not rescued here.
  def self.fetch_feed_content(feed_url)
    cleanurl, auth_opts = parse_auth(feed_url)
    # URI.open returns the value of its block
    URI.open(cleanurl, auth_opts) { |rss| parse_feed_content(rss) }
  end

  # Parses raw feed data into entries.
  #
  # feed_raw - a StringIO from URI.open (or a File for testing)
  # Returns an Array of HavenFeedEntry objects; empty when the feed type is
  # neither "rss" nor "atom".
  def self.parse_feed_content(feed_raw)
    feed = RSS::Parser.parse(feed_raw, validate: false)
    items =
      case feed.feed_type
      when "rss" then feed.items
      when "atom" then feed.entries
      else []
      end
    items.map { |item| HavenFeedEntry.new(feed, item) }
  end

  # Separates inline basic-auth credentials from a feed URL.
  #
  # Returns [url, opts]. opts always carries the "haven" User-Agent header and,
  # when the part after "://" contains both ":" and "@", also
  # :http_basic_authentication => [user, pass]; url then has the credentials
  # stripped.
  def self.parse_auth(full_url)
    scheme, rest = full_url.split("://", 2)
    opts = { "User-Agent" => "haven" }
    return [full_url, opts] unless rest.include?(":") && rest.include?("@")

    # scheme://user:pass@url...
    user, rest = rest.split(":", 2)
    pass, rest = rest.split("@", 2)
    opts[:http_basic_authentication] = [user, pass]
    ["#{scheme}://#{rest}", opts]
  end

  private

  # Populates the attributes from an RSS channel item.
  def load_rss(feed, item)
    @feed_title = feed.channel.title
    @title = item.title
    @link = item.link
    @date = parse_time(item.date)
    # content:encoded, when present, wins over the plain description
    @content = item.content_encoded || item.description
    @guid = item.guid.content
    @audio = nil

    enclosure = item.enclosure
    return unless enclosure

    mime = enclosure.type.to_s
    if mime == "audio/mpeg"
      @audio = enclosure.url
    elsif mime.start_with?("image/") && !@content.to_s.include?("<img ")
      # image enclosure and no image in the content: prepend the enclosure image
      @content = "<img src=\"#{enclosure.url}\" /><br/>#{@content}"
    end
  end

  # Populates the attributes from an Atom entry.
  def load_atom(feed, item)
    @feed_title = feed.title.content
    @title = item.title.content
    @link = item.link.href
    # fall back to <updated> / <summary> when <published> / <content> are absent
    stamp = item.published.nil? ? item.updated : item.published
    @date = parse_time(stamp.content)
    body = item.content.nil? ? item.summary : item.content
    @content = CGI.unescapeHTML(body.to_s)
    @guid = item.id.to_s
    @audio = nil # TODO podcast support for Atom feeds?
  end

  # Different time formats were causing problems, this method standardizes them
  # by round-tripping through the HTTP-date representation.
  #
  # time - a Time, a String parseable by Time.parse, or nil.
  # Returns a Time, or nil when time is nil (an entry without a date).
  # Raises RuntimeError for any other argument type.
  def parse_time(time)
    case time
    when nil then nil
    when Time then Time.parse(time.httpdate)
    when String then Time.parse(Time.parse(time).httpdate)
    else raise "Argument Error, #{time} is not a valid time"
    end
  end
end

Loading

0 comments on commit 70e8dca

Please sign in to comment.