Permalink
Browse files

coercedates plugin

new plugin that coerces the 'updated' and 'published' dates on any entry already in the cache to match the 'updated' date already in the cache

if an entry is _not_ already in the cache, coerces the date values to be the older (smaller) of the two values

commit also 'fixes' spider.py to work with filters that modify the 'updated' date

see: rubys#15
  • Loading branch information...
1 parent bfacbc2 commit 649e369f8d7b53c6e958149b3709fb8d621efb8d @hossman committed Jan 8, 2012
View
@@ -0,0 +1,116 @@
+# If you don't want items to "move up" on your planet if the source feed
+# updates them (and changes the update date to something newer than was
+# originally used) you may be tempted to use the "ignore_in_feed: updated"
+# option, but there are three important things to realize about doing this:
+#
+# * When you ignore the "updated" date, it will default to the
+# "published" date -- but if there is no "published" date (very common
+# in many RSS feeds) it will default to the current date+time.
+#
+# * If you purge the entire cache (perhaps because you added a filter)
+# all of the "updated" dates for those items w/o a "published" date will
+# be re-set to the current date+time
+#
+# * The "updated" date is what Venus uses to sort the list
+#
+# This may all seem obvious, but it can be highly annoying when you deal
+# with some feeds that have no "published" date and have to occasionally
+# purge your cache.
+#
+# One solution would be to only use "ignore_in_feed: updated" on the feeds
+# where you know the feed contains a "published" date for each item, and
+# don't use it for feeds that only contain an "updated" date for each item
+# -- but that can be tedious.
+#
+# So use this plugin instead
+#
+# This plugin will replace the "updated" and "published" dates of every item
+# with whichever of the two values is the lowest, unless the item is already
+# in the cache, in which case it will use the "updated" date from the item in
+# the cache -- making it a safe alternative to "ignore_in_feed: updated" for
+# all feeds regardless of whether the items have a "published" date or not,
+# and regardless of whether the ones that do have a "published" date try to
+# modify it or not.
+#
+###########################################################################
+
+import sys, time, os
+from xml.dom import minidom
+from planet import reconstitute
+from planet import config
+from planet.reconstitute import date
+from planet.spider import filename
+
+log = planet.logger
+
+# finds the first descendent element that matches the specified
+# namespace and tag name, parses it (in canonical date format),
+# returns the parsed value, and removes (all of the) element(s)
+def parseAndPurgeDateElement(element, ns, tagName):
+ result = None
+ # see if we have any date(s?)
+ kids = element.getElementsByTagNameNS(ns, tagName)
+ if kids:
+ # record the first one
+ result = time.strptime(kids[0].childNodes[0].nodeValue,
+ '%Y-%m-%dT%H:%M:%SZ')
+ # get rid of all of them
+ for trash in kids:
+ trash.parentNode.removeChild(trash)
+ return result
+
+
+# given an entry element, looks up the cache file for the entry's id
+# and fetches the mtime of that file (which should match the updated
+# date if venus has done its job right)
+#
+# returns None if the entry is not in the cache
+def getDateFromCache(entry):
+ if entry is None:
+ log.error("Attempted to lookup the date of 'None'")
+ return None
+
+ id = entry.getElementsByTagNameNS(atomNS, 'id')[0].childNodes[0].nodeValue
+ if id is None:
+ log.error("Unable to find id in entry")
+ return None
+
+ cache = os.path.join(config.cache_directory())
+ file = filename(cache, id)
+ if os.path.exists(file):
+ return time.gmtime(os.stat(file).st_mtime)
+ return None
+
+
+atomNS = 'http://www.w3.org/2005/Atom'
+planetNS = 'http://planet.intertwingly.net/'
+
+# parse input stream
+dom = minidom.parse(sys.stdin)
+
+entries = dom.getElementsByTagNameNS(atomNS, 'entry')
+for e in entries:
+
+ # get & remove our dates from the entry
+ updatedDate = parseAndPurgeDateElement(e, atomNS, 'updated')
+ pubDate = parseAndPurgeDateElement(e, atomNS, 'published')
+
+ cacheDate = getDateFromCache(e)
+
+ if cacheDate is not None:
+ mainDate = cacheDate
+ elif not updatedDate:
+ mainDate = pubDate
+ elif not pubDate:
+ mainDate = updatedDate
+ elif pubDate < updatedDate:
+ mainDate = pubDate
+ else:
+ mainDate = updatedDate
+
+ # add back to the entry
+ reconstitute.date(e, 'published', mainDate)
+ reconstitute.date(e, 'updated', mainDate)
+
+# output the dom
+print dom.toxml('utf-8')
View
@@ -235,6 +235,15 @@ def writeCache(feed_uri, feed_info, data):
if os.path.exists(cache_file): os.remove(cache_file)
continue
+ # re-set mtime in case filters have modified it
+ try:
+ edoc = feedparser.parse(output)
+ mtime = calendar.timegm(edoc.entries[0].updated_parsed)
+ except:
+ log.warning("Unable to re-set mtime on %s after running filters: ",
+ entry.id,
+ sys.exc_info()[0])
+
# write out and timestamp the results
write(output, cache_file, mtime)
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<channel>
+<title>Fake RSS Blog</title>
+<link>http://fake.url.example.com</link>
+<description>Fake RSS Feed For testing</description>
+<image>
+<url>http://fake.url.example.com/feedlogo.gif</url>
+<title>Test RSS Feed</title>
+<link>http://fake.url.example.com</link>
+</image>
+<language>en-us</language>
+<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
+<generator>Blogsmith http://www.blogsmith.com/</generator>
+
+<item>
+ <title>Fake Title: RSS Has No Date</title>
+ <link>http://fake.url.example.com/rss-no-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
+ <comments>http://fake.url.example.com/rss-no-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+</item>
+
+
+<item>
+ <title>Fake Title: RSS Has Changing Date</title>
+ <link>http://fake.url.example.com/rss-changing-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
+ <comments>http://fake.url.example.com/rss-changing-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+ <dc:date>2011-12-01T11:00:00+00:00</dc:date>
+</item>
+
+</channel></rss>
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<channel>
+<title>Fake RSS Blog</title>
+<link>http://fake.url.example.com</link>
+<description>Fake RSS Feed For testing</description>
+<image>
+<url>http://fake.url.example.com/feedlogo.gif</url>
+<title>Test RSS Feed</title>
+<link>http://fake.url.example.com</link>
+</image>
+<language>en-us</language>
+<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
+<generator>Blogsmith http://www.blogsmith.com/</generator>
+
+<item>
+ <title>Fake Title: RSS Has No Date</title>
+ <link>http://fake.url.example.com/rss-no-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
+ <comments>http://fake.url.example.com/rss-no-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+</item>
+
+
+<item>
+ <title>Fake Title: RSS Has Changing Date</title>
+ <link>http://fake.url.example.com/rss-changing-date</link>
+ <guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
+ <comments>http://fake.url.example.com/rss-changing-date#comments</comments>
+ <description>
+ <![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
+ </description>
+ <imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
+ <dc:creator>Fake Person</dc:creator>
+ <dc:date>2011-12-07T11:07:07+00:00</dc:date>
+</item>
+
+</channel></rss>
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<feed
+ xmlns="http://www.w3.org/2005/Atom"
+ xmlns:thr="http://purl.org/syndication/thread/1.0"
+ xml:lang="en"
+ xml:base="http://fake.url.example.com/wp-atom.php"
+ >
+ <title type="text">Fake Atom Feed</title>
+ <subtitle type="text">Fake Atom feed for testing stuff</subtitle>
+
+ <updated>2011-12-08T00:00:28Z</updated>
+
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com" />
+ <id>http://fake.url.example.com/feed/atom/</id>
+ <link rel="self" type="application/atom+xml" href="http://fake.url.example.com/feed/atom/" />
+
+ <generator uri="http://wordpress.org/" version="3.2.1">WordPress</generator>
+
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Updated Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-updated" />
+ <id>http://fake.url.example.com/atom-changing-updated</id>
+ <updated>2011-12-05T10:06:38Z</updated>
+ <published>2011-11-09T00:00:28Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-updated"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-updated#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-updated/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Changing Published Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-published" />
+ <id>http://fake.url.example.com/atom-changing-published</id>
+ <published>2011-12-08T02:02:28Z</published>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-changing-published"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-published#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-published/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom No Date]]></title>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-no-date" />
+ <id>http://fake.url.example.com/atom-no-date</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-no-date"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-no-date#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-no-date/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+ <entry>
+ <author>
+ <name>Fake Person</name>
+ <uri>http://fake.url.example.com</uri>
+ </author>
+ <title type="html"><![CDATA[Atom Update Before Published]]></title>
+ <updated>2011-11-11T11:11:11Z</updated>
+ <published>2011-12-12T12:12:12Z</published>
+ <link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-update-before-pub" />
+ <id>http://fake.url.example.com/atom-update-before-pub</id>
+ <summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
+ <content type="html" xml:base="http://fake.url.example.com/atom-update-before-pub"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
+ <link rel="replies" type="text/html" href="http://fake.url.example.com/atom-update-before-pub#comments" thr:count="0"/>
+ <link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-update-before-pub/feed/atom/" thr:count="0"/>
+ <thr:total>0</thr:total>
+ </entry>
+
+
+
+</feed>
Oops, something went wrong.

0 comments on commit 649e369

Please sign in to comment.