Skip to content

Commit

Permalink
alter speedparser to strip html tags from title and subtitle elements…
Browse files Browse the repository at this point in the history
… in entries and in the feeds by default, using the supplied cleaner. This theoretically slows it down, but these elements are generally so small that they add negligible overhead; version bump to 0.1.6
  • Loading branch information
jmoiron committed Jan 11, 2012
1 parent ee6e8eb commit 981426b
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@
*.sw[op]
feeds/
build/
tests/data*
dist/*
*.egg-info
tags
2 changes: 1 addition & 1 deletion speedparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from speedparser import parse
# Package version; bump in step with release tags (0.1.6 adds html
# cleaning of title/subtitle elements).
VERSION = (0, 1, 6)
__all__ = ['parse', 'VERSION']
44 changes: 34 additions & 10 deletions speedparser/speedparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ def first_text(xpath_result, default='', encoding='utf-8'):
return unicoder(xpath_result[0].text, encoding) or default
return default

# Matches a string wholly wrapped in one bare <p> or <div> tag; the
# backreference guarantees the opening and closing tag names agree.
_outer_tag_re = re.compile(r'^<(p|div)>(.*)</\1>$', re.DOTALL)

def strip_outer_tag(text):
    """Strip a single matched outer ``<p>`` or ``<div>`` tag from *text*.

    Not entity aware; designed to quickly strip outer tags from lxml
    cleaner output.  Only bare (attribute-free) ``<p>``/``<div>`` tags are
    recognized, the opening and closing tags must match, and sequences of
    sibling tags (``<p>a</p><p>b</p>``) are left untouched so the markup
    is never broken mid-document.  Returns *text* unchanged when no outer
    tag is stripped.
    """
    stripped = text.strip()
    match = _outer_tag_re.match(stripped)
    if match is None:
        return text
    tag, inner = match.group(1), match.group(2)
    # A close of the same tag inside the body means the leading tag does
    # not actually wrap the whole string (e.g. "<p>a</p><p>b</p>");
    # stripping would corrupt the markup, so leave it alone.
    if '</%s>' % tag in inner:
        return text
    return inner

nsre = re.compile(r'xmlns=[\'"](.+?)[\'"]')
def strip_namespace(document):
if document[:1000].count('xmlns') > 5:
Expand Down Expand Up @@ -201,6 +211,11 @@ def __init__(self, root, namespaces={}, version='rss20', encoding='utf-8', feed=
if d: entries.append(d)
self.entries = entries

def clean(self, text):
    """Return *text* run through the configured html cleaner.

    Empty values and non-string objects pass through untouched.
    """
    if not text or not isinstance(text, basestring):
        return text
    return self.cleaner.clean_html(text)

def parse_entry(self, entry):
"""An attempt to parse pieces of an entry out w/o xpath, by looping
over the entry root's children and slotting them into the right places.
Expand Down Expand Up @@ -255,8 +270,11 @@ def parse_date(self, node, entry, ns=''):
entry['updated_parsed'] = date

def parse_title(self, node, entry, ns=''):
    """Set the entry title, html-cleaned and stripped of an outer tag.

    A namespaced title (e.g. media:title) never overrides a title that
    has already been parsed from the main namespace.
    """
    if ns in ('media',) and 'title' in entry:
        return
    title = strip_outer_tag(self.clean(unicoder(node.text) or ''))
    entry['title'] = title or ''

def parse_author(self, node, entry, ns=''):
if ns and ns in ('itunes', 'dm') and 'author' in entry:
Expand Down Expand Up @@ -301,15 +319,14 @@ def parse_links(self, node, entry, ns=''):

def parse_comments(self, node, entry, ns=''):
    """Store the entry's comments value, html-cleaned and unwrapped.

    A namespaced comments element never overrides one already set.
    """
    if 'comments' in entry and ns:
        return
    # Single assignment: clean once, strip any outer <p>/<div> wrapper.
    entry['comments'] = strip_outer_tag(self.clean(unicoder(node.text)))

def parse_content(self, node, entry, ns=''):
    """Append this node's inner text as a cleaned content block.

    Namespaced *content elements are skipped (except itunes:content);
    media:content is processed as media_content below.
    """
    if ns and node.tag.endswith('content') and ns not in ('itunes',):
        return
    # self.clean handles empty/None input, so clean exactly once.
    content = self.clean(unicoder(innertext(node)))
    entry.setdefault('content', []).append({'value': content or ''})

def parse_summary(self, node, entry, ns=''):
Expand All @@ -320,8 +337,7 @@ def parse_summary(self, node, entry, ns=''):
entry['summary'] = entry['content'][0]['value']
return
summary = unicoder(innertext(node))
if summary:
summary = self.cleaner.clean_html(summary).strip()
summary = self.clean(summary)
entry['summary'] = summary or ''

def parse_media_content(self, node, entry, ns='media'):
Expand Down Expand Up @@ -418,14 +434,22 @@ def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20', cleaner=
if 'id' in feed and 'link' not in feed:
feed['link'] = feed['id']


self.feed = feed

def clean(self, text, outer_tag=True):
    """Return *text* run through the configured html cleaner.

    Empty values and non-string objects pass through untouched.  The
    ``outer_tag`` flag is retained for interface compatibility but does
    not change the result; callers strip outer tags themselves via
    strip_outer_tag().
    """
    # NOTE: a leftover `import ipdb; ipdb.set_trace()` debugger
    # breakpoint (and the dead outer_tag branch feeding it) was removed.
    if text and isinstance(text, basestring):
        return self.cleaner.clean_html(text)
    return text

def parse_title(self, node, feed, ns=''):
    """Set the feed title, html-cleaned and stripped of an outer tag."""
    feed['title'] = strip_outer_tag(self.clean(unicoder(node.text))) or ''

def parse_subtitle(self, node, feed, ns=''):
    """Set the feed subtitle, html-cleaned and stripped of an outer tag."""
    feed['subtitle'] = strip_outer_tag(self.clean(unicoder(node.text))) or ''

def parse_links(self, node, feed, ns=''):
if node.text:
Expand Down
14 changes: 14 additions & 0 deletions tests/regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@ def test_unix_timestamp_failure(self):
result = parse(feed, unix_timestamp=True)
self.assertTrue('bozo_exception' not in result, str(result))

class NonCleanedTitle(TestCase):
    def test_non_cleaned_title(self):
        """Regression: entry titles must be run through a supplied html
        cleaner — script payloads removed, no outer <p> wrapper left."""
        from lxml.html.clean import Cleaner
        feed = '''<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom"><title>scribble.yuyat.jp</title><link href="http://scribble.yuyat.jp/"/><link type="application/atom+xml" rel="self" href="http://scribble.yuyat.jp/atom.xml"/><updated>2012-01-08T18:34:39-08:00</updated><id>http://scribble.yuyat.jp/</id><author><name>Yuya Takeyama</name></author><entry><id>http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble</id><link type="text/html" rel="alternate" href="http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble.html"/><title>scribble 始めます &lt;script&gt;alert(1)&lt;/script&gt;</title><updated>2012-01-07T00:00:00-08:00</updated><author><name>Yuya Takeyama</name></author><content type="html">&lt;p&gt;今まで書いて来た &lt;a href='http://blog.yuyat.jp/'&gt;Born Too Late&lt;/a&gt; の住み分けとしては, あっちがいろいろ調べてからまとめる用, こっちはもっと殴り書いていく感じにしたい.&lt;/p&gt;&lt;div class='highlight'&gt;&lt;pre&gt;&lt;code class='ruby'&gt;&lt;span class='lineno'&gt;1&lt;/span&gt; &lt;span class='k'&gt;class&lt;/span&gt; &lt;span class='nc'&gt;Foo&lt;/span&gt;&lt;span class='lineno'&gt;2&lt;/span&gt; &lt;span class='k'&gt;def&lt;/span&gt; &lt;span class='nf'&gt;bar&lt;/span&gt;&lt;span class='lineno'&gt;3&lt;/span&gt; &lt;span class='ss'&gt;:baz&lt;/span&gt;&lt;span class='lineno'&gt;4&lt;/span&gt; &lt;span class='k'&gt;end&lt;/span&gt;&lt;span class='lineno'&gt;5&lt;/span&gt; &lt;span class='k'&gt;end&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;</content></entry></feed>'''
        cleaner = Cleaner(comments=True, javascript=True, scripts=True,
            safe_attrs_only=True, page_structure=True, style=True,
            embedded=False, remove_tags=['body'])
        result = parse(feed, unix_timestamp=True, clean_html=cleaner)
        self.assertTrue('bozo_exception' not in result, str(result))
        for entry in result.entries:
            title = entry.title
            self.assertTrue('alert(1)' not in title, title)
            self.assertFalse(title.startswith('<p>'), title)

0 comments on commit 981426b

Please sign in to comment.