Skip to content

Commit

Permalink
alter speedparser to strip html tags from title and subtitle elements…
Browse files Browse the repository at this point in the history
… in entries and in the feeds by default, using the supplied cleaner. This theoretically slows it down, but these elements are generally so small that they add negligible overhead; version bump to 0.1.6
  • Loading branch information
jmoiron committed Jan 11, 2012
1 parent ee6e8eb commit 981426b
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@
*.sw[op]
feeds/
build/
tests/data*
dist/*
*.egg-info
tags
2 changes: 1 addition & 1 deletion speedparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from speedparser import parse
# Package version; bump in step with release tags (0.1.6 adds html
# cleaning of title/subtitle elements).
VERSION = (0, 1, 6)
__all__ = ['parse', 'VERSION']
44 changes: 34 additions & 10 deletions speedparser/speedparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ def first_text(xpath_result, default='', encoding='utf-8'):
return unicoder(xpath_result[0].text, encoding) or default
return default

# Matches a string wholly wrapped in one bare <p> or <div> tag; the
# backreference guarantees the opening and closing tag names agree.
_outer_tag_re = re.compile(r'^<(p|div)>(.*)</\1>$', re.DOTALL)

def strip_outer_tag(text):
    """Strip a single matched outer ``<p>`` or ``<div>`` tag from *text*.

    Not entity aware; designed to quickly strip outer tags from lxml
    cleaner output.  Only bare (attribute-free) ``<p>``/``<div>`` tags are
    recognized, the opening and closing tags must match, and sequences of
    sibling tags (``<p>a</p><p>b</p>``) are left untouched so the markup
    is never broken mid-document.  Returns *text* unchanged when no outer
    tag is stripped.
    """
    stripped = text.strip()
    match = _outer_tag_re.match(stripped)
    if match is None:
        return text
    tag, inner = match.group(1), match.group(2)
    # A close of the same tag inside the body means the leading tag does
    # not actually wrap the whole string (e.g. "<p>a</p><p>b</p>");
    # stripping would corrupt the markup, so leave it alone.
    if '</%s>' % tag in inner:
        return text
    return inner

nsre = re.compile(r'xmlns=[\'"](.+?)[\'"]')
def strip_namespace(document):
if document[:1000].count('xmlns') > 5:
Expand Down Expand Up @@ -201,6 +211,11 @@ def __init__(self, root, namespaces={}, version='rss20', encoding='utf-8', feed=
if d: entries.append(d)
self.entries = entries

def clean(self, text):
    """Return *text* run through the configured html cleaner.

    Empty values and non-string objects pass through untouched.
    """
    if not text or not isinstance(text, basestring):
        return text
    return self.cleaner.clean_html(text)

def parse_entry(self, entry):
"""An attempt to parse pieces of an entry out w/o xpath, by looping
over the entry root's children and slotting them into the right places.
Expand Down Expand Up @@ -255,8 +270,11 @@ def parse_date(self, node, entry, ns=''):
entry['updated_parsed'] = date

def parse_title(self, node, entry, ns=''):
    """Set the entry title, html-cleaned and stripped of an outer tag.

    A namespaced title (e.g. media:title) never overrides a title that
    has already been parsed from the main namespace.
    """
    if ns in ('media',) and 'title' in entry:
        return
    title = strip_outer_tag(self.clean(unicoder(node.text) or ''))
    entry['title'] = title or ''

def parse_author(self, node, entry, ns=''):
if ns and ns in ('itunes', 'dm') and 'author' in entry:
Expand Down Expand Up @@ -301,15 +319,14 @@ def parse_links(self, node, entry, ns=''):

def parse_comments(self, node, entry, ns=''):
    """Store the entry's comments value, html-cleaned and unwrapped.

    A namespaced comments element never overrides one already set.
    """
    if 'comments' in entry and ns:
        return
    # Single assignment: clean once, strip any outer <p>/<div> wrapper.
    entry['comments'] = strip_outer_tag(self.clean(unicoder(node.text)))

def parse_content(self, node, entry, ns=''):
    """Append this node's inner text as a cleaned content block.

    Namespaced *content elements are skipped (except itunes:content);
    media:content is processed as media_content below.
    """
    if ns and node.tag.endswith('content') and ns not in ('itunes',):
        return
    # self.clean handles empty/None input, so clean exactly once.
    content = self.clean(unicoder(innertext(node)))
    entry.setdefault('content', []).append({'value': content or ''})

def parse_summary(self, node, entry, ns=''):
Expand All @@ -320,8 +337,7 @@ def parse_summary(self, node, entry, ns=''):
entry['summary'] = entry['content'][0]['value']
return
summary = unicoder(innertext(node))
if summary:
summary = self.cleaner.clean_html(summary).strip()
summary = self.clean(summary)
entry['summary'] = summary or ''

def parse_media_content(self, node, entry, ns='media'):
Expand Down Expand Up @@ -418,14 +434,22 @@ def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20', cleaner=
if 'id' in feed and 'link' not in feed:
feed['link'] = feed['id']


self.feed = feed

def clean(self, text, outer_tag=True):
    """Return *text* run through the configured html cleaner.

    Empty values and non-string objects pass through untouched.  The
    ``outer_tag`` flag is retained for interface compatibility but does
    not change the result; callers strip outer tags themselves via
    strip_outer_tag().
    """
    # NOTE: a leftover `import ipdb; ipdb.set_trace()` debugger
    # breakpoint (and the dead outer_tag branch feeding it) was removed.
    if text and isinstance(text, basestring):
        return self.cleaner.clean_html(text)
    return text

def parse_title(self, node, feed, ns=''):
    """Set the feed title, html-cleaned and stripped of an outer tag."""
    feed['title'] = strip_outer_tag(self.clean(unicoder(node.text))) or ''

def parse_subtitle(self, node, feed, ns=''):
    """Set the feed subtitle, html-cleaned and stripped of an outer tag."""
    feed['subtitle'] = strip_outer_tag(self.clean(unicoder(node.text))) or ''

def parse_links(self, node, feed, ns=''):
if node.text:
Expand Down
14 changes: 14 additions & 0 deletions tests/regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@ def test_unix_timestamp_failure(self):
result = parse(feed, unix_timestamp=True)
self.assertTrue('bozo_exception' not in result, str(result))

class NonCleanedTitle(TestCase):
    def test_non_cleaned_title(self):
        """Regression: entry titles must be run through a supplied html
        cleaner — script payloads removed, no outer <p> wrapper left."""
        from lxml.html.clean import Cleaner
        feed = '''<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom"><title>scribble.yuyat.jp</title><link href="http://scribble.yuyat.jp/"/><link type="application/atom+xml" rel="self" href="http://scribble.yuyat.jp/atom.xml"/><updated>2012-01-08T18:34:39-08:00</updated><id>http://scribble.yuyat.jp/</id><author><name>Yuya Takeyama</name></author><entry><id>http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble</id><link type="text/html" rel="alternate" href="http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble.html"/><title>scribble 始めます &lt;script&gt;alert(1)&lt;/script&gt;</title><updated>2012-01-07T00:00:00-08:00</updated><author><name>Yuya Takeyama</name></author><content type="html">&lt;p&gt;今まで書いて来た &lt;a href='http://blog.yuyat.jp/'&gt;Born Too Late&lt;/a&gt; の住み分けとしては, あっちがいろいろ調べてからまとめる用, こっちはもっと殴り書いていく感じにしたい.&lt;/p&gt;&lt;div class='highlight'&gt;&lt;pre&gt;&lt;code class='ruby'&gt;&lt;span class='lineno'&gt;1&lt;/span&gt; &lt;span class='k'&gt;class&lt;/span&gt; &lt;span class='nc'&gt;Foo&lt;/span&gt;&lt;span class='lineno'&gt;2&lt;/span&gt; &lt;span class='k'&gt;def&lt;/span&gt; &lt;span class='nf'&gt;bar&lt;/span&gt;&lt;span class='lineno'&gt;3&lt;/span&gt; &lt;span class='ss'&gt;:baz&lt;/span&gt;&lt;span class='lineno'&gt;4&lt;/span&gt; &lt;span class='k'&gt;end&lt;/span&gt;&lt;span class='lineno'&gt;5&lt;/span&gt; &lt;span class='k'&gt;end&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;</content></entry></feed>'''
        cleaner = Cleaner(comments=True, javascript=True, scripts=True,
            safe_attrs_only=True, page_structure=True, style=True,
            embedded=False, remove_tags=['body'])
        result = parse(feed, unix_timestamp=True, clean_html=cleaner)
        self.assertTrue('bozo_exception' not in result, str(result))
        for entry in result.entries:
            title = entry.title
            self.assertTrue('alert(1)' not in title, title)
            self.assertFalse(title.startswith('<p>'), title)

0 comments on commit 981426b

Please sign in to comment.