Skip to content

Commit

Permalink
Add HTML5SemanticTags technique based on article and video tags.
Browse files Browse the repository at this point in the history
  • Loading branch information
lethain committed Nov 24, 2012
1 parent b90e061 commit bd18b43
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 2 deletions.
33 changes: 31 additions & 2 deletions README.rst
Expand Up @@ -108,12 +108,41 @@ This technique uses Opengraph tags, which look like this::
as their source of data.


extraction.techniques.HTML5SemanticTags
---------------------------------------

The HTML5 `article` and `video` tags give us some useful
hints for extracting page information from the sites which happen to
utilize these tags.

This technique will extract information from pages formed like::

<html>
<body>
<h1>This is not a title to HTML5SemanticTags</h1>
<article>
<h1>This is a title</h1>
<p>This is a description.</p>
<p>This is not a description.</p>
</article>
<video>
<source src="this_is_a_video.mp4">
</video>
</body>
</html>

Note that `HTML5SemanticTags` is intentionally much more conservative than
`SemanticTags`: it provides high quality information in the small number
of cases where it hits, and otherwise expects `SemanticTags` to sweep up
behind it with the lower quality, more abundant hits it discovers.


extraction.techniques.SemanticTags
----------------------------------

This technique relies on the basic tags themselves--for example,
all IMG tags include images, most H1 and H2 tags include titles,
and P tags often include text usable as descriptions::
all `img` tags include images, most `h1` and `h2` tags include titles,
and `p` tags often include text usable as descriptions::

<html>
<body>
Expand Down
1 change: 1 addition & 0 deletions extraction/__init__.py
Expand Up @@ -140,6 +140,7 @@ def feed(self):
class Extractor(object):
"Extracts title, summary and image(s) from an HTML document."
techniques = ["extraction.techniques.FacebookOpengraphTags",
"extraction.techniques.HTML5SemanticTags",
"extraction.techniques.HeadTags",
"extraction.techniques.SemanticTags",
]
Expand Down
49 changes: 49 additions & 0 deletions extraction/techniques.py
Expand Up @@ -123,6 +123,55 @@ def extract(self, html):
return extracted


class HTML5SemanticTags(Technique):
    """
    Extract titles, descriptions and videos from HTML5 semantic tags.

    The HTML5 `article` and `video` tags give us some useful hints for
    extracting page information from the sites which happen to utilize
    these tags.

    This technique will extract information from pages formed like::

        <html>
          <body>
            <h1>This is not a title to HTML5SemanticTags</h1>
            <article>
              <h1>This is a title</h1>
              <p>This is a description.</p>
              <p>This is not a description.</p>
            </article>
            <video>
              <source src="this_is_a_video.mp4">
            </video>
          </body>
        </html>

    Note that `HTML5SemanticTags` is intentionally much more conservative than
    `SemanticTags`: it provides high quality information in the small number
    of cases where it hits, and otherwise expects `SemanticTags` to sweep up
    behind it with the lower quality, more abundant hits it discovers.
    """
    def extract(self, html):
        """
        Extract data from HTML5 `article` and `video` tags.

        For every `article` element, the text of its first `h1` is collected
        as a title and the text of its first `p` as a description. For every
        `video` element, the `src` of the element itself (if present) and of
        each nested `source` element is collected as a video.

        Returns a dict with the keys 'titles', 'descriptions' and 'videos',
        each mapping to a (possibly empty) list of strings.
        """
        titles = []
        descriptions = []
        videos = []
        soup = BeautifulSoup(html)

        # find_all always returns a (possibly empty) list, so it can be
        # iterated directly without an `or []` fallback.
        for article in soup.find_all('article'):
            # Only the first h1/p inside each article is trusted; anything
            # further down is left to the less conservative techniques.
            title = article.find('h1')
            if title is not None:
                titles.append(" ".join(title.strings))
            desc = article.find('p')
            if desc is not None:
                descriptions.append(" ".join(desc.strings))

        for video in soup.find_all('video'):
            # HTML5 allows the media URL directly on the video element
            # (<video src="...">) as well as on nested <source> children.
            if 'src' in video.attrs:
                videos.append(video['src'])
            for source in video.find_all('source'):
                if 'src' in source.attrs:
                    videos.append(source['src'])

        return {'titles': titles, 'descriptions': descriptions, 'videos': videos}


class SemanticTags(Technique):
"""
This technique relies on the basic tags themselves--for example,
Expand Down
16 changes: 16 additions & 0 deletions extraction/tests/data.py
Expand Up @@ -130,3 +130,19 @@
<p>This is awesome.</p>
</body>
</html>"""

# Minimal HTML5 page used to exercise HTML5SemanticTags: an h1 outside the
# article, an article holding an h1 and two paragraphs, and a video element
# with a single source child.
HTML5_HTML = """
<html>
<body>
<h1>This is not a title to HTML5SemanticTags</h1>
<article>
<h1>This is a title</h1>
<p>This is a description.</p>
<p>This is not a description.</p>
</article>
<video>
<source src="this_is_a_video.mp4">
</video>
</body>
</html>
"""
8 changes: 8 additions & 0 deletions extraction/tests/tests.py
Expand Up @@ -88,6 +88,14 @@ def test_technique_semantic_tags(self):
self.assertEqual(len(extracted.images), 2)
self.assertEquals(extracted.description.split(), "A month ago history reset with the second launch of Digg v1 , and memories are starting to fade since much of the Digg team joined SocialCode four months ago, so it seemed like a good time to describe the system and team architecture which ran and developed Digg.com from May 2010 until May 2012.".split())

def test_technique_html_semantic_tags(self):
    "Check title, description and videos extracted from an HTML5 page by HTML5SemanticTags alone."
    # Restrict the extractor to the single technique under test.
    self.extractor.techniques = ["extraction.techniques.HTML5SemanticTags"]
    result = self.extractor.extract(HTML5_HTML)

    expected_title = 'This is a title'
    self.assertEqual(result.title, expected_title)

    expected_description = 'This is a description.'
    self.assertEqual(result.description, expected_description)

    # This test reads the videos from the `_unexpected_values` mapping.
    expected_videos = ["this_is_a_video.mp4"]
    self.assertEqual(result._unexpected_values['videos'], expected_videos)

def test_example_lethain_com_technique(self):
"Test extracting data from lethain.com with a custom technique in extraction.examples."
self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]
Expand Down

0 comments on commit bd18b43

Please sign in to comment.