Navigation Menu

Skip to content

Commit

Permalink
Add HTML5SemanticTags technique based on article and video tags.
Browse files Browse the repository at this point in the history
  • Loading branch information
lethain committed Nov 24, 2012
1 parent b90e061 commit bd18b43
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 2 deletions.
33 changes: 31 additions & 2 deletions README.rst
Expand Up @@ -108,12 +108,41 @@ This technique uses Opengraph tags, which look like this::
as their source of data. as their source of data.




extraction.techniques.HTML5SemanticTags
---------------------------------------

The HTML5 `article` tag, and also the `video` tag, give us some useful
hints for extracting page information from the sites which happen to
utilize these tags.

This technique will extract information from pages formed like::

<html>
<body>
<h1>This is not a title to HTML5SemanticTags</h1>
<article>
<h1>This is a title</h1>
<p>This is a description.</p>
<p>This is not a description.</p>
</article>
<video>
<source src="this_is_a_video.mp4">
</video>
</body>
</html>

Note that `HTML5SemanticTags` is intentionally much more conservative than
`SemanticTags`: it provides high-quality information in the small number
of cases where it hits, and otherwise expects `SemanticTags` to sweep up
behind it with the lower-quality, more abundant hits it discovers.


extraction.techniques.SemanticTags extraction.techniques.SemanticTags
---------------------------------- ----------------------------------


This technique relies on the basic tags themselves--for example, This technique relies on the basic tags themselves--for example,
all IMG tags include images, most H1 and H2 tags include titles, all `img` tags include images, most `h1` and `h2` tags include titles,
and P tags often include text usable as descriptions:: and `p` tags often include text usable as descriptions::


<html> <html>
<body> <body>
Expand Down
1 change: 1 addition & 0 deletions extraction/__init__.py
Expand Up @@ -140,6 +140,7 @@ def feed(self):
class Extractor(object): class Extractor(object):
"Extracts title, summary and image(s) from an HTML document." "Extracts title, summary and image(s) from an HTML document."
techniques = ["extraction.techniques.FacebookOpengraphTags", techniques = ["extraction.techniques.FacebookOpengraphTags",
"extraction.techniques.HTML5SemanticTags",
"extraction.techniques.HeadTags", "extraction.techniques.HeadTags",
"extraction.techniques.SemanticTags", "extraction.techniques.SemanticTags",
] ]
Expand Down
49 changes: 49 additions & 0 deletions extraction/techniques.py
Expand Up @@ -123,6 +123,55 @@ def extract(self, html):
return extracted return extracted




class HTML5SemanticTags(Technique):
    """
    Extract titles, descriptions and videos from HTML5 semantic tags.

    The HTML5 `article` tag, and also the `video` tag, give us some useful
    hints for extracting page information from the sites which happen to
    utilize these tags.

    This technique will extract information from pages formed like::

        <html>
          <body>
            <h1>This is not a title to HTML5SemanticTags</h1>
            <article>
              <h1>This is a title</h1>
              <p>This is a description.</p>
              <p>This is not a description.</p>
            </article>
            <video>
              <source src="this_is_a_video.mp4">
            </video>
          </body>
        </html>

    Note that `HTML5SemanticTags` is intentionally much more conservative than
    `SemanticTags`: it provides high-quality information in the small number
    of cases where it hits, and otherwise expects `SemanticTags` to sweep up
    behind it with the lower-quality, more abundant hits it discovers.
    """
    def extract(self, html):
        """
        Extract data from HTML5 `article` and `video` tags.

        For every `article` element, the text of its first `h1` is collected
        as a title and the text of its first `p` as a description. For every
        `video` element, the `src` attribute of each nested `source` element
        is collected as a video.

        Returns a dict with the keys 'titles', 'descriptions' and 'videos',
        each mapping to a (possibly empty) list.
        """
        titles = []
        descriptions = []
        videos = []
        soup = BeautifulSoup(html)

        for article in soup.find_all('article'):
            # Only the first heading and first paragraph inside an article
            # are used; later ones are deliberately ignored.
            title = article.find('h1')
            if title is not None:
                titles.append(" ".join(title.strings))
            desc = article.find('p')
            if desc is not None:
                descriptions.append(" ".join(desc.strings))

        for video in soup.find_all('video'):
            for source in video.find_all('source'):
                # Skip <source> elements that carry no src attribute.
                if 'src' in source.attrs:
                    videos.append(source['src'])

        return {'titles': titles, 'descriptions': descriptions, 'videos': videos}


class SemanticTags(Technique): class SemanticTags(Technique):
""" """
This technique relies on the basic tags themselves--for example, This technique relies on the basic tags themselves--for example,
Expand Down
16 changes: 16 additions & 0 deletions extraction/tests/data.py
Expand Up @@ -130,3 +130,19 @@
<p>This is awesome.</p> <p>This is awesome.</p>
</body> </body>
</html>""" </html>"""

# Fixture page for the HTML5SemanticTags tests: one <h1> outside any
# <article>, an <article> holding an <h1> and two <p> elements, and a
# <video> with a single <source src=...>.
HTML5_HTML = """
<html>
<body>
<h1>This is not a title to HTML5SemanticTags</h1>
<article>
<h1>This is a title</h1>
<p>This is a description.</p>
<p>This is not a description.</p>
</article>
<video>
<source src="this_is_a_video.mp4">
</video>
</body>
</html>
"""
8 changes: 8 additions & 0 deletions extraction/tests/tests.py
Expand Up @@ -88,6 +88,14 @@ def test_technique_semantic_tags(self):
self.assertEqual(len(extracted.images), 2) self.assertEqual(len(extracted.images), 2)
self.assertEquals(extracted.description.split(), "A month ago history reset with the second launch of Digg v1 , and memories are starting to fade since much of the Digg team joined SocialCode four months ago, so it seemed like a good time to describe the system and team architecture which ran and developed Digg.com from May 2010 until May 2012.".split()) self.assertEquals(extracted.description.split(), "A month ago history reset with the second launch of Digg v1 , and memories are starting to fade since much of the Digg team joined SocialCode four months ago, so it seemed like a good time to describe the system and team architecture which ran and developed Digg.com from May 2010 until May 2012.".split())


def test_technique_html_semantic_tags(self):
    "Check that HTML5SemanticTags alone pulls title, description and videos from HTML5_HTML."
    # Restrict the extractor to the single technique under test.
    only_html5 = ["extraction.techniques.HTML5SemanticTags"]
    self.extractor.techniques = only_html5

    result = self.extractor.extract(HTML5_HTML)

    self.assertEqual(result.title, 'This is a title')
    self.assertEqual(result.description, 'This is a description.')
    found_videos = result._unexpected_values['videos']
    self.assertEqual(found_videos, ["this_is_a_video.mp4"])

def test_example_lethain_com_technique(self): def test_example_lethain_com_technique(self):
"Test extracting data from lethain.com with a custom technique in extraction.examples." "Test extracting data from lethain.com with a custom technique in extraction.examples."
self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"] self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]
Expand Down

0 comments on commit bd18b43

Please sign in to comment.