Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Add HTML5SemanticTags technique based on article and video tags.

  • Loading branch information...
commit bd18b433c6a8df46ca9c5eef559bf12232e90912 1 parent b90e061
@lethain authored
View
33 README.rst
@@ -108,12 +108,41 @@ This technique uses Opengraph tags, which look like this::
as their source of data.
+extraction.techniques.HTML5SemanticTags
+---------------------------------------
+
+The HTML5 `article` and `video` tags give us some useful hints for
+extracting page information from the sites which happen to utilize
+these tags.
+
+This technique will extract information from pages formed like::
+
+ <html>
+ <body>
+ <h1>This is not a title to HTML5SemanticTags</h1>
+ <article>
+ <h1>This is a title</h1>
+ <p>This is a description.</p>
+ <p>This is not a description.</p>
+ </article>
+ <video>
+ <source src="this_is_a_video.mp4">
+ </video>
+ </body>
+ </html>
+
+Note that `HTML5SemanticTags` is intentionally much more conservative than
+`SemanticTags`: it provides high quality information in the small number
+of cases where it hits, and otherwise expects `SemanticTags` to sweep up
+behind it with the lower quality, more abundant hits it discovers.
+
+
extraction.techniques.SemanticTags
----------------------------------
This technique relies on the basic tags themselves--for example,
-all IMG tags include images, most H1 and H2 tags include titles,
-and P tags often include text usable as descriptions::
+all `img` tags include images, most `h1` and `h2` tags include titles,
+and `p` tags often include text usable as descriptions::
<html>
<body>
View
1  extraction/__init__.py
@@ -140,6 +140,7 @@ def feed(self):
class Extractor(object):
"Extracts title, summary and image(s) from an HTML document."
techniques = ["extraction.techniques.FacebookOpengraphTags",
+ "extraction.techniques.HTML5SemanticTags",
"extraction.techniques.HeadTags",
"extraction.techniques.SemanticTags",
]
View
49 extraction/techniques.py
@@ -123,6 +123,55 @@ def extract(self, html):
return extracted
+class HTML5SemanticTags(Technique):
+    """
+    Extract titles, descriptions and videos from HTML5 semantic tags.
+
+    The HTML5 `article` and `video` tags give us some useful hints for
+    extracting page information from the sites which happen to utilize
+    these tags.
+
+    This technique will extract information from pages formed like::
+
+        <html>
+          <body>
+            <h1>This is not a title to HTML5SemanticTags</h1>
+            <article>
+              <h1>This is a title</h1>
+              <p>This is a description.</p>
+              <p>This is not a description.</p>
+            </article>
+            <video>
+              <source src="this_is_a_video.mp4">
+            </video>
+          </body>
+        </html>
+
+    Note that `HTML5SemanticTags` is intentionally much more conservative than
+    `SemanticTags`: it provides high quality information in the small number
+    of cases where it hits, and otherwise expects `SemanticTags` to sweep up
+    behind it with the lower quality, more abundant hits it discovers.
+    """
+    def extract(self, html):
+        """
+        Extract data from HTML5 `article` and `video` tags.
+
+        For every `article` tag, the first `h1` found inside it contributes
+        a title and the first `p` found inside it contributes a description.
+        For every `source` tag inside a `video` tag, its `src` attribute
+        (when present) contributes a video.
+
+        Returns a dict with the keys 'titles', 'descriptions' and 'videos',
+        each mapping to a (possibly empty) list of strings.
+        """
+        titles = []
+        descriptions = []
+        videos = []
+        soup = BeautifulSoup(html)
+
+        # find_all returns an empty result when nothing matches, so no
+        # fallback to an empty list is needed before iterating.
+        for article in soup.find_all('article'):
+            # Only the first heading/paragraph of each article is used;
+            # later ones are deliberately ignored (see class docstring).
+            title = article.find('h1')
+            if title is not None:
+                titles.append(" ".join(title.strings))
+            desc = article.find('p')
+            if desc is not None:
+                descriptions.append(" ".join(desc.strings))
+
+        for video in soup.find_all('video'):
+            for source in video.find_all('source'):
+                # A source tag without a src attribute carries no video.
+                if 'src' in source.attrs:
+                    videos.append(source['src'])
+
+        return {'titles': titles, 'descriptions': descriptions, 'videos': videos}
+
+
class SemanticTags(Technique):
"""
This technique relies on the basic tags themselves--for example,
View
16 extraction/tests/data.py
@@ -130,3 +130,19 @@
<p>This is awesome.</p>
</body>
</html>"""
+
+# Fixture for the HTML5SemanticTags test: the `h1` outside of `article` and
+# the second `p` inside it are decoys that should not be extracted.
+HTML5_HTML = """
+ <html>
+ <body>
+ <h1>This is not a title to HTML5SemanticTags</h1>
+ <article>
+ <h1>This is a title</h1>
+ <p>This is a description.</p>
+ <p>This is not a description.</p>
+ </article>
+ <video>
+ <source src="this_is_a_video.mp4">
+ </video>
+ </body>
+ </html>
+"""
View
8 extraction/tests/tests.py
@@ -88,6 +88,14 @@ def test_technique_semantic_tags(self):
self.assertEqual(len(extracted.images), 2)
self.assertEquals(extracted.description.split(), "A month ago history reset with the second launch of Digg v1 , and memories are starting to fade since much of the Digg team joined SocialCode four months ago, so it seemed like a good time to describe the system and team architecture which ran and developed Digg.com from May 2010 until May 2012.".split())
+    def test_technique_html_semantic_tags(self):
+        "HTML5SemanticTags alone yields the article title, description and video."
+        # Restrict the extractor to the single technique under test.
+        self.extractor.techniques = ["extraction.techniques.HTML5SemanticTags"]
+        result = self.extractor.extract(HTML5_HTML)
+        self.assertEqual(result.title, 'This is a title')
+        self.assertEqual(result.description, 'This is a description.')
+        # Videos are read back from the extracted object's _unexpected_values.
+        found_videos = result._unexpected_values['videos']
+        self.assertEqual(found_videos, ["this_is_a_video.mp4"])
+
def test_example_lethain_com_technique(self):
"Test extracting data from lethain.com with a custom technique in extraction.examples."
self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]
Please sign in to comment.
Something went wrong with that request. Please try again.