Add HTML5SemanticTags technique based on article and video tags.

lethain · Nov 24, 2012 · bd18b43 · bd18b43
1 parent b90e061
commit bd18b43
Show file tree

Hide file tree

Showing 5 changed files with 105 additions and 2 deletions.
diff --git a/README.rst b/README.rst
@@ -108,12 +108,41 @@ This technique uses Opengraph tags, which look like this::
 as their source of data.
 
 
+extraction.techniques.HTML5SemanticTags
+---------------------------------------
+
+The HTML5 `article` tag, and also the `video` tag, give us some useful
+hints for extracting page information from the sites which happen to
+utilize these tags.
+
+This technique will extract information from pages formed like::
+
+    <html>
+      <body>
+        <h1>This is not a title to HTML5SemanticTags</h1>
+        <article>
+          <h1>This is a title</h1>
+          <p>This is a description.</p>
+          <p>This is not a description.</p>
+        </article>
+        <video>
+          <source src="this_is_a_video.mp4">
+        </video>
+      </body>
+    </html>
+
+Note that `HTML5SemanticTags` is intentionally much more conservative than
+`SemanticTags`: it provides high-quality information in the small number
+of cases where it hits, and otherwise relies on `SemanticTags` to sweep up
+behind it with the lower-quality, more abundant hits that technique discovers.
+
+
 extraction.techniques.SemanticTags
 ----------------------------------
 
 This technique relies on the basic tags themselves--for example,
-all IMG tags include images, most H1 and H2 tags include titles,
+all `img` tags include images, most `h1` and `h2` tags include titles,
-and P tags often include text usable as descriptions::
+and `p` tags often include text usable as descriptions::
 
     <html>
       <body>

diff --git a/extraction/__init__.py b/extraction/__init__.py
@@ -140,6 +140,7 @@ def feed(self):
 class Extractor(object):
     "Extracts title, summary and image(s) from an HTML document."
     techniques = ["extraction.techniques.FacebookOpengraphTags",
+                  "extraction.techniques.HTML5SemanticTags",
                   "extraction.techniques.HeadTags",
                   "extraction.techniques.SemanticTags",
                   ]

diff --git a/extraction/techniques.py b/extraction/techniques.py
@@ -123,6 +123,55 @@ def extract(self, html):
         return extracted
 
 
+class HTML5SemanticTags(Technique):
+    """
+    The HTML5 `article` tag, and also the `video` tag, give us some useful
+    hints for extracting page information from the sites which happen to
+    utilize these tags.
+
+    This technique will extract information from pages formed like::
+
+        <html>
+          <body>
+            <h1>This is not a title to HTML5SemanticTags</h1>
+            <article>
+              <h1>This is a title</h1>
+              <p>This is a description.</p>
+              <p>This is not a description.</p>
+            </article>
+            <video>
+              <source src="this_is_a_video.mp4">
+            </video>
+          </body>
+        </html>
+
+    Note that `HTML5SemanticTags` is intentionally much more conservative than
+    `SemanticTags`: it provides high-quality information in the small number
+    of cases where it hits, and otherwise relies on `SemanticTags` to sweep up
+    behind it with the lower-quality, more abundant hits that technique discovers.
+    """
+    def extract(self, html):
+        "Extract titles, descriptions and videos from HTML5 `article` and `video` tags."
+        titles = []
+        descriptions = []
+        videos = []
+        soup = BeautifulSoup(html)
+        for article in soup.find_all('article') or []:
+            title = article.find('h1')  # first h1 inside the article; h1 tags outside any article are never consulted
+            if title:
+                titles.append(" ".join(title.strings))
+            desc = article.find('p')  # first paragraph only; later paragraphs are ignored
+            if desc:
+                descriptions.append(" ".join(desc.strings))
+
+        for video in soup.find_all('video') or []:
+            for source in video.find_all('source') or []:
+                if 'src' in source.attrs:  # skip <source> tags that carry no src attribute
+                    videos.append(source['src'])
+
+        return {'titles':titles, 'descriptions':descriptions, 'videos':videos}
+
+
 class SemanticTags(Technique):
     """
     This technique relies on the basic tags themselves--for example,

diff --git a/extraction/tests/data.py b/extraction/tests/data.py
@@ -130,3 +130,19 @@
     <p>This is awesome.</p>
   </body>
 </html>"""
+
+HTML5_HTML = """
+        <html>
+          <body>
+            <h1>This is not a title to HTML5SemanticTags</h1>
+            <article>
+              <h1>This is a title</h1>
+              <p>This is a description.</p>
+              <p>This is not a description.</p>
+            </article>
+            <video>
+              <source src="this_is_a_video.mp4">
+            </video>
+          </body>
+        </html>
+"""  # same markup as the example in the HTML5SemanticTags docstring
diff --git a/extraction/tests/tests.py b/extraction/tests/tests.py
@@ -88,6 +88,14 @@ def test_technique_semantic_tags(self):
         self.assertEqual(len(extracted.images), 2)
         self.assertEquals(extracted.description.split(), "A month ago history reset with the second launch of Digg v1 , and memories are starting to fade since much of the Digg team joined SocialCode four months ago, so it seemed like a good time to describe the system and team architecture which ran and developed Digg.com from May 2010 until May 2012.".split())
 
+    def test_technique_html_semantic_tags(self):
+        "Test extracting title, description and videos with only HTML5SemanticTags enabled."
+        self.extractor.techniques = ["extraction.techniques.HTML5SemanticTags"]
+        extracted = self.extractor.extract(HTML5_HTML)
+        self.assertEqual(extracted.title, 'This is a title')
+        self.assertEqual(extracted.description, 'This is a description.')
+        self.assertEqual(extracted._unexpected_values['videos'], ["this_is_a_video.mp4"])  # NOTE(review): videos are read via _unexpected_values -- presumably no dedicated attribute exists; confirm
+
+
     def test_example_lethain_com_technique(self):
         "Test extracting data from lethain.com with a custom technique in extraction.examples."
         self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]