Skip to content
Browse files

Add extraction.techniques.HeadTags which extracts metadata from META,…

… LINK and TITLE tags.
  • Loading branch information...
1 parent 802c00d commit 566229fd42dfd247166951dabe69338fdc535467 @lethain committed Nov 23, 2012
Showing with 132 additions and 5 deletions.
  1. +21 −0 README.rst
  2. +17 −3 extraction/__init__.py
  3. +55 −0 extraction/techniques.py
  4. +38 −1 extraction/tests/tests.py
  5. +1 −1 setup.py
View
21 README.rst
@@ -70,6 +70,26 @@ To rerank the techniques, remove techniques or add new techniques
of your own, look at the `Using Extraction` section below.
+
+extraction.techniques.HeadTags
+------------------------------
+
+Every webpage's head tag contains a title tag, and many also
+include additional data like descriptions, RSS feeds and such.
+This technique parses data that looks like:
+
+ <head>
+ <meta name="description" content="Will Larson&#39;s blog about programming and other things." />
+ <link rel="alternate" type="application/rss+xml" title="Page Feed" href="/feeds/" />
+ <link rel="canonical" href="http://lethain.com/digg-v4-architecture-process/">
+ <title>Digg v4&#39;s Architecture and Development Processes - Irrational Exuberance</title>
+ </head>
+
+While the head tag is the authoritative source of canonical URLs and RSS,
+it's often very hit or miss for titles, descriptions and such.
+At worst, it's better than nothing.
+
+
extraction.techniques.FacebookOpengraphTags
-------------------------------------------
@@ -134,6 +154,7 @@ The default ordering of techniques is within the extraction.Extractor's
`techniques` class variable, and is::
extraction.techniques.FacebookOpengraphTags
+ extraction.techniques.HeadTags
You can modify the order and inclusion of techniques in three ways.
First, you can modify it by passing in a list of techniques to the
View
20 extraction/__init__.py
@@ -14,7 +14,7 @@
class Extracted(object):
"Contains data extracted from a page."
- def __init__(self, titles=None, descriptions=None, images=None, urls=None, **kwargs):
+ def __init__(self, titles=None, descriptions=None, images=None, urls=None, feeds=None, **kwargs):
"""
Initialize Extracted instance.
@@ -55,16 +55,20 @@ def __init__(self, titles=None, descriptions=None, images=None, urls=None, **kwa
images = []
if urls is None:
urls = []
+ if feeds is None:
+ feeds = []
assert type(titles) in (list, tuple), "titles must be a list or tuple"
assert type(descriptions) in (list, tuple), "descriptions must be a list or tuple"
assert type(images) in (list, tuple), "images must be a list or tuple"
assert type(urls) in (list, tuple), "urls must be a list or tuple"
+ assert type(feeds) in (list, tuple), "urls must be a list or tuple"
self.titles = titles
self.descriptions = descriptions
self.images = images
self.urls = urls
+ self.feeds = feeds
# stores unexpected and uncaptured values to avoid crashing if
# a technique returns additional types of data
@@ -102,15 +106,25 @@ def url(self):
else:
return None
+ @property
+ def feed(self):
+ "Return the best feed, if any."
+ if self.feeds:
+ return self.feeds[0]
+ else:
+ return None
+
class Extractor(object):
"Extracts title, summary and image(s) from an HTML document."
- techniques = ["extraction.techniques.FacebookOpengraphTags"]
+ techniques = ["extraction.techniques.FacebookOpengraphTags",
+ "extraction.techniques.HeadTags",
+ ]
extracted_class = Extracted
# for determining which cleanup mechanisms to apply
text_types = ["titles", "descriptions"]
- url_types = ["images", "urls"]
+ url_types = ["images", "urls", "feeds"]
def __init__(self, techniques=None, extracted_class=None, *args, **kwargs):
"Extractor."
View
55 extraction/techniques.py
@@ -19,6 +19,61 @@ def extract(self, html):
'urls': [],
}
class HeadTags(Technique):
    """
    Extract info from standard HTML metatags like title, for example:

        <head>
            <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
            <meta name="author" content="Will Larson" />
            <meta name="description" content="Will Larson&#39;s blog about programming and other things." />
            <meta name="keywords" content="Blog Will Larson Programming Life" />
            <link rel="alternate" type="application/rss+xml" title="Page Feed" href="/feeds/" />
            <link rel="canonical" href="http://lethain.com/digg-v4-architecture-process/">
            <title>Digg v4&#39;s Architecture and Development Processes - Irrational Exuberance</title>
        </head>

    This is usually a last-resort, low quality, but reliable parsing mechanism.
    """
    # maps a <meta name="..."> value to the Extracted field it populates
    meta_name_map = {
        "description": "descriptions",
        "author": "authors",
    }

    def extract(self, html):
        """
        Extract data from meta, link and title tags within the head tag.

        Returns a dict mapping extraction categories ('titles',
        'descriptions', 'feeds', 'urls', ...) to lists of found strings.
        Categories with no matches are simply absent from the dict.
        """
        extracted = {}
        soup = BeautifulSoup(html)

        # <title> -> titles. An empty or nested title tag has
        # .string == None; don't emit that as a bogus title.
        title_tag = soup.find('title')
        if title_tag and title_tag.string:
            extracted['titles'] = [title_tag.string]

        # <meta name="..." content="..."> -> category per meta_name_map
        for meta_tag in soup.find_all('meta'):
            if 'name' in meta_tag.attrs and 'content' in meta_tag.attrs:
                name_dest = self.meta_name_map.get(meta_tag['name'])
                if name_dest is not None:
                    extracted.setdefault(name_dest, []).append(meta_tag.attrs['content'])

        # <link rel="..."> -> feeds (RSS alternates) and urls (canonical).
        # BeautifulSoup returns rel as a list of tokens on recent versions
        # but as a plain string on older ones; normalize to a token list so
        # we match whole rel values rather than substrings.
        for link_tag in soup.find_all('link'):
            if 'rel' not in link_tag.attrs or 'href' not in link_tag.attrs:
                continue
            rel = link_tag['rel']
            rel_values = rel if isinstance(rel, list) else rel.split()
            if 'alternate' in rel_values and link_tag.attrs.get('type') == "application/rss+xml":
                extracted.setdefault('feeds', []).append(link_tag['href'])
            elif 'canonical' in rel_values:
                extracted.setdefault('urls', []).append(link_tag['href'])

        return extracted
+
+
class FacebookOpengraphTags(Technique):
"""
View
39 extraction/tests/tests.py
@@ -22,7 +22,33 @@ def test_rewriting_relative_urls(self):
# rewrites ../digg_v4/initial_org.png
self.assertEqual(extracted.images[1], "http://lethain.com/digg_v4/initial_org.png")
- def test_parse_facebook(self):
+ def test_default_techniques(self):
+ """
+ Test running the default techniques list with a simple page.
+
+ This is a bit of a high-level test to ensure that the default
+ techniques aren't completely broken.
+ """
+ extracted = self.extractor.extract(LETHAIN_COM_HTML, source_url="http://lethain.com/digg-v4-architecture-process/")
+ self.assertTrue(extracted.titles)
+ self.assertTrue(extracted.urls)
+ self.assertTrue(extracted.descriptions)
+ self.assertTrue(extracted.feeds)
+
+ def test_default_techniques_on_empty_page(self):
+ """
+ Test running the default techniques list against an empty HTML document.
+
+ This is useful for ensuring the defaut techniques fail sanely when they
+ encounter blank/empty documents.
+ """
+ extracted = self.extractor.extract("")
+ self.assertFalse(extracted.titles)
+ self.assertFalse(extracted.urls)
+ self.assertFalse(extracted.descriptions)
+ self.assertFalse(extracted.feeds)
+
+ def test_technique_facebook_meta_tags(self):
# make sure the shuffled sequence does not lose any elements
self.extractor.techniques = ["extraction.techniques.FacebookOpengraphTags"]
extracted = self.extractor.extract(FACEBOOK_HTML)
@@ -34,6 +60,17 @@ def test_parse_facebook(self):
self.assertTrue(extracted.description, "A group of U.S. Marines, under command of a renegade general, take over Alcatraz and threaten San Francisco Bay with biological weapons.")
self.assertEqual(len(extracted.descriptions), 1)
+ def test_technique_head_tags(self):
+ "Test extracting page information from HTML head tags (meta, title, ...)."
+ self.extractor.techniques = ["extraction.techniques.HeadTags"]
+ extracted = self.extractor.extract(LETHAIN_COM_HTML, source_url="http://lethain.com/digg-v4-architecture-process/")
+ self.assertEqual(extracted.title, "Digg v4's Architecture and Development Processes - Irrational Exuberance")
+ self.assertEqual(extracted.url, "http://lethain.com/digg-v4-architecture-process/")
+ self.assertEqual(extracted.image, None)
+ self.assertEquals(extracted.description, "Will Larson's blog about programming and other things.")
+ self.assertEqual(extracted.feed, "http://lethain.com/feeds/")
+ self.assertEqual(extracted._unexpected_values['authors'], ["Will Larson"])
+
def test_example_lethain_com_technique(self):
"Test extracting data from lethain.com with a custom technique in extraction.examples."
self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]
View
2 setup.py
@@ -10,5 +10,5 @@
url='http://pypi.python.org/pypi/extraction/',
license='LICENSE.txt',
description='Extract basic info from HTML webpages.',
- long_description=open('README.txt').read(),
+ long_description=open('README.rst').read(),
)

0 comments on commit 566229f

Please sign in to comment.
Something went wrong with that request. Please try again.