Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
  • 2 commits
  • 5 files changed
  • 0 commit comments
  • 1 contributor
View
21 README.rst
@@ -70,6 +70,26 @@ To rerank the techniques, remove techniques or add new techniques
of your own, look at the `Using Extraction` section below.
+
+extraction.techniques.HeadTags
+------------------------------
+
+Every webpage's head tag contains a title tag, and many also
+include additional data like descriptions, RSS feeds and such.
+This technique parses data that looks like:
+
+ <head>
+ <meta name="description" content="Will Larson&#39;s blog about programming and other things." />
+ <link rel="alternate" type="application/rss+xml" title="Page Feed" href="/feeds/" />
+ <link rel="canonical" href="http://lethain.com/digg-v4-architecture-process/">
+ <title>Digg v4&#39;s Architecture and Development Processes - Irrational Exuberance</title>
+ </head>
+
+While the head tag is an authoritative source of canonical URLs and RSS,
+it's often very hit or miss for titles, descriptions and such.
+At worst, it's better than nothing.
+
+
extraction.techniques.FacebookOpengraphTags
-------------------------------------------
@@ -134,6 +154,7 @@ The default ordering of techniques is within the extraction.Extractor's
`techniques` class variable, and is::
extraction.techniques.FacebookOpengraphTags
+ extraction.techniques.HeadTags
You can modify the order and inclusion of techniques in three ways.
First, you can modify it by passing in a list of techniques to the
View
42 extraction/__init__.py
@@ -14,7 +14,7 @@
class Extracted(object):
"Contains data extracted from a page."
- def __init__(self, titles=None, descriptions=None, images=None, urls=None, **kwargs):
+ def __init__(self, titles=None, descriptions=None, images=None, urls=None, feeds=None, **kwargs):
"""
Initialize Extracted instance.
@@ -55,21 +55,47 @@ def __init__(self, titles=None, descriptions=None, images=None, urls=None, **kwa
images = []
if urls is None:
urls = []
+ if feeds is None:
+ feeds = []
assert type(titles) in (list, tuple), "titles must be a list or tuple"
assert type(descriptions) in (list, tuple), "descriptions must be a list or tuple"
assert type(images) in (list, tuple), "images must be a list or tuple"
assert type(urls) in (list, tuple), "urls must be a list or tuple"
+ assert type(feeds) in (list, tuple), "feeds must be a list or tuple"
self.titles = titles
self.descriptions = descriptions
self.images = images
self.urls = urls
+ self.feeds = feeds
# stores unexpected and uncaptured values to avoid crashing if
# a technique returns additional types of data
self._unexpected_values = kwargs
def __repr__(self):
    """Return a compact string representation of the extracted results.

    For each populated data category, shows the first value (truncated
    to 50 characters) plus a count of any additional values, e.g.:

        <Extracted: (title: 'Some Title'), (url: 'http://...', 1 more)>
    """
    categories = (
        ("title", self.titles),
        ("url", self.urls),
        ("image", self.images),
        ("feed", self.feeds),
        ("description", self.descriptions),
    )

    max_shown = 50  # truncate long values so the repr stays readable
    parts = []
    for name, values in categories:
        if not values:
            continue
        first = values[0][:max_shown]
        extra = len(values) - 1
        if extra > 0:
            parts.append("(%s: '%s', %s more)" % (name, first, extra))
        else:
            parts.append("(%s: '%s')" % (name, first))

    return "<%s: %s>" % (self.__class__.__name__, ", ".join(parts))
+
@property
def title(self):
"Return the best title, if any."
@@ -102,15 +128,25 @@ def url(self):
else:
return None
@property
def feed(self):
    "Return the best feed, if any."
    # The first stored feed is treated as the best candidate.
    return self.feeds[0] if self.feeds else None
+
class Extractor(object):
"Extracts title, summary and image(s) from an HTML document."
- techniques = ["extraction.techniques.FacebookOpengraphTags"]
+ techniques = ["extraction.techniques.FacebookOpengraphTags",
+ "extraction.techniques.HeadTags",
+ ]
extracted_class = Extracted
# for determining which cleanup mechanisms to apply
text_types = ["titles", "descriptions"]
- url_types = ["images", "urls"]
+ url_types = ["images", "urls", "feeds"]
def __init__(self, techniques=None, extracted_class=None, *args, **kwargs):
"Extractor."
View
55 extraction/techniques.py
@@ -19,6 +19,61 @@ def extract(self, html):
'urls': [],
}
class HeadTags(Technique):
    """
    Extract info from standard HTML metatags like title, for example:

        <head>
            <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
            <meta name="author" content="Will Larson" />
            <meta name="description" content="Will Larson&#39;s blog about programming and other things." />
            <meta name="keywords" content="Blog Will Larson Programming Life" />
            <link rel="alternate" type="application/rss+xml" title="Page Feed" href="/feeds/" />
            <link rel="canonical" href="http://lethain.com/digg-v4-architecture-process/">
            <title>Digg v4&#39;s Architecture and Development Processes - Irrational Exuberance</title>
        </head>

    This is usually a last-resort, low quality, but reliable parsing mechanism.
    """
    # Maps <meta name="..."> values onto the plural keys used in the
    # extracted-data dict (e.g. "description" -> "descriptions").
    meta_name_map = {
        "description": "descriptions",
        "author": "authors",
    }

    def extract(self, html):
        """Extract data from meta, link and title tags within the head tag.

        Returns a dict mapping data types (e.g. "titles", "feeds", "urls")
        to lists of extracted string values; categories with no matches
        are omitted entirely.
        """
        extracted = {}
        soup = BeautifulSoup(html)

        # <title> provides the page title. Guard against an empty tag,
        # whose .string is None -- previously that produced [None].
        title_tag = soup.find('title')
        if title_tag is not None and title_tag.string is not None:
            extracted['titles'] = [title_tag.string]

        # Collect <meta name="..." content="..."> tags we know how to map.
        for meta_tag in soup.find_all('meta'):
            if 'name' in meta_tag.attrs and 'content' in meta_tag.attrs:
                name_dest = self.meta_name_map.get(meta_tag['name'])
                if name_dest is not None:
                    extracted.setdefault(name_dest, []).append(meta_tag['content'])

        # <link rel="..."> tags: RSS feeds and the canonical URL.
        # NOTE: `rel` may be a list (BeautifulSoup 4 multi-valued attr) or
        # a plain string; `in` covers both (membership vs. substring), so
        # the old `'x' in rel or rel == 'x'` double-check was redundant.
        for link_tag in soup.find_all('link'):
            rel = link_tag.attrs.get('rel')
            href = link_tag.attrs.get('href')
            if rel is None or href is None:
                continue
            if 'alternate' in rel and link_tag.attrs.get('type') == "application/rss+xml":
                extracted.setdefault('feeds', []).append(href)
            elif 'canonical' in rel:
                extracted.setdefault('urls', []).append(href)

        return extracted
+
+
class FacebookOpengraphTags(Technique):
"""
View
39 extraction/tests/tests.py
@@ -22,7 +22,33 @@ def test_rewriting_relative_urls(self):
# rewrites ../digg_v4/initial_org.png
self.assertEqual(extracted.images[1], "http://lethain.com/digg_v4/initial_org.png")
- def test_parse_facebook(self):
def test_default_techniques(self):
    """
    Test running the default techniques list with a simple page.

    This is a bit of a high-level test to ensure that the default
    techniques aren't completely broken.
    """
    page = self.extractor.extract(LETHAIN_COM_HTML, source_url="http://lethain.com/digg-v4-architecture-process/")
    # Every major data category should have found at least one value.
    self.assertTrue(page.titles)
    self.assertTrue(page.urls)
    self.assertTrue(page.descriptions)
    self.assertTrue(page.feeds)
def test_default_techniques_on_empty_page(self):
    """
    Test running the default techniques list against an empty HTML document.

    This is useful for ensuring the default techniques fail sanely when
    they encounter blank/empty documents.
    """
    page = self.extractor.extract("")
    # Nothing should be extracted from an empty document, but nothing
    # should blow up either.
    self.assertFalse(page.titles)
    self.assertFalse(page.urls)
    self.assertFalse(page.descriptions)
    self.assertFalse(page.feeds)
+
+ def test_technique_facebook_meta_tags(self):
# make sure the shuffled sequence does not lose any elements
self.extractor.techniques = ["extraction.techniques.FacebookOpengraphTags"]
extracted = self.extractor.extract(FACEBOOK_HTML)
@@ -34,6 +60,17 @@ def test_parse_facebook(self):
self.assertTrue(extracted.description, "A group of U.S. Marines, under command of a renegade general, take over Alcatraz and threaten San Francisco Bay with biological weapons.")
self.assertEqual(len(extracted.descriptions), 1)
def test_technique_head_tags(self):
    "Test extracting page information from HTML head tags (meta, title, ...)."
    self.extractor.techniques = ["extraction.techniques.HeadTags"]
    extracted = self.extractor.extract(LETHAIN_COM_HTML, source_url="http://lethain.com/digg-v4-architecture-process/")
    self.assertEqual(extracted.title, "Digg v4's Architecture and Development Processes - Irrational Exuberance")
    self.assertEqual(extracted.url, "http://lethain.com/digg-v4-architecture-process/")
    self.assertEqual(extracted.image, None)
    # assertEquals is a deprecated alias of assertEqual; use the
    # canonical name consistently with the surrounding assertions.
    self.assertEqual(extracted.description, "Will Larson's blog about programming and other things.")
    self.assertEqual(extracted.feed, "http://lethain.com/feeds/")
    # "authors" has no dedicated Extracted field, so it lands in the
    # catch-all _unexpected_values dict.
    self.assertEqual(extracted._unexpected_values['authors'], ["Will Larson"])
+
def test_example_lethain_com_technique(self):
"Test extracting data from lethain.com with a custom technique in extraction.examples."
self.extractor.techniques = ["extraction.examples.custom_technique.LethainComTechnique"]
View
2  setup.py
@@ -10,5 +10,5 @@
url='http://pypi.python.org/pypi/extraction/',
license='LICENSE.txt',
description='Extract basic info from HTML webpages.',
- long_description=open('README.txt').read(),
+ long_description=open('README.rst').read(),
)

No commit comments for this range

Something went wrong with that request. Please try again.