Skip to content


Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP


Don't throw exception when HTML title is empty #1

merged 1 commit into from

2 participants


When extracting content from an HTML file with an empty title, an exception occurs (see the traceback below). To avoid this, I added a call to filter to remove possible None values from data_values in the cleanup function. I also added a test.

Traceback (most recent call last):
  File "tests/", line 123, in test_empty_title
    extracted = self.extractor.extract(EMPTY_TITLE_HTML)
  File "/home/ramiro/repos/pub/bookmark-tools/local/lib/python2.7/site-packages/extraction/", line 248, in extract
    return self.extracted_class(**self.cleanup(extracted, html, source_url=source_url))
  File "/home/ramiro/repos/pub/bookmark-tools/local/lib/python2.7/site-packages/extraction/", line 211, in cleanup
    data_values = [self.cleanup_text(x) for x in data_values]
  File "/home/ramiro/repos/pub/bookmark-tools/local/lib/python2.7/site-packages/extraction/", line 183, in cleanup_text
    return " ".join(value.split())
AttributeError: 'NoneType' object has no attribute 'split'

Awesome! Much appreciated, merging it in.

@lethain lethain merged commit 6bdedc0 into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jan 24, 2013
  1. @yaph

    Don't throw exception when HTML title is empty, added test to make …

    yaph authored
    …sure 1st h1 (if exists) is used as title
This page is out of date. Refresh to see the latest.
4 extraction/
@@ -207,7 +207,7 @@ def cleanup(self, results, html, source_url=None):
cleaned_results = {}
for data_type, data_values in results.items():
if data_type in self.text_types:
- data_values = [self.cleanup_text(x) for x in data_values]
+ data_values = [self.cleanup_text(x) for x in filter(None, data_values)]
if data_type in self.url_types:
data_values = [self.cleanup_url(x, source_url=source_url) for x in data_values]
@@ -218,7 +218,7 @@ def cleanup(self, results, html, source_url=None):
cleaned_results[data_type] = unique_values
return cleaned_results
def extract(self, html, source_url=None):
12 extraction/tests/
@@ -146,3 +146,15 @@
+ <head>
+ <title></title>
+ </head>
+ <body>
+ <h2>H2</h2>
+ <h1>H1</h1>
+ <h1>H1 2</h1>
+ </body>
4 extraction/tests/
@@ -119,6 +119,10 @@ def test_example_new_return_type(self):
self.assertEqual(extracted.description, None)
self.assertEqual(extracted.image, None)
+ def test_empty_title(self):
+ "Test that HTML with an empty title sets first h1 heading as title."
+ extracted = self.extractor.extract(EMPTY_TITLE_HTML)
+ self.assertEqual(extracted.title, "H1")
if __name__ == '__main__':
Something went wrong with that request. Please try again.