Permalink
Browse files

Merge pull request #1 from yaph/develop

Don't throw an exception when the HTML title is empty
  • Loading branch information...
2 parents e0734c3 + 87bbaf1 commit 6bdedc0aec5a9514e36267de2ec5754c3ded1cf9 @lethain committed Jan 25, 2013
Showing with 18 additions and 2 deletions.
  1. +2 −2 extraction/__init__.py
  2. +12 −0 extraction/tests/data.py
  3. +4 −0 extraction/tests/tests.py
@@ -207,7 +207,7 @@ def cleanup(self, results, html, source_url=None):
cleaned_results = {}
for data_type, data_values in results.items():
if data_type in self.text_types:
- data_values = [self.cleanup_text(x) for x in data_values]
+ data_values = [self.cleanup_text(x) for x in filter(None, data_values)]
if data_type in self.url_types:
data_values = [self.cleanup_url(x, source_url=source_url) for x in data_values]
@@ -218,7 +218,7 @@ def cleanup(self, results, html, source_url=None):
unique_values.append(data_value)
cleaned_results[data_type] = unique_values
-
+
return cleaned_results
def extract(self, html, source_url=None):
@@ -146,3 +146,15 @@
</body>
</html>
"""
+
+EMPTY_TITLE_HTML = """
+<html>
+ <head>
+ <title></title>
+ </head>
+ <body>
+ <h2>H2</h2>
+ <h1>H1</h1>
+ <h1>H1 2</h1>
+ </body>
+</html>"""
@@ -119,6 +119,10 @@ def test_example_new_return_type(self):
self.assertEqual(extracted.description, None)
self.assertEqual(extracted.image, None)
+ def test_empty_title(self):
+ "Test that HTML with an empty title sets first h1 heading as title."
+ extracted = self.extractor.extract(EMPTY_TITLE_HTML)
+ self.assertEqual(extracted.title, "H1")
if __name__ == '__main__':
unittest.main()

0 comments on commit 6bdedc0

Please sign in to comment.