Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
  • 2 commits
  • 3 files changed
  • 0 comments
  • 2 contributors
Jan 24, 2013
Ramiro Gómez Don't throw exception when HTML title is empty, added test to make …
…sure 1st h1 (if exists) is used as title
87bbaf1
Will Larson Merge pull request #1 from yaph/develop
Don't throw exception when HTML title is empty
6bdedc0
4  extraction/__init__.py
@@ -207,7 +207,7 @@ def cleanup(self, results, html, source_url=None):
207 207
         cleaned_results = {}
208 208
         for data_type, data_values in results.items():
209 209
             if data_type in self.text_types:
210  
-                data_values = [self.cleanup_text(x) for x in data_values]
  210
+                data_values = [self.cleanup_text(x) for x in filter(None, data_values)]
211 211
             if data_type in self.url_types:
212 212
                 data_values = [self.cleanup_url(x, source_url=source_url) for x in data_values]
213 213
 
@@ -218,7 +218,7 @@ def cleanup(self, results, html, source_url=None):
218 218
                     unique_values.append(data_value)
219 219
 
220 220
             cleaned_results[data_type] = unique_values
221  
-        
  221
+
222 222
         return cleaned_results
223 223
 
224 224
     def extract(self, html, source_url=None):
12  extraction/tests/data.py
@@ -146,3 +146,15 @@
146 146
           </body>
147 147
         </html>
148 148
 """
  149
+
  150
+EMPTY_TITLE_HTML = """
  151
+<html>
  152
+  <head>
  153
+    <title></title>
  154
+  </head>
  155
+  <body>
  156
+    <h2>H2</h2>
  157
+    <h1>H1</h1>
  158
+    <h1>H1 2</h1>
  159
+  </body>
  160
+</html>"""
4  extraction/tests/tests.py
@@ -119,6 +119,10 @@ def test_example_new_return_type(self):
119 119
         self.assertEqual(extracted.description, None)
120 120
         self.assertEqual(extracted.image, None)
121 121
 
  122
+    def test_empty_title(self):
  123
+        "Test that HTML with an empty title sets first h1 heading as title."
  124
+        extracted = self.extractor.extract(EMPTY_TITLE_HTML)
  125
+        self.assertEqual(extracted.title, "H1")
122 126
 
123 127
 if __name__ == '__main__':
124 128
     unittest.main()

No commit comments for this range

Something went wrong with that request. Please try again.