0.4.6 release

jvanasco · Jul 26, 2012 · 9e36e4c · 9e36e4c
1 parent 7d6989c
commit 9e36e4c
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 4 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,3 +1,6 @@
+0.4.6
+	- realized that some servers return gzip-compressed content even though this client did not advertise that it accepts it; fixed by borrowing some ideas from Mark Pilgrim's feedparser.  metadata_parser now advertises gzip and deflate (zlib) in its Accept-Encoding header, and decompresses responses as needed
+
 0.4.5
 	- fixed a bug that prevented toplevel directories from being parsed
 

diff --git a/metadata_parser/__init__.py b/metadata_parser/__init__.py
@@ -1,4 +1,7 @@
+import gzip
+import zlib
 import re
+import struct
 import urllib2
 import urlparse
 
@@ -7,6 +10,14 @@
 except:
     from BeautifulSoup import BeautifulSoup
 
+try:
+    from io import BytesIO as _StringIO
+except ImportError:
+    try:
+        from cStringIO import StringIO as _StringIO
+    except ImportError:
+        from StringIO import StringIO as _StringIO
+
 
 RE_url= re.compile("""(https?\:\/\/[^\/]*(?:\:[\d]+)?)?(.*)""", re.I)
 
@@ -134,15 +145,38 @@ def fetch_url(self, url_data=None, url_headers=None , force_parse=False ):
                         pass
                     else:
                         raise NotParsable("I don't know what this file is")
-        raw= None
+
+
+        ## borrowing some ideas from http://code.google.com/p/feedparser/source/browse/trunk/feedparser/feedparser.py#3701
+
         req= None
+        raw= None
+        http_headers = {}
         if url_data or url_headers:
             req = urllib2.Request(self.url, url_data, url_headers)
+            req.add_header('Accept-encoding', 'gzip, deflate')
             raw = CustomHTTPRedirectOpener.open(req)
         else:
             req = urllib2.Request(self.url)
+            req.add_header('Accept-encoding', 'gzip, deflate')
             raw = CustomHTTPRedirectOpener.open(req)
+
+
         html = raw.read()
+
+        # lowercase all of the HTTP header names for comparisons (header names are case-insensitive per RFC 2616)
+        http_headers = dict((k.lower(), v) for k, v in raw.headers.items())
+        if 'gzip' in http_headers.get('content-encoding', ''):
+            try:
+                html = gzip.GzipFile(fileobj=_StringIO(html)).read()
+            except (IOError, struct.error), e:
+                raise
+        elif 'deflate' in http_headers.get('content-encoding', ''):
+            try:
+                html = zlib.decompress(html)
+            except zlib.error, e:
+                raise
+
         self.url_actual= raw.geturl()
         self.url_info= raw.info()
         return html
@@ -176,6 +210,7 @@ def parser(self, html, force_parse=False ):
         """parses the html
         """
         if not isinstance(html,BeautifulSoup):
+            html = unicode(html,errors='ignore')
             try:
                 doc = BeautifulSoup(html,"lxml")
             except:
@@ -186,8 +221,8 @@ def parser(self, html, force_parse=False ):
         try:
             ogs = doc.html.head.findAll(property=re.compile(r'^og'))
             for og in ogs:
-                self.metadata['og'][og[u'property'][3:]]=og[u'content']
-        except AttributeError:
+                self.metadata['og'][og[u'property'][3:]] = og[u'content']
+        except ( AttributeError , KeyError ):
             pass
 
         # pull the text off the title

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 import sys, os
 
-version = '0.4.5'
+version = '0.4.6'
 
 setup(name='metadata_parser',
       version=version,