Permalink
Browse files

0.4.6 release

  • Loading branch information...
1 parent 7d6989c commit 9e36e4c5e6da3defe2c3afdf5986423fddd22c15 @jvanasco committed Jul 26, 2012
Showing with 42 additions and 4 deletions.
  1. +3 −0 CHANGELOG.txt
  2. +38 −3 metadata_parser/__init__.py
  3. +1 −1 setup.py
View
@@ -1,3 +1,6 @@
+0.4.6
+ - some servers return gzip-compressed content even when the client has not advertised that it accepts it; fixed using some ideas from Mark Pilgrim's feedparser. metadata_parser now advertises gzip and deflate (zlib) via the Accept-Encoding header, and decompresses responses as needed
+
0.4.5
- fixed a bug that prevented top-level directories from being parsed
@@ -1,4 +1,7 @@
+import gzip
+import zlib
import re
+import struct
import urllib2
import urlparse
@@ -7,6 +10,14 @@
except:
from BeautifulSoup import BeautifulSoup
+try:
+ from io import BytesIO as _StringIO
+except ImportError:
+ try:
+ from cStringIO import StringIO as _StringIO
+ except ImportError:
+ from StringIO import StringIO as _StringIO
+
RE_url= re.compile("""(https?\:\/\/[^\/]*(?:\:[\d]+)?)?(.*)""", re.I)
@@ -134,15 +145,38 @@ def fetch_url(self, url_data=None, url_headers=None , force_parse=False ):
pass
else:
raise NotParsable("I don't know what this file is")
- raw= None
+
+
+ ## borrowing some ideas from http://code.google.com/p/feedparser/source/browse/trunk/feedparser/feedparser.py#3701
+
req= None
+ raw= None
+ http_headers = {}
if url_data or url_headers:
req = urllib2.Request(self.url, url_data, url_headers)
+ req.add_header('Accept-encoding', 'gzip, deflate')
raw = CustomHTTPRedirectOpener.open(req)
else:
req = urllib2.Request(self.url)
+ req.add_header('Accept-encoding', 'gzip, deflate')
raw = CustomHTTPRedirectOpener.open(req)
+
+
html = raw.read()
+
+ # lowercase all of the HTTP headers for comparisons per RFC 2616
+ http_headers = dict((k.lower(), v) for k, v in raw.headers.items())
+ if 'gzip' in http_headers.get('content-encoding', ''):
+ try:
+ html = gzip.GzipFile(fileobj=_StringIO(html)).read()
+ except (IOError, struct.error), e:
+ raise
+ elif 'deflate' in http_headers.get('content-encoding', ''):
+ try:
+ html = zlib.decompress(html)
+ except zlib.error, e:
+ raise
+
self.url_actual= raw.geturl()
self.url_info= raw.info()
return html
@@ -176,6 +210,7 @@ def parser(self, html, force_parse=False ):
"""parses the html
"""
if not isinstance(html,BeautifulSoup):
+ html = unicode(html,errors='ignore')
try:
doc = BeautifulSoup(html,"lxml")
except:
@@ -186,8 +221,8 @@ def parser(self, html, force_parse=False ):
try:
ogs = doc.html.head.findAll(property=re.compile(r'^og'))
for og in ogs:
- self.metadata['og'][og[u'property'][3:]]=og[u'content']
- except AttributeError:
+ self.metadata['og'][og[u'property'][3:]] = og[u'content']
+ except ( AttributeError , KeyError ):
pass
# pull the text off the title
View
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
import sys, os
-version = '0.4.5'
+version = '0.4.6'
setup(name='metadata_parser',
version=version,

0 comments on commit 9e36e4c

Please sign in to comment.