Skip to content

Commit

Permalink
0.4.6 release
Browse files Browse the repository at this point in the history
  • Loading branch information
jvanasco committed Jul 26, 2012
1 parent 7d6989c commit 9e36e4c
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.4.6
- fixed handling of servers that return gzip-compressed content even though the client did not advertise support for it, adapting ideas from Mark Pilgrim's feedparser. metadata_parser now sends an Accept-Encoding header advertising gzip and deflate, and decompresses response bodies as needed

0.4.5
- fixed a bug that prevented toplevel directories from being parsed

Expand Down
41 changes: 38 additions & 3 deletions metadata_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import gzip
import zlib
import re
import struct
import urllib2
import urlparse

Expand All @@ -7,6 +10,14 @@
except:
from BeautifulSoup import BeautifulSoup

try:
from io import BytesIO as _StringIO
except ImportError:
try:
from cStringIO import StringIO as _StringIO
except ImportError:
from StringIO import StringIO as _StringIO


# Splits a URL into (scheme + host[:port], path).  Group 1 is the optional
# "http(s)://host[:port]" prefix (None for relative URLs); group 2 is the
# remainder.  The pattern is now a raw string: the original non-raw literal
# contained invalid escape sequences (\:, \/, \d), which Python 3 flags with
# a DeprecationWarning/SyntaxWarning.  The pattern bytes are unchanged.
RE_url = re.compile(r"""(https?\:\/\/[^\/]*(?:\:[\d]+)?)?(.*)""", re.I)

Expand Down Expand Up @@ -134,15 +145,38 @@ def fetch_url(self, url_data=None, url_headers=None , force_parse=False ):
pass
else:
raise NotParsable("I don't know what this file is")
raw= None


## borrowing some ideas from http://code.google.com/p/feedparser/source/browse/trunk/feedparser/feedparser.py#3701

req= None
raw= None
http_headers = {}
if url_data or url_headers:
req = urllib2.Request(self.url, url_data, url_headers)
req.add_header('Accept-encoding', 'gzip, deflate')
raw = CustomHTTPRedirectOpener.open(req)
else:
req = urllib2.Request(self.url)
req.add_header('Accept-encoding', 'gzip, deflate')
raw = CustomHTTPRedirectOpener.open(req)


html = raw.read()

# lowercase all of the HTTP headers for comparisons per RFC 2616
http_headers = dict((k.lower(), v) for k, v in raw.headers.items())
if 'gzip' in http_headers.get('content-encoding', ''):
try:
html = gzip.GzipFile(fileobj=_StringIO(html)).read()
except (IOError, struct.error), e:
raise
elif 'deflate' in http_headers.get('content-encoding', ''):
try:
html = zlib.decompress(html)
except zlib.error, e:
raise

self.url_actual= raw.geturl()
self.url_info= raw.info()
return html
Expand Down Expand Up @@ -176,6 +210,7 @@ def parser(self, html, force_parse=False ):
"""parses the html
"""
if not isinstance(html,BeautifulSoup):
html = unicode(html,errors='ignore')
try:
doc = BeautifulSoup(html,"lxml")
except:
Expand All @@ -186,8 +221,8 @@ def parser(self, html, force_parse=False ):
try:
ogs = doc.html.head.findAll(property=re.compile(r'^og'))
for og in ogs:
self.metadata['og'][og[u'property'][3:]]=og[u'content']
except AttributeError:
self.metadata['og'][og[u'property'][3:]] = og[u'content']
except ( AttributeError , KeyError ):
pass

# pull the text off the title
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
import sys, os

version = '0.4.5'
version = '0.4.6'

setup(name='metadata_parser',
version=version,
Expand Down

0 comments on commit 9e36e4c

Please sign in to comment.