Skip to content

Commit

Permalink
Merge branch 'develop' for 0.6.5
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Dec 5, 2014
2 parents cc776b6 + d31a4df commit 238a45b
Show file tree
Hide file tree
Showing 17 changed files with 203 additions and 30 deletions.
17 changes: 17 additions & 0 deletions CHANGES.rst
@@ -1,3 +1,20 @@
pywb 0.6.5 changelist
~~~~~~~~~~~~~~~~~~~~~

* fix static handling when content type can not be guessed, default to 'application/octet-stream'

* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly

* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com

* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root

* don't rewrite rel=canonical links for services which rely on these

* cdx-indexer: Detect non-gzip chunk encoded .warc.gz/arc.gz archive files and show a meaningful
  error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)


pywb 0.6.4 changelist
~~~~~~~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion README.rst
@@ -1,4 +1,4 @@
PyWb 0.6.4
PyWb 0.6.5
==========

.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
Expand Down
2 changes: 1 addition & 1 deletion pywb/framework/wbrequestresponse.py
Expand Up @@ -131,7 +131,7 @@ def normalize_post_query(self):
if not self.wb_url:
return

mime = self.env.get('CONTENT_TYPE').split(';')[0]
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']

Expand Down
20 changes: 20 additions & 0 deletions pywb/rewrite/cookie_rewriter.py
Expand Up @@ -55,6 +55,24 @@ def rewrite_cookie(self, name, morsel):
return morsel


#=================================================================
class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
    """
    Rewrite cookies so they apply to the exact request path only,
    never widening scope to the collection or host root.

    Useful for live rewrite without a timestamp and to minimize
    cookie pollution: any explicit Domain or Path attribute is
    simply removed, so the browser falls back to its defaults
    (current host, current path).
    """

    def rewrite_cookie(self, name, morsel):
        # drop explicit domain so the cookie can't apply host-wide
        if morsel.get('domain'):
            del morsel['domain']
        # drop explicit path; browser defaults to the request path
        # NOTE(review): an earlier comment said "set cookie to rewritten
        # path", but the code only deletes the attribute
        if morsel.get('path'):
            del morsel['path']

        # strip max-age/expires options (inherited helper)
        self._remove_age_opts(morsel)
        return morsel
#=================================================================
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
"""
Expand All @@ -79,5 +97,7 @@ def rewrite_cookie(self, name, morsel):
def get_cookie_rewriter(cookie_scope):
    """
    Return the cookie rewriter class for the given scope name.

    'root'  -> RootScopeCookieRewriter
    'exact' -> ExactPathCookieRewriter
    any other value -> MinimalScopeCookieRewriter (the default)
    """
    scope_map = {
        'root': RootScopeCookieRewriter,
        'exact': ExactPathCookieRewriter,
    }
    return scope_map.get(cookie_scope, MinimalScopeCookieRewriter)
6 changes: 6 additions & 0 deletions pywb/rewrite/html_rewriter.py
Expand Up @@ -174,6 +174,12 @@ def _rewrite_tag_attrs(self, tag, tag_attrs):
elif attr_name == 'crossorigin':
attr_name = '_crossorigin'

# special case: <link> - don't rewrite rel=canonical links
elif tag == 'link' and attr_name == 'href':
if not self.has_attr(tag_attrs, ('rel', 'canonical')):
rw_mod = handler.get(attr_name)
attr_value = self._rewrite_url(attr_value, rw_mod)

# special case: meta tag
elif (tag == 'meta') and (attr_name == 'content'):
if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):
Expand Down
14 changes: 11 additions & 3 deletions pywb/rewrite/test/test_cookie_rewriter.py
@@ -1,4 +1,5 @@
r"""
# Default -- MinimalScopeRewriter
# No rewriting
>>> rewrite_cookie('a=b; c=d;')
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
Expand All @@ -23,17 +24,24 @@
>>> rewrite_cookie('abc@def=123')
[]
# ExactCookieRewriter
>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
[('Set-Cookie', 'some=value')]
>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
[('Set-Cookie', 'some=value')]
"""


from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
from pywb.rewrite.url_rewriter import UrlRewriter

urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')

urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')


def rewrite_cookie(cookie_str, rewriter=urlrewriter):
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter):
    """Rewrite *cookie_str* with the given cookie rewriter class and url rewriter."""
    rw = cookie_rewriter(rewriter)
    return rw.rewrite(cookie_str)

4 changes: 4 additions & 0 deletions pywb/rewrite/test/test_html_rewriter.py
Expand Up @@ -102,6 +102,10 @@
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
# don't rewrite rel=canonical
>>> parse('<link rel=canonical href="http://example.com/">')
<link rel="canonical" href="http://example.com/">
# doctype
>>> parse('<!doctype html PUBLIC "public">')
<!doctype html PUBLIC "public">
Expand Down
10 changes: 10 additions & 0 deletions pywb/rewrite/test/test_regex_rewriters.py
Expand Up @@ -45,6 +45,16 @@
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# protocol-rel escapes
>>> _test_js('"//example.com/"')
'"/web/20131010/http://example.com/"'
>>> _test_js(r'"\/\/example.com/"')
'"/web/20131010/http:\\/\\/example.com/"'
>>> _test_js(r'"\\/\\/example.com/"')
'"/web/20131010/http:\\\\/\\\\/example.com/"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'
Expand Down
15 changes: 15 additions & 0 deletions pywb/rewrite/test/test_url_rewriter.py
Expand Up @@ -50,6 +50,21 @@
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
Expand Down
22 changes: 22 additions & 0 deletions pywb/rewrite/test/test_wburl.py
Expand Up @@ -26,6 +26,13 @@
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
# Test scheme partially encoded urls
>>> repr(WbUrl('https%3A//example.com/'))
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
# Query Urls
# ======================
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
Expand Down Expand Up @@ -57,6 +64,21 @@
>>> repr(WbUrl('/example.com/'))
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
# Is_ Tests
>>> u = WbUrl('*/http://example.com/abc?def=a*')
>>> u.is_url_query()
True
>>> u.is_query()
True
>>> u2 = WbUrl('20130102im_/https:/example.com')
>>> u2.is_embed
True
>>> u2.is_replay()
True
# Error Urls
# ======================
Expand Down
6 changes: 4 additions & 2 deletions pywb/rewrite/url_rewriter.py
Expand Up @@ -17,7 +17,9 @@ class UrlRewriter(object):

PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']

def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')

def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
Expand Down Expand Up @@ -45,7 +47,7 @@ def rewrite(self, url, mod=None):

is_abs = any(url.startswith(x) for x in self.PROTOCOLS)

if url.startswith('//'):
if url.startswith(self.REL_SCHEME):
is_abs = True
url = 'http:' + url

Expand Down
11 changes: 11 additions & 0 deletions pywb/rewrite/wburl.py
Expand Up @@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')

DEFAULT_SCHEME = 'http://'

PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)

# ======================

def __init__(self, url):
Expand All @@ -99,6 +102,14 @@ def __init__(self, url):
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')
if inx < 0:
# check for other partially encoded variants
m = self.PARTIAL_ENC_RX.match(self.url)
if m:
len_ = len(m.group(0))
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
inx = self.url.find(':/')

if inx < 0:
self.url = self.DEFAULT_SCHEME + self.url
else:
Expand Down
82 changes: 63 additions & 19 deletions pywb/warc/archiveiterator.py
Expand Up @@ -21,6 +21,25 @@ class ArchiveIterator(object):
"""

GZIP_ERR_MSG = """
ERROR: Non-chunked gzip file detected, gzip block continues
beyond single record.
This file is probably not a multi-chunk gzip but a single gzip file.
To allow seek, a gzipped {1} must have each record compressed into
a single gzip chunk and concatenated together.
This file is likely still valid and you can use it by decompressing it:
gunzip myfile.{0}.gz
You can then also use the 'warc2warc' tool from the 'warc-tools'
package which will create a properly chunked gzip file:
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

def __init__(self, fileobj):
self.fh = fileobj

Expand All @@ -42,27 +61,34 @@ def iter_records(self, block_size=16384):
block_size=block_size)
self.offset = self.fh.tell()

next_line = None
self.next_line = None

is_valid = True

while True:
try:
record = self._next_record(next_line)
record = self._next_record(self.next_line)
if not is_valid:
self._raise_err()

yield record
except EOFError:
break

self.read_to_end(record)

# for non-compressed, consume blank lines here
if not self.reader.decompressor:
next_line = self._consume_blanklines()
if next_line is None:
# at end of file
break
if self.reader.decompressor:
is_valid = self.reader.read_next_member()

def _raise_err(self):
frmt = 'warc/arc'
if self.known_format:
frmt = self.known_format

# reset reader for next member
else:
self.reader.read_next_member()
frmt_up = frmt.upper()

msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
raise Exception(msg)

def _consume_blanklines(self):
""" Consume blank lines that are between records
Expand All @@ -72,25 +98,31 @@ def _consume_blanklines(self):
and are included in record length which is the full gzip envelope
- For uncompressed, they are between records and so are NOT part of
the record length
count empty_size so that it can be subtracted from
the record length for uncompressed
"""
empty_size = 0
while True:
line = self.reader.readline()
if len(line) == 0:
return None
return None, empty_size

if line.rstrip() == '':
self.offset = self.fh.tell() - self.reader.rem_length()
empty_size += len(line)
continue

return line
return line, empty_size

def read_to_end(self, record, compute_digest=False):
""" Read remainder of the stream
If a digester is included, update it
with the data read
"""

# already at end of this record, don't read until it is consumed
if self.member_info:
return self.member_info
return None

if compute_digest:
digester = hashlib.sha1()
Expand All @@ -114,19 +146,29 @@ def read_to_end(self, record, compute_digest=False):
- For uncompressed files, blank lines are read later,
and not included in the record length
"""
if self.reader.decompressor:
self._consume_blanklines()
#if self.reader.decompressor:
self.next_line, empty_size = self._consume_blanklines()

self.offset = self.fh.tell() - self.reader.rem_length()
#if self.offset < 0:
# raise Exception('Not Gzipped Properly')

if self.next_line:
self.offset -= len(self.next_line)

length = self.offset - curr_offset

if not self.reader.decompressor:
length -= empty_size

if compute_digest:
digest = base64.b32encode(digester.digest())
else:
digest = None

self.member_info = (curr_offset, length, digest)
return self.member_info
#return self.member_info
#return next_line

def _next_record(self, next_line):
""" Use loader to parse the record from the reader stream
Expand Down Expand Up @@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):

entry.post_query = post_query

entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest)
entry.set_rec_info(*arcv_iter.member_info)
entry.record = record

yield entry
Expand Down

0 comments on commit 238a45b

Please sign in to comment.