Skip to content

Commit

Permalink
Dynamic collection and 'all' collection improvements (#69)
Browse files Browse the repository at this point in the history
Support dynamic collections and the 'all' collection with remote archives (e.g. s3:// paths)
- warcserver: allow custom dynamic collections index and archive path templates via 'dyn_index_path' and 'dyn_archive_path'
- pathresolver: allow resolving wildcard path prefixes with collection, to support remote paths and avoid globbing
- warcserver: don't add fixed collections dir to source to support resolving wildcard
- pathresolver: add wildcard resolving s3 path test
- referrer unrewrite: ensure referrer not empty
  • Loading branch information
ikreymer committed Sep 29, 2017
1 parent 02f8fa9 commit 924b983
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 13 deletions.
5 changes: 3 additions & 2 deletions pywb/apps/rewriterapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,9 @@ def unrewrite_referrer(self, environ, full_prefix):

if referrer.startswith(full_prefix):
referrer = referrer[len(full_prefix):]
environ['HTTP_REFERER'] = WbUrl(referrer).url
return True
if referrer:
environ['HTTP_REFERER'] = WbUrl(referrer).url
return True

return False

Expand Down
12 changes: 12 additions & 0 deletions pywb/warcserver/resource/pathresolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,18 @@ def __init__(self, template):

def __call__(self, filename, cdx):
full_path = self.template

if hasattr(cdx, '_formatter') and cdx._formatter:
full_path = cdx._formatter.format(full_path)

path = full_path + filename
if '*' not in path:
return path

res_path = self.resolve_coll(path, cdx.get('source'))
if res_path:
return res_path

if '://' in path:
return path

Expand All @@ -49,6 +54,13 @@ def __call__(self, filename, cdx):
else:
return path

def resolve_coll(self, path, source):
    """Fill the ``*`` wildcard in ``path`` with the collection named by ``source``.

    The collection name is the first path segment of ``source``
    (``'my-coll'`` for ``'my-coll/indexes/index.cdxj'``).

    :param path: archive path or URL containing a ``*`` placeholder
    :param source: value of the cdx ``source`` field; may be empty or None
    :return: ``path`` with every ``*`` replaced by the collection name,
        or None when no source is available
    """
    if not source:
        return None

    # Imported locally so this method does not rely on a module-level import.
    import os

    # Source paths may be written with the OS path separator rather than
    # '/', so normalise first; otherwise the whole source string would be
    # taken as the collection name on such platforms.
    if os.path.sep != '/':
        source = source.replace(os.path.sep, '/')

    coll = source.split('/', 1)[0]
    return path.replace('*', coll)

def __repr__(self):
    """Return a debug string that shows this resolver's prefix template."""
    template = self.template
    return "PrefixResolver('{0}')".format(template)

Expand Down
9 changes: 9 additions & 0 deletions pywb/warcserver/resource/test/test_pathresolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ def test_resolver_dir_wildcard(self):
assert len(res) == 1
assert res[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')

def test_resolver_dir_wildcard_with_coll(self):
    """A ``*`` in a remote (s3://) prefix is filled in from the cdx ``source``.

    The expected result substitutes the first segment of the source path
    ('my-coll') for the wildcard, with the filename appended.
    """
    # Build the index record first; only its 'source' field is set here.
    record = CDXObject()
    record['source'] = 'my-coll/indexes/index.cdxj'

    resolve = DefaultResolverMixin.make_best_resolver('s3://bucket/colls/*/archives/')
    resolved = resolve('example.warc.gz', record)

    assert resolved == 's3://bucket/colls/my-coll/archives/example.warc.gz'

def test_resolver_dir_wildcard_as_file_url(self):
url = to_file_url(get_test_dir()) + '/*/'
resolver = DefaultResolverMixin.make_best_resolver(url)
Expand Down
12 changes: 6 additions & 6 deletions pywb/warcserver/warcserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ def load_auto_colls(self):
print('No Root Dir, Skip Auto Colls!')
return

#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep)

dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ,
name=self.root_dir)
base_dir=self.indexes_templ)

self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep)
if '://' not in self.archive_templ:
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)

handler = DefaultResourceHandler(dir_source, self.archive_templ)

Expand Down
10 changes: 5 additions & 5 deletions tests/test_record_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ def test_cdx_all_coll(self):
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'

assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[0]['source'] == to_path('test/indexes/autoindex.cdxj')
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')

assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']

Expand All @@ -104,7 +104,7 @@ def test_timemap_all_coll(self):
link_lines = res.text.rstrip().split('\n')
assert len(link_lines) == 5

assert to_path('_test_colls:test2/indexes/autoindex.cdxj') in link_lines[3]
assert to_path('_test_colls:test/indexes/autoindex.cdxj') in link_lines[4]
assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]


0 comments on commit 924b983

Please sign in to comment.