Skip to content

Commit

Permalink
include the collection in Memento Link outputs: (#259)
Browse files Browse the repository at this point in the history
* include the collection in Memento Link outputs:
- add new cdx 'source-coll' field, storing only the collection
- ensure rel="collection" property included in the TimeMap and Link header
- tests: update all tests to include the 'source-coll' property
- docs: add 'collection provenance' to auto-all collection configuration docs
  • Loading branch information
ikreymer committed Oct 23, 2017
1 parent 9d681d1 commit 459cd70
Show file tree
Hide file tree
Showing 14 changed files with 83 additions and 25 deletions.
1 change: 1 addition & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

collections:
all: $all
pywb:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
Expand Down
38 changes: 37 additions & 1 deletion docs/manual/configuring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections

Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.

Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included.
Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.

Collection Provenance
"""""""""""""""""""""

When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
if the Memento API is enabled. The ``rel="memento"`` link in the header will include an extra ``collection="<name>"`` attribute, specifying the source collection::

Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"


For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
``/all/timemap/link/http://example.com/`` might look as follows::

<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
<http://example.com/>; rel="original",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",

Identifying the Collections
""""""""""""""""""""""""""""

When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata,
which, in addition to the memento relations, includes an extra ``collection="<name>"`` attribute on the ``rel="memento"`` link, specifying the collection::

Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:20:19 GMT"; collection="coll-1"


For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
``/all/timemap/link/http://example.com/`` might look as follows::

<http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
<http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
<http://example.com/>; rel="original",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
<http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2",


Generic Collection Definitions
Expand Down
6 changes: 3 additions & 3 deletions pywb/apps/rewriterapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def render_content(self, wb_url, kwargs, environ):
if not is_ajax and self.enable_memento:
self._add_memento_links(cdx['url'], full_prefix,
memento_dt, cdx['timestamp'], status_headers,
is_timegate, is_proxy)
is_timegate, is_proxy, cdx.get('source-coll'))

set_content_loc = True

Expand Down Expand Up @@ -344,7 +344,7 @@ def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
return response

def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy):
status_headers, is_timegate, is_proxy, coll=None):

# memento url + header
if not memento_dt and memento_ts:
Expand All @@ -370,7 +370,7 @@ def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
link.append(MementoUtils.make_link(timemap_url, 'timemap'))

if memento_dt:
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))
link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))

link_str = ', '.join(link)

Expand Down
12 changes: 7 additions & 5 deletions pywb/utils/memento.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,10 @@ def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n')
if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))

memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end

if not datetime:
datetime = timestamp_to_http_date(cdx['timestamp'])

return memento.format(url, rel, datetime, cdx.get('source', ''))
return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end

@classmethod
def make_timemap(cls, cdx_iter):
Expand Down Expand Up @@ -113,7 +111,11 @@ def make_link(cls, url, type):
return '<{0}>; rel="{1}"'.format(url, type)

@classmethod
def make_memento_link(cls, url, type, dt):
return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
def make_memento_link(cls, url, type, dt, coll=None):
res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
if coll:
res += '; collection="{0}"'.format(coll)

return res


13 changes: 11 additions & 2 deletions pywb/warcserver/index/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,31 @@ def load_child_source(self, name, source, params):
cdx_iter = iter([])
err_list = [(name, repr(wbe))]

def add_name(cdx, name):
def add_source(cdx, name):
if not cdx.get('url'):
return cdx

if cdx.get('source'):
cdx['source'] = name + ':' + cdx['source']
else:
cdx['source'] = name

cdx['source-coll'] = self._get_coll(name)

return cdx

if params.get('nosource') != 'true':
src_coll = params.get('param.' + name + '.src_coll')
if src_coll:
name += ':' + src_coll

cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)

return cdx_iter, err_list

def _get_coll(self, name):
return name

def load_index(self, params):
res_list = self._load_all(params)

Expand Down Expand Up @@ -295,6 +301,9 @@ def _load_files_single_dir(self, the_dir):

yield full_name, index_src

def _get_coll(self, name):
return name.split(os.path.sep, 1)[0]

def __repr__(self):
return '{0}(file://{1})'.format(self.__class__.__name__,
os.path.join(self.base_prefix, self.base_dir))
Expand Down
1 change: 1 addition & 0 deletions pywb/warcserver/index/test/test_fuzzymatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def get_expected(self, url, mime='text/html', filters=None):
'is_fuzzy': True,
'urlkey': canonicalize(url),
'source': 'source',
'source-coll': 'source',
'url': url,
'mime': mime}]

Expand Down
7 changes: 4 additions & 3 deletions pywb/warcserver/resource/pathresolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ def __call__(self, filename, cdx):
if '*' not in path:
return path

res_path = self.resolve_coll(path, cdx.get('source'))
if res_path:
return res_path
#res_path = self.resolve_coll(path, cdx.get('source'))
coll = cdx.get('source-coll')
if coll:
return path.replace('*', coll)

if '://' in path:
return path
Expand Down
1 change: 1 addition & 0 deletions pywb/warcserver/resource/test/test_pathresolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_resolver_dir_wildcard_with_coll(self):

cdx = CDXObject()
cdx['source'] = 'my-coll/indexes/index.cdxj'
cdx['source-coll'] = 'my-coll'

res = resolver('example.warc.gz', cdx)
assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'
Expand Down
3 changes: 2 additions & 1 deletion pywb/warcserver/test/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ def test_live_index(self):
cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
cdxlist[0]['timestamp'] = '2016'
assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
'mime': '', 'load_url': 'http://httpbin.org/get',
'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])

def test_live_resource(self):
headers = {'foo': 'bar'}
Expand Down
4 changes: 3 additions & 1 deletion tests/memento_fixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ def make_timegate_link(self, url, fmod='', coll='pywb'):
format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
return format_.format(url, fmod_slash, coll)

def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'):
def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
if include_coll:
format_ += '; collection="{4}"'
return format_.format(url, ts, dt, fmod, coll)


2 changes: 1 addition & 1 deletion tests/test_cdx_server_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def test_resolveRevisits(self):
originals = {}
for cdx in cdxes:
cdx = CDXObject(cdx.encode('utf-8'))
assert len(cdx) == 15
assert len(cdx) == 16

# orig.* fields are either all '-' or (int, int, filename)
# check if orig.* fields are equals to corresponding fields
Expand Down
10 changes: 5 additions & 5 deletions tests/test_memento.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_memento_top_frame(self):

links = self.get_links(resp)

assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links
assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links

#timegate link
assert self.make_timegate_link(url, 'mp_') in links
Expand Down Expand Up @@ -131,8 +131,8 @@ def test_timemap(self):
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx"
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
"""
assert exp == resp.text

Expand All @@ -148,8 +148,8 @@ def test_timemap_cdxj(self):
resp.charset = 'utf-8'

exp = """\
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
"""
assert exp == resp.text

Expand Down
2 changes: 1 addition & 1 deletion tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_proxy_replay(self, scheme):
assert 'WB Insert' in res.text
assert 'Example Domain' in res.text

assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'


Expand Down
8 changes: 6 additions & 2 deletions tests/test_record_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,19 @@ def test_cdx_all_coll(self):
assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')

assert cdxj_lines[0]['source-coll'] == 'test'
assert cdxj_lines[1]['source-coll'] == 'test2'
assert cdxj_lines[2]['source-coll'] == 'test'

assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']

def test_timemap_all_coll(self):
res = self.testapp.get('/all/timemap/link/http://httpbin.org/get?C=D')
link_lines = res.text.rstrip().split('\n')
assert len(link_lines) == 5

assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
assert to_path('collection="test2"') in link_lines[3]
assert to_path('collection="test"') in link_lines[4]


# ============================================================================
Expand Down

0 comments on commit 459cd70

Please sign in to comment.