include the collection in Memento Link outputs: (#259)

* include the collection in Memento Link outputs: - add new cdx 'source-coll' field, storing only the collection - ensure rel="collection" property included in the TimeMap and Link header - tests: update all tests to include the 'source-coll' property - docs: add 'collection provenance' to auto-all collection configuration docs
webrecorder · Oct 23, 2017 · 459cd70 · 459cd70
1 parent 9d681d1
commit 459cd70
Show file tree

Hide file tree

Showing 14 changed files with 83 additions and 25 deletions.
diff --git a/config.yaml b/config.yaml
@@ -3,6 +3,7 @@
 #
 
 collections:
+    all: $all
     pywb:
         index_paths: ./sample_archive/cdx/
         archive_paths: ./sample_archive/warcs/

diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst
@@ -230,7 +230,43 @@ The aggregate all collections automatically aggregates data from all collections
 
 Accessing ``/all/<url>`` will cause an aggregate lookup within the collections directory.
 
-Note: It is not (yet) possible to exclude collections from the all collection, although "special" collections are not included.
+Note: It is not (yet) possible to exclude collections from the auto-all collection, although "special" collections are not included.
+
+Collection Provenance
+"""""""""""""""""""""
+
+When using the auto-all collection, it is possible to determine the original collection of each resource by looking at the ``Link`` header metadata
+if the Memento API is enabled. The ``rel="memento"`` link in the header will include an extra ``collection`` attribute, specifying the collection::
+
+  Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:53:27 GMT"; collection="coll-1"
+
+
+For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
+``/all/timemap/link/http://example.com/`` might look as follows::
+
+  <http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
+  <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
+  <http://example.com/>; rel="original",
+  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
+  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2"
+
+Identifying the Collections
+"""""""""""""""""""""""""""
+
+When using the "all" collection, it is possible to determine the actual collection of each url by looking at the ``Link`` header metadata,
+which, in addition to the Memento relations, includes an extra ``collection`` attribute on the ``rel="memento"`` link, specifying the collection::
+
+  Link: <http://example.com/>; rel="original", <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate", <http://localhost:8080/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://localhost:8080/all/20170920185327mp_/http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 18:53:27 GMT"; collection="coll-1"
+
+
+For example, if two collections ``coll-1`` and ``coll-2`` contain ``http://example.com/``, loading the timemap for
+``/all/timemap/link/http://example.com/`` might look as follows::
+
+  <http://localhost:8080/all/timemap/link/http://example.com/>; rel="self"; type="application/link-format"; from="Wed, 20 Sep 2017 03:53:27 GMT",
+  <http://localhost:8080/all/mp_/http://example.com/>; rel="timegate",
+  <http://example.com/>; rel="original",
+  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 03:53:27 GMT"; collection="coll-1",
+  <http://example.com/>; rel="memento"; datetime="Wed, 20 Sep 2017 04:53:27 GMT"; collection="coll-2"
 
 
 Generic Collection Definitions

diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py
@@ -311,7 +311,7 @@ def render_content(self, wb_url, kwargs, environ):
         if not is_ajax and self.enable_memento:
             self._add_memento_links(cdx['url'], full_prefix,
                                     memento_dt, cdx['timestamp'], status_headers,
-                                    is_timegate, is_proxy)
+                                    is_timegate, is_proxy, cdx.get('source-coll'))
 
             set_content_loc = True
 
@@ -344,7 +344,7 @@ def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
         return response
 
     def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
-                           status_headers, is_timegate, is_proxy):
+                           status_headers, is_timegate, is_proxy, coll=None):
 
         # memento url + header
         if not memento_dt and memento_ts:
@@ -370,7 +370,7 @@ def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
             link.append(MementoUtils.make_link(timemap_url, 'timemap'))
 
         if memento_dt:
-            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt))
+            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))
 
         link_str = ', '.join(link)
 

diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py
@@ -70,12 +70,10 @@ def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n')
         if not url:
             url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
 
-        memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
-
         if not datetime:
             datetime = timestamp_to_http_date(cdx['timestamp'])
 
-        return memento.format(url, rel, datetime, cdx.get('source', ''))
+        return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end
 
     @classmethod
     def make_timemap(cls, cdx_iter):
@@ -113,7 +111,11 @@ def make_link(cls, url, type):
         return '<{0}>; rel="{1}"'.format(url, type)
 
     @classmethod
-    def make_memento_link(cls, url, type, dt):
-        return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
+    def make_memento_link(cls, url, type, dt, coll=None):
+        res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt)
+        if coll:
+            res += '; collection="{0}"'.format(coll)
+
+        return res
 
 
diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py
@@ -55,25 +55,31 @@ def load_child_source(self, name, source, params):
             cdx_iter = iter([])
             err_list = [(name, repr(wbe))]
 
-        def add_name(cdx, name):
+        def add_source(cdx, name):
             if not cdx.get('url'):
                 return cdx
 
             if cdx.get('source'):
                 cdx['source'] = name + ':' + cdx['source']
             else:
                 cdx['source'] = name
+
+            cdx['source-coll'] = self._get_coll(name)
+
             return cdx
 
         if params.get('nosource') != 'true':
             src_coll = params.get('param.' + name + '.src_coll')
             if src_coll:
                 name += ':' + src_coll
 
-            cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
+            cdx_iter = (add_source(cdx, name) for cdx in cdx_iter)
 
         return cdx_iter, err_list
 
+    def _get_coll(self, name):
+        return name
+
     def load_index(self, params):
         res_list = self._load_all(params)
 
@@ -295,6 +301,9 @@ def _load_files_single_dir(self, the_dir):
 
                 yield full_name, index_src
 
+    def _get_coll(self, name):
+        return name.split(os.path.sep, 1)[0]
+
     def __repr__(self):
         return '{0}(file://{1})'.format(self.__class__.__name__,
                                         os.path.join(self.base_prefix, self.base_dir))

diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py
@@ -41,6 +41,7 @@ def get_expected(self, url, mime='text/html', filters=None):
                'is_fuzzy': True,
                'urlkey': canonicalize(url),
                'source': 'source',
+               'source-coll': 'source',
                'url': url,
                'mime': mime}]
 

diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py
@@ -41,9 +41,10 @@ def __call__(self, filename, cdx):
         if '*' not in path:
             return path
 
-        res_path = self.resolve_coll(path, cdx.get('source'))
-        if res_path:
-            return res_path
+        #res_path = self.resolve_coll(path, cdx.get('source'))
+        coll = cdx.get('source-coll')
+        if coll:
+            return path.replace('*', coll)
 
         if '://' in path:
             return path

diff --git a/pywb/warcserver/resource/test/test_pathresolvers.py b/pywb/warcserver/resource/test/test_pathresolvers.py
@@ -35,6 +35,7 @@ def test_resolver_dir_wildcard_with_coll(self):
 
         cdx = CDXObject()
         cdx['source'] = 'my-coll/indexes/index.cdxj'
+        cdx['source-coll'] = 'my-coll'
 
         res = resolver('example.warc.gz', cdx)
         assert res == 's3://bucket/colls/my-coll/archives/example.warc.gz'

diff --git a/pywb/warcserver/test/test_handlers.py b/pywb/warcserver/test/test_handlers.py
@@ -142,7 +142,8 @@ def test_live_index(self):
         cdxlist = list([json.loads(cdx) for cdx in resp.text.rstrip().split('\n')])
         cdxlist[0]['timestamp'] = '2016'
         assert(cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
-                            'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
+                            'mime': '', 'load_url': 'http://httpbin.org/get',
+                            'source': 'live', 'source-coll': 'live', 'timestamp': '2016'}])
 
     def test_live_resource(self):
         headers = {'foo': 'bar'}

diff --git a/tests/memento_fixture.py b/tests/memento_fixture.py
@@ -23,8 +23,10 @@ def make_timegate_link(self, url, fmod='', coll='pywb'):
         format_ = '<http://localhost:80/{2}/{1}{0}>; rel="timegate"'
         return format_.format(url, fmod_slash, coll)
 
-    def make_memento_link(self, url, ts, dt, fmod='', coll='pywb'):
+    def make_memento_link(self, url, ts, dt, fmod='', coll='pywb', include_coll=True):
         format_ = '<http://localhost:80/{4}/{1}{3}/{0}>; rel="memento"; datetime="{2}"'
+        if include_coll:
+            format_ += '; collection="{4}"'
         return format_.format(url, ts, dt, fmod, coll)
 
 
diff --git a/tests/test_cdx_server_app.py b/tests/test_cdx_server_app.py
@@ -181,7 +181,7 @@ def test_resolveRevisits(self):
         originals = {}
         for cdx in cdxes:
             cdx = CDXObject(cdx.encode('utf-8'))
-            assert len(cdx) == 15
+            assert len(cdx) == 16
 
             # orig.* fields are either all '-' or (int, int, filename)
             # check if orig.* fields are equals to corresponding fields

diff --git a/tests/test_memento.py b/tests/test_memento.py
@@ -54,7 +54,7 @@ def test_memento_top_frame(self):
 
         links = self.get_links(resp)
 
-        assert self.make_memento_link(url, '20140127171238', dt, 'mp_') in links
+        assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
 
         #timegate link
         assert self.make_timegate_link(url, 'mp_') in links
@@ -131,8 +131,8 @@ def test_timemap(self):
 <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
 <http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate",
 <http://example.com?example=1>; rel="original",
-<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; src="pywb:example.cdx",
-<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; src="pywb:example.cdx"
+<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
+<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
 """
         assert exp == resp.text
 
@@ -148,8 +148,8 @@ def test_timemap_cdxj(self):
         resp.charset = 'utf-8'
 
         exp = """\
-com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
-com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx"}
+com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "1043", "offset": "333", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
+com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "status": "-", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "redirect": "-", "robotflags": "-", "length": "553", "offset": "1864", "filename": "example.warc.gz", "source": "pywb:example.cdx", "source-coll": "pywb"}
 """
         assert exp == resp.text
 

diff --git a/tests/test_proxy.py b/tests/test_proxy.py
@@ -60,7 +60,7 @@ def test_proxy_replay(self, scheme):
         assert 'WB Insert' in res.text
         assert 'Example Domain' in res.text
 
-        assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"'
+        assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
         assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
 
 

diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py
@@ -97,15 +97,19 @@ def test_cdx_all_coll(self):
         assert cdxj_lines[1]['source'] == to_path('test2/indexes/autoindex.cdxj')
         assert cdxj_lines[2]['source'] == to_path('test/indexes/autoindex.cdxj')
 
+        assert cdxj_lines[0]['source-coll'] == 'test'
+        assert cdxj_lines[1]['source-coll'] == 'test2'
+        assert cdxj_lines[2]['source-coll'] == 'test'
+
         assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
 
     def test_timemap_all_coll(self):
         res = self.testapp.get('/all/timemap/link/http://httpbin.org/get?C=D')
         link_lines = res.text.rstrip().split('\n')
         assert len(link_lines) == 5
 
-        assert to_path('test2/indexes/autoindex.cdxj') in link_lines[3]
-        assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
+        assert to_path('collection="test2"') in link_lines[3]
+        assert to_path('collection="test"') in link_lines[4]
 
 
 # ============================================================================