Skip to content

Commit

Permalink
cdx: cleanup and more consistent behavior for RemoteCDXServer
Browse files Browse the repository at this point in the history
RemoteCDXServer delegates filtering/processing to the remote server and simply proxies the response from the remote.
RemoteCDXSource (and its default usage with CDXServer) only fetches the unfiltered/unprocessed
stream, and the cdx ops are performed locally.
  • Loading branch information
ikreymer committed Mar 2, 2014
1 parent 739d0a6 commit 15d2cdd
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 26 deletions.
2 changes: 1 addition & 1 deletion pywb/cdx/cdxops.py
Expand Up @@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
cdx_iter = load_cdx_streams(sources, query)
cdx_iter = make_obj_iter(cdx_iter, query)

if process and query.process:
if process and not query.secondary_index_only:
cdx_iter = process_cdx(cdx_iter, query)

if perms_checker:
Expand Down
7 changes: 3 additions & 4 deletions pywb/cdx/cdxserver.py
Expand Up @@ -175,14 +175,13 @@ def __init__(self, source, **kwargs):

if isinstance(source, RemoteCDXSource):
self.source = source
elif (isinstance(source, str) and
any(source.startswith(x) for x in ['http://', 'https://'])):
self.source = RemoteCDXSource(source)
elif (isinstance(source, str) and is_http(source)):
self.source = RemoteCDXSource(source, remote_processing=True)
else:
raise Exception('Invalid remote cdx source: ' + str(source))

def load_cdx_query(self, query):
remote_iter = cdx_load(self.sources, query, process=False)
remote_iter = cdx_load([self.source], query, process=False)
return self._check_cdx_iter(remote_iter, query)

def __str__(self):
Expand Down
9 changes: 4 additions & 5 deletions pywb/cdx/cdxsource.py
Expand Up @@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource):
Only url and match type params are proxied at this time,
the stream is passed through all other filters locally.
"""
def __init__(self, filename, cookie=None, proxy_all=True):
def __init__(self, filename, cookie=None, remote_processing=False):
self.remote_url = filename
self.cookie = cookie
self.proxy_all = proxy_all
self.remote_processing = remote_processing

def load_cdx(self, query):
if self.proxy_all:
query.set_process(False)
if self.remote_processing:
remote_query = query
else:
# Only send url and matchType params to remote
remote_query = CDXQuery(url=query.url,
match_type=query.matchType)
match_type=query.match_type)

urlparams = remote_query.urlencode()

Expand Down
13 changes: 5 additions & 8 deletions pywb/cdx/query.py
Expand Up @@ -79,13 +79,6 @@ def reverse(self):
def secondary_index_only(self):
return self._get_bool('showPagedIndex')

@property
def process(self):
return self._get_bool('processOps', True)

def set_process(self, process):
self.params['processOps'] = process

def _get_bool(self, name, def_val=False):
v = self.params.get(name)
if v:
Expand All @@ -103,6 +96,10 @@ def urlencode(self):

@staticmethod
def from_wsgi_env(env):
return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))

@staticmethod
def extract_params_from_wsgi_env(env):
""" utility function to extract params and create a CDXQuery
from a WSGI environment dictionary
"""
Expand All @@ -119,4 +116,4 @@ def from_wsgi_env(env):
if name != 'filter':
params[name] = val[0]

return CDXQuery(**params)
return params
16 changes: 14 additions & 2 deletions pywb/cdx/test/cdxserver_test.py
Expand Up @@ -142,6 +142,8 @@
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- TODO: need a self-contained test
# Load remote query but filter locally
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
Expand All @@ -152,14 +154,24 @@
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
# No local filtering/processing of cdx, simply return result from remote server
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last):
AccessException: Blocked By Robots
"""

#=================================================================
from pywb.cdx.cdxserver import CDXServer
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
import os
import sys
import pprint
Expand Down
1 change: 0 additions & 1 deletion pywb/cdx/zipnum.py
Expand Up @@ -121,7 +121,6 @@ def load_cdx(self, query):
prev_size=1)

if query.secondary_index_only:
query.set_process(False)
return idx_iter
else:
blocks = self.idx_to_cdx(idx_iter, query)
Expand Down
4 changes: 2 additions & 2 deletions pywb/handlers.py
Expand Up @@ -79,8 +79,8 @@ def __init__(self, index_reader, view = None):
self.view = view if view else TextCapturesView()

def __call__(self, wbrequest):
query = CDXQuery.from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx_query(query)
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx(**params)

return self.view.render_response(wbrequest, cdx_lines)

Expand Down
3 changes: 0 additions & 3 deletions pywb/indexreader.py
Expand Up @@ -34,9 +34,6 @@ def load_for_request(self, wbrequest):

return cdxlines

def load_cdx_query(self, query):
return self.cdx_server.load_cdx_query(query)

def load_cdx(self, **params):
return self.cdx_server.load_cdx(**params)

Expand Down

0 comments on commit 15d2cdd

Please sign in to comment.