Skip to content

Commit

Permalink
Fuzzy Rewrite Improvements (#263)
Browse files Browse the repository at this point in the history
rules system:
- 'mixin' class for adding custom rewrite mixin, initialized with optional 'mixin_params'
- 'force_type' to always force rewriting text type for rule match (eg. if application/octet-stream)
- fuzzy rewrite: 'find_all' mode for matching via regex.findall() instead of search()
- load_function moved to generic load_py_name
- new rules for fb!
- JSReplaceFuzzy mixin to replace content based on query (or POST) regex match
- tests: add test for JSReplaceFuzzy rewriting

query:
- append '?' for fuzzy matching if filters are set
- cdx['is_fuzzy'] set to '1' instead of True

client-side: rewrite
- add window.Request object rewrite
- improved rewrite of wb server + path, avoid double-slash
- fetch() rewrite: call the original fetch() on proxy_to_obj(this)
- proxy_to_obj() null check
- WombatLocation prop change: skip the update if the new value is the same as the current one
  • Loading branch information
ikreymer committed Nov 1, 2017
1 parent 520ee35 commit bcbc00a
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 26 deletions.
20 changes: 18 additions & 2 deletions pywb/rewrite/content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from pywb.utils.io import StreamIter, BUFF_SIZE

from pywb.utils.loaders import load_yaml_config
from pywb.utils.loaders import load_yaml_config, load_py_name


# ============================================================================
Expand Down Expand Up @@ -55,6 +55,10 @@ def parse_rewrite_rule(self, config):
parse_rules_func = self.init_js_regex(regexs)
rule['js_regex_func'] = parse_rules_func

mixin = rule.get('mixin')
if mixin:
rule['mixin'] = load_py_name(mixin)

return rule

def get_rule(self, cdx):
Expand All @@ -73,6 +77,11 @@ def get_rw_class(self, rule, text_type, rwinfo):
rw_type = rule.get(text_type, text_type)
rw_class = self.get_rewriter(rw_type, rwinfo)

mixin = rule.get('mixin')
if mixin:
mixin_params = rule.get('mixin_params', {})
rw_class = type('custom_js_rewriter', (mixin, rw_class), mixin_params)

return rw_type, rw_class

def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
Expand Down Expand Up @@ -159,8 +168,15 @@ def __call__(self, record, url_rewriter, cookie_rewriter,
rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
content_rewriter = None

url_rewriter.rewrite_opts['cdx'] = cdx

rule = self.get_rule(cdx)

force_type = rule.get('force_type')
if force_type:
rwinfo.text_type = force_type

if rwinfo.should_rw_content():
rule = self.get_rule(cdx)
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

gen = None
Expand Down
40 changes: 30 additions & 10 deletions pywb/rewrite/regex_rewriters.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
import re
from pywb.rewrite.content_rewriter import StreamingRewriter


# =================================================================
def load_function(string):
    """Resolve a ``'module.path:attribute'`` string to the object it names.

    The text before the first ``:`` is imported as a module; the text after
    it is looked up as an attribute of that module and returned.
    """
    from importlib import import_module

    parts = string.split(':', 1)
    module = import_module(parts[0])
    return getattr(module, parts[1])
from pywb.utils.loaders import load_py_name
from six.moves.urllib.parse import unquote


# =================================================================
Expand Down Expand Up @@ -101,7 +94,7 @@ def parse_rule(obj):
if 'rewrite' in obj:
replace = RegexRewriter.archival_rewrite(rewriter)
elif 'function' in obj:
replace = load_function(obj['function'])
replace = load_py_name(obj['function'])
else:
replace = RegexRewriter.format(obj.get('replace', '{0}'))
group = obj.get('group', 0)
Expand Down Expand Up @@ -259,6 +252,33 @@ class JSWombatProxyRewriter(JSWombatProxyRewriterMixin, RegexRewriter):
pass


# =================================================================
class JSReplaceFuzzy(object):
    """Rewriter mixin that patches content served via a fuzzy URL match.

    The class this is mixed into must provide:

    * ``rx`` -- a regex pattern string with at least one capture group,
      compiled once per instance into ``rx_obj``;
    * ``url_rewriter`` -- an object exposing ``rewrite_opts`` (a mapping that
      may hold the matched ``'cdx'`` record) and ``wburl.url`` (the url that
      was requested).

    When the cdx record is flagged ``is_fuzzy``, ``rx`` is searched in both
    the cdx url and the requested url, and every occurrence of the value
    captured from the cdx url is replaced in the rewritten content with the
    value captured from the requested url.
    """
    rx_obj = None

    def __init__(self, *args, **kwargs):
        super(JSReplaceFuzzy, self).__init__(*args, **kwargs)
        if not self.rx_obj:
            self.rx_obj = re.compile(self.rx)

    def rewrite(self, string):
        """Rewrite ``string`` via the next class, then apply the fuzzy swap.

        Returns the parent rewriter's output unchanged when there is no cdx
        record, the record is not a fuzzy match, the pattern does not match
        both urls, or there is nothing to substitute.
        """
        string = super(JSReplaceFuzzy, self).rewrite(string)

        # The cdx entry may be absent (or None) if the rewriter was not
        # invoked through a path that attaches it; treat that as "not fuzzy".
        cdx = self.url_rewriter.rewrite_opts.get('cdx')
        if cdx is None or not cdx.get('is_fuzzy'):
            return string

        exp_m = self.rx_obj.search(unquote(cdx['url']))
        act_m = self.rx_obj.search(unquote(self.url_rewriter.wburl.url))
        if not (exp_m and act_m):
            return string

        expected = exp_m.group(1)
        actual = act_m.group(1)

        # An empty 'expected' would make str.replace() insert 'actual'
        # between every character, so only substitute a real value.
        if expected and expected != actual:
            string = string.replace(expected, actual)

        return string


# =================================================================
# Set 'default' JSRewriter
JSRewriter = JSLinkAndLocationRewriter
Expand Down
22 changes: 20 additions & 2 deletions pywb/rewrite/test/test_content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,20 @@ def _create_response_record(self, url, headers, payload, warc_headers):
warc_headers_dict=warc_headers)

def rewrite_record(self, headers, content, ts, url='http://example.com/',
prefix='http://localhost:8080/prefix/', warc_headers=None):
prefix='http://localhost:8080/prefix/', warc_headers=None,
request_url=None):

record = self._create_response_record(url, headers, content, warc_headers)

wburl = WbUrl(ts + '/' + url)
wburl = WbUrl(ts + '/' + (request_url or url))
url_rewriter = UrlRewriter(wburl, prefix)

cdx = CDXObject()
cdx['url'] = url
cdx['timestamp'] = ts
cdx['urlkey'] = canonicalize(url)
if request_url != url:
cdx['is_fuzzy'] = '1'

return self.content_rewriter(record, url_rewriter, None, cdx=cdx)

Expand Down Expand Up @@ -254,6 +257,21 @@ def test_rewrite_text_plain_as_js(self):

assert b''.join(gen).decode('utf-8') == content

def test_custom_fuzzy_replace(self):
    """A fuzzy-matched capture gets its ssid swapped for the requested one."""
    # url stored in the cdx record vs. the url used to build the WbUrl
    archived_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234'
    requested_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678'

    result = self.rewrite_record({'Content-Type': 'application/octet-stream'},
                                 '{"ssid":"1234"}',
                                 ts='201701mp_',
                                 url=archived_url,
                                 request_url=requested_url)
    headers, gen, is_rw = result

    # the Content-Type header itself is expected to be left as-is
    assert headers.headers == [('Content-Type', 'application/octet-stream')]

    body = b''.join(gen).decode('utf-8')
    assert body == '{"ssid":"5678"}'

def test_hls_default_max(self):
headers = {'Content-Type': 'application/vnd.apple.mpegurl'}
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
Expand Down
36 changes: 36 additions & 0 deletions pywb/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,38 @@ rules:

# facebook rules
#=================================================================
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerinitpagelet'

rewrite:
mixin: 'pywb.rewrite.regex_rewriters:JSReplaceFuzzy'
mixin_params:
rx: '"ssid":([\d]+)'

force_type: 'json'

fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'

- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'

#fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'

- url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php'

fuzzy_lookup:
- 'ft_ent_identifier'
- 'parent_comment_ids[0]'
- lsd

- url_prefix: 'com,facebook)/ajax/ufi/comment_fetch.php'

fuzzy_lookup:
- 'source'
- 'offset'
- 'length'
- 'ft_ent_identifier'
- 'feed_context'

- url_prefix: 'com,facebook)/ajax/ufi/'

fuzzy_lookup:
Expand Down Expand Up @@ -97,7 +124,16 @@ rules:

fuzzy_lookup: '([?&][^_]\w+=[^&]+)+'

- url_prefix: 'com,facebook)/api/graphqlbatch'

fuzzy_lookup:
match: '("q[\d]+":|after:\\"[^"]+)'
find_all: true

- url_prefix: 'com,facebook)/'

fuzzy_lookup: '([?&][^_]\w+=[^&]+)+'

rewrite:
js_regexs:
- match: 'Bootloader\.configurePage.*?;'
Expand Down
48 changes: 45 additions & 3 deletions pywb/static/wombat.js
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else {
url = "";
}
url += "/" + path;
if (path && path[0] != "/") {
url += "/";
}
url += path;
}

return url;
Expand Down Expand Up @@ -516,6 +519,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
return;
}

if (this["_" + prop] == value) {
return;
}

this["_" + prop] = value;

if (!this._parser) {
Expand Down Expand Up @@ -873,10 +880,44 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_opts = init_opts || {};
init_opts["credentials"] = "include";

return orig_fetch.call(this, input, init_opts);
return orig_fetch.call(proxy_to_obj(this), input, init_opts);
}
}


//============================================
// Replace window.Request with a wrapper that rewrites the request url
// through rewrite_url() and forces credentials to "include" before
// delegating to the native constructor. Does nothing if the window has
// no Request constructor.
function init_request_override()
{
    var orig_request = $wbwindow.Request;

    if (!orig_request) {
        return;
    }

    $wbwindow.Request = (function (Request) {
        return function(input, init_opts) {
            if (typeof(input) === "string") {
                input = rewrite_url(input);
            // 'input &&' guards null, for which typeof is also "object"
            } else if (input && typeof(input) === "object" && input.url) {
                var new_url = rewrite_url(input.url);

                if (new_url != input.url) {
                    if (input instanceof Request) {
                        // Request.url is a read-only accessor, so assigning
                        // to it has no effect: build a copy with the
                        // rewritten url instead.
                        try {
                            input = new Request(new_url, input);
                        } catch (e) {
                            // Copy not possible: keep the original request
                            // untouched, which is what the no-op assignment
                            // used to leave us with.
                        }
                    } else {
                        // plain url-bearing object: property is writable
                        input.url = new_url;
                    }
                }
            }

            init_opts = init_opts || {};
            init_opts["credentials"] = "include";

            return new Request(input, init_opts);
        }

    })($wbwindow.Request);

    // keep the wrapper's prototype pointing at the native one
    $wbwindow.Request.prototype = orig_request.prototype;
}

//============================================
function override_prop_extract(proto, prop, cond) {
var orig_getter = get_orig_getter(proto, prop);
Expand Down Expand Up @@ -2767,7 +2808,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
//============================================
function proxy_to_obj(source) {
try {
return source.__WBProxyRealObj__ || source;
return (source && source.__WBProxyRealObj__) || source;
} catch (e) {
return source;
}
Expand Down Expand Up @@ -2997,6 +3038,7 @@ var _WBWombat = function($wbwindow, wbinfo) {

// Fetch
init_fetch_rewrite();
init_request_override();

// Worker override (experimental)
init_web_worker_override();
Expand Down
13 changes: 13 additions & 0 deletions pywb/utils/canonicalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
('com,example)/path/file.html', 'com,example)/path/file.htmm')
# slash and ?
>>> calc_search_range('http://example.com/path/', 'prefix')
('com,example)/path/', 'com,example)/path0')
>>> calc_search_range('http://example.com/path?', 'prefix')
('com,example)/path?', 'com,example)/path@')
>>> calc_search_range('http://example.com/path/?', 'prefix')
('com,example)/path?', 'com,example)/path@')
>>> calc_search_range('http://example.com/path/file.html', 'host')
('com,example)/', 'com,example*')
Expand Down Expand Up @@ -158,6 +168,9 @@ def inc_last_char(x):
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'

if url.endswith('?') and not start_key.endswith('?'):
start_key += '?'

end_key = inc_last_char(start_key)

elif match_type == 'host':
Expand Down
9 changes: 9 additions & 0 deletions pywb/utils/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@
s3_avail = False


# =================================================================
def load_py_name(string):
    """Import and return the object named by a ``'module:attribute'`` string.

    ``string`` is split on its first ``:``; the left part is imported as a
    module and the right part is fetched from it with ``getattr``.
    """
    import importlib

    pieces = string.split(':', 1)
    return getattr(importlib.import_module(pieces[0]), pieces[1])


#=================================================================
def is_http(filename):
return filename.startswith(('http://', 'https://'))
Expand Down

0 comments on commit bcbc00a

Please sign in to comment.