Skip to content

Commit

Permalink
rules and fuzzy match fix:
Browse files Browse the repository at this point in the history
- rules: fix the rule by dropping the regex '~' switch from its 'urlkey' filter; add a test
- fuzzymatch filters: use a set instead of a list to avoid duplicate filters
  • Loading branch information
ikreymer committed Oct 21, 2017
1 parent 30be6f2 commit 9d681d1
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pywb/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ rules:
#- mime

filter:
- '~urlkey:{0}'
- 'urlkey:{0}'
- '!mimetype:text/plain'

type: 'domain'
Expand Down
4 changes: 2 additions & 2 deletions pywb/warcserver/index/fuzzymatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def parse_fuzzy_rule(self, rule):
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)

def get_fuzzy_match(self, urlkey, params):
filters = []
filters = set()
matched_rule = None

for rule in self.rules:
Expand All @@ -78,7 +78,7 @@ def get_fuzzy_match(self, urlkey, params):
matched_rule = rule
for g in m.groups():
for f in matched_rule.filter_str:
filters.append(f.format(g))
filters.add(f.format(g))

break

Expand Down
18 changes: 15 additions & 3 deletions pywb/warcserver/index/test/test_fuzzymatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def get_params(self, url, actual_url, mime='text/html'):
return params

def get_expected(self, url, mime='text/html', filters=None):
filters = filters or ['urlkey:']
filters = filters or {'urlkey:'}
exp = [{'filter': filters,
'is_fuzzy': True,
'urlkey': canonicalize(url),
Expand Down Expand Up @@ -102,12 +102,23 @@ def test_fuzzy_jquery_2(self):
cdx_iter, errs = self.fuzzy(self.source, params)
assert list(cdx_iter) == self.get_expected(actual_url)

def test_fuzzy_custom_rule(self):
def test_fuzzy_custom_rule_yt(self):
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234'
params = self.get_params(url, actual_url)
cdx_iter, errs = self.fuzzy(self.source, params)
filters = ['urlkey:html5=true', 'urlkey:video_id=abcd']
filters = {'urlkey:html5=true', 'urlkey:video_id=abcd'}
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)

def test_fuzzy_custom_rule_yt_2(self):
    """Check fuzzy matching of a googlevideo ``videoplayback`` URL.

    The requested URL and the actual URL differ in host name and in
    their trailing query arguments, but share the ``id`` and ``itag``
    arguments.  The expected result carries one ``urlkey:`` filter for
    each shared argument plus the ``!mimetype:text/plain`` filter.
    """
    query_url = 'https://r1---sn-xyz.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&food=abc'
    target_url = 'https://r1---sn-abcdefg.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&foo=abc&_1=2'

    lookup_params = self.get_params(query_url, target_url)
    results, _errs = self.fuzzy(self.source, lookup_params)

    # Set literal: element order here is irrelevant to the comparison.
    expected_filters = {
        '!mimetype:text/plain',
        'urlkey:itag=22',
        'urlkey:id=abcdefg',
    }

    # Drain the result iterator before building the expected value,
    # matching the evaluation order of the original assertion.
    found = list(results)
    assert found == self.get_expected(url=target_url, filters=expected_filters)

def test_no_fuzzy_custom_rule_video_id_diff(self):
Expand Down Expand Up @@ -159,3 +170,4 @@ def test_no_fuzzy_jquery_other_arg_mismatch(self):
cdx_iter, errs = self.fuzzy(self.source, params)
assert list(cdx_iter) == []


0 comments on commit 9d681d1

Please sign in to comment.