Skip to content

Commit

Permalink
fuzzy match: add support for specifying regex and args separately for
Browse files Browse the repository at this point in the history
fuzzy_lookup match
  • Loading branch information
ikreymer committed Dec 26, 2014
1 parent ffb702c commit 8d6845a
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 22 deletions.
30 changes: 10 additions & 20 deletions pywb/cdx/cdxdomainspecific.py
Expand Up @@ -13,11 +13,6 @@

#=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
"""
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
'example,example,test)/path/index.html?id=value'
"""
canon = None
fuzzy = None

Expand Down Expand Up @@ -162,24 +157,24 @@ def unsurt(self):

@staticmethod
def make_regex(config):
    """
    Compile a fuzzy-match rule setting into a regular expression.

    ``config`` may take one of three forms:

    * list -- query argument names; the pattern is whatever
      ``make_query_match_regex`` builds from them.
    * dict -- an optional ``'regex'`` base pattern, with the
      ``make_query_match_regex`` pattern for the optional ``'args'``
      list appended to it.
    * anything else -- converted with ``str()`` and used as the
      pattern itself.

    Returns the object produced by ``re.compile``.
    """
    # just query args
    if isinstance(config, list):
        # make_query_match_regex sorts its argument in place; pass a copy
        # so the caller's rule configuration is left untouched.
        string = CDXDomainSpecificRule.make_query_match_regex(list(config))

    # split out base and args
    elif isinstance(config, dict):
        # 'or' also covers keys that are present but empty (None).
        string = config.get('regex') or ''
        args = config.get('args') or []
        string += CDXDomainSpecificRule.make_query_match_regex(list(args))

    # else assume string
    else:
        string = str(config)

    return re.compile(string)

@staticmethod
def make_query_match_regex(params_list):
r"""
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
"""
params_list.sort()

def conv(value):
Expand All @@ -188,8 +183,3 @@ def conv(value):
params_list = map(conv, params_list)
final_str = '.*'.join(params_list)
return final_str


if __name__ == "__main__":
import doctest
doctest.testmod()
40 changes: 40 additions & 0 deletions pywb/cdx/test/test_cdxdomainspecific.py
@@ -0,0 +1,40 @@
r"""
Doctests for pywb.cdx.cdxdomainspecific.

A blank line separates each expected-output line from the commentary that
follows it; without one, doctest treats the commentary as part of the
expected output.


# Load Rules
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
'example,example,test)/path/index.html?id=value'


# Fuzzy Query Args Builder
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'


# Fuzzy Match Query + Args

# list
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

# dict
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

# string
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'

"""


from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules


# Run the doctests in the module docstring when executed as a script.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
14 changes: 12 additions & 2 deletions pywb/rules.yaml
Expand Up @@ -148,12 +148,22 @@ rules:

- url_prefix: 'com,youtube,c'

fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
fuzzy_lookup:
match:
regex: 'com,youtube,c.*/videogoodput.*'
args:
- id

- url_prefix: 'com,googlevideo,'

fuzzy_lookup:
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
match:
regex: 'com,googlevideo.*/videoplayback.*'
args:
- id
- itag
- mime

filter:
- '~urlkey:{0}'
- '!mimetype:text/plain'
Expand Down

0 comments on commit 8d6845a

Please sign in to comment.