Skip to content

Commit

Permalink
Merge pull request #5363 from hypothesis/replace-question-with-unders…
Browse files Browse the repository at this point in the history
…core

Replace ? wildcard with _
  • Loading branch information
robertknight committed Oct 11, 2018
2 parents 9efc57a + f95073f commit ed5c783
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 38 deletions.
8 changes: 4 additions & 4 deletions docs/_extra/api-reference/hypothesis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -380,15 +380,15 @@ paths:
PDF fingerprint.
`*` will match any character sequence (including an empty one),
and a `?` will match any single character. `*`s are only permitted
within the path and query parts of the URI and `?`s are only permitted
and a `_` will match any single character. `*`s are only permitted
within the path and query parts of the URI and `_`s are only permitted
within the domain, path, and query parts of the URI.
Escaping wildcards is not supported.
Examples of valid uris: `http://foo.com/*` `urn:x-pdf:*` `file://localhost/?bc.pdf`
Examples of valid uris: `http://foo.com/*` `urn:x-pdf:*` `file://localhost/_bc.pdf`
Examples of invalid uris: `*foo.com` `u?n:*` `file://*` `http://foo.com*`
Examples of invalid uris: `*foo.com` `u_n:*` `file://*` `http://foo.com*`
<mark>This feature is experimental and the API may change.</mark>
required: false
Expand Down
8 changes: 4 additions & 4 deletions h/schemas/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,14 @@ class SearchParamsSchema(colander.Schema):
PDF fingerprint.
`*` will match any character sequence (including an empty one),
and a `?` will match any single character. `*`s are only permitted
within the path and query parts of the URI and `?`s are only permitted
and a `_` will match any single character. `*`s are only permitted
within the path and query parts of the URI and `_`s are only permitted
within the domain, path, and query parts of the URI.
Escaping wildcards is not supported.
Examples of valid uris":" `http://foo.com/*` `urn:x-pdf:*` `file://localhost/?bc.pdf`
Examples of invalid uris":" `*foo.com` `u?n:*` `file://*` `http://foo.com*`
Examples of valid uris":" `http://foo.com/*` `urn:x-pdf:*` `file://localhost/_bc.pdf`
Examples of invalid uris":" `*foo.com` `u_n:*` `file://*` `http://foo.com*`
""",
)
any = colander.SchemaNode(
Expand Down
37 changes: 19 additions & 18 deletions h/search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,21 @@ def wildcard_uri_is_valid(wildcard_uri):
"""
Return True if uri contains wildcards in appropriate places, return False otherwise.
*'s are not permitted in the scheme or netloc. ?'s are not permitted in the scheme.
*'s are not permitted in the scheme or netloc. _'s are not permitted in the scheme.
"""
if "*" not in wildcard_uri and "?" not in wildcard_uri:
if "*" not in wildcard_uri and "_" not in wildcard_uri:
return False

normalized_uri = urlparse.urlparse(wildcard_uri.replace("?", ""))
normalized_uri = urlparse.urlparse(wildcard_uri)
if not normalized_uri.scheme or "*" in normalized_uri.netloc:
return False

normalized_uri = urlparse.urlparse(wildcard_uri.replace("*", ""))
if not normalized_uri.scheme:
# If a wildcard comes before the port aka: http://localhost:_3000 the request for the
# port will fail with a ValueError: invalid literal for int() with base 10: '_3000'.
try:
normalized_uri.port
except ValueError:
return False

return True


Expand Down Expand Up @@ -292,7 +294,7 @@ def __init__(self, request, separate_keys=False):
:param request: the pyramid.request object
:param separate_keys: if True will treat wildcard_uri as wildcards and uri/url
as exact match. If False will treat any values in uri/url containing wildcards
("?" or "*") as wildcard searches.
("_" or "*") as wildcard searches.
"""
self.request = request
Expand All @@ -309,8 +311,8 @@ def __call__(self, search, params):
else:
uris = popall(params, 'uri') + popall(params, 'url')
# Split into wildcard uris and non wildcard uris.
wildcard_uris = [u for u in uris if "*" in u or "?" in u]
uris = [u for u in uris if "*" not in u and "?" not in u]
wildcard_uris = [u for u in uris if "*" in u or "_" in u]
uris = [u for u in uris if "*" not in u and "_" not in u]

# Only add valid uri's to the search list.
wildcard_uris = self._normalize_uris(
Expand All @@ -336,26 +338,25 @@ def _normalize_uris(self, query_uris, normalize_method=uri.normalize):

def _wildcard_uri_normalized(self, wildcard_uri):
"""
Same as uri.normalized but it doesn't strip ending `?` from uri's.
Same as uri.normalized but it replaces _'s with ?'s after normalization.
It's possible to have a wildcard at the end of a uri, however
uri.normalize strips `?`s from the end of uris and something like
http://foo.com/* will not be normalized to http://foo.com* without
removing the `*` before normalization. To compensate for this,
we check for an ending wildcard and add it back after normalization.
Although elasticsearch uses ? we use _ since ? is a special reserved url
character and this means we can avoid dealing with normalization headaches.
While it's possible to escape `?` and `*` using \\, the uri.normalize
While it's possible to escape wildcards`using \\, the uri.normalize
converts \\ to encoded url format which does not behave the same in
elasticsearch. Thus, escaping wildcard characters is not currently
supported.
"""
# If the url is something like http://example.com/*, normalize it to
# http://example.com* so it finds all urls including the base url.
trailing_wildcard = ""
if wildcard_uri.endswith("?") or wildcard_uri.endswith("*"):
if wildcard_uri.endswith("*"):
trailing_wildcard = wildcard_uri[-1]
wildcard_uri = wildcard_uri[:-1]
wildcard_uri = uri.normalize(wildcard_uri)
wildcard_uri += trailing_wildcard
return wildcard_uri
return wildcard_uri.replace("_", "?")


class UserFilter(object):
Expand Down
26 changes: 14 additions & 12 deletions tests/h/search/query_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ class TestUriCombinedWildcardFilter():
@pytest.mark.parametrize('params,expected_ann_indexes,separate_keys', [
# Test with separate_keys = True (aka uri/url are exact match & wildcard_uri is wildcard match.)
(webob.multidict.MultiDict([("wildcard_uri", "http://bar.com/baz?45")]),
(webob.multidict.MultiDict([("wildcard_uri", "http://bar.com/baz_45")]),
[2, 3],
True),
(webob.multidict.MultiDict([("uri", "urn:x-pdf:a34480f5dbed8c4482a3a921e0196d2a"),
Expand All @@ -495,7 +495,7 @@ class TestUriCombinedWildcardFilter():
True),
# Test with separate_keys = False (aka uri/url contain both exact & wildcard matches.)
(webob.multidict.MultiDict([("uri", "http://bar.com/baz-45?")]),
(webob.multidict.MultiDict([("uri", "http://bar.com/baz-45_")]),
[1],
False),
(webob.multidict.MultiDict([("uri", "http://bar.com/*")]),
Expand Down Expand Up @@ -532,21 +532,21 @@ def test_matches(
assert sorted(result.annotation_ids) == sorted([ann_ids[ann] for ann in expected_ann_indexes])

@pytest.mark.parametrize('params,separate_keys', [
(webob.multidict.MultiDict([("wildcard_uri", "http?://bar.com")]), True),
(webob.multidict.MultiDict([("wildcard_uri", "http_://bar.com")]), True),
(webob.multidict.MultiDict([("url", "ur*n:x-pdf:*")]), False),
])
def test_ignores_urls_with_wildcards_in_the_domain(self, es_dsl_search, pyramid_request, params, separate_keys):
def test_ignores_urls_with_wildcards_at_invalid_locations(self, es_dsl_search, pyramid_request, params, separate_keys):
urifilter = query.UriCombinedWildcardFilter(pyramid_request, separate_keys)

q = urifilter(es_dsl_search, params).to_dict()

assert "should" not in q['query']['bool']

@pytest.mark.parametrize('params,separate_keys', [
(webob.multidict.MultiDict([("wildcard_uri", "http?://bar.com"),
(webob.multidict.MultiDict([("wildcard_uri", "http_://bar.com"),
("uri", "http://bar.com"),
("url", "http://baz.com")]), True),
(webob.multidict.MultiDict([("uri", "http?://bar.com"),
(webob.multidict.MultiDict([("uri", "http_://bar.com"),
("url", "http://baz.com")]), False),
])
def test_pops_params(self, es_dsl_search, pyramid_request, params, separate_keys):
Expand All @@ -567,19 +567,21 @@ def _get_search(self, search, pyramid_request, separate_keys):
@pytest.mark.parametrize('wildcard_uri,expected', [
("htt*://bar.com", False),
("*http://bar.com", False),
("http?://bar.com", False),
("http://bar?com*", False),
("?http://bar.com", False),
("http_://bar.com", False),
("http://bar_com*", False),
("_http://bar.com", False),
("http://localhost:3000*", False),
("http://localhost:_3000", False),
("http://bar*.com", False),
("file://*", False),
("https://foo.com", False),
("http://foo.com*", False),
("http://foo.com/*", True),
("urn:*", True),
("http://bar.com?foo=baz", True),
("doi:10.101?", True),
("http://example.com?", True),
("http://bar.com_foo=baz", True),
("doi:10.101_", True),
("http://example.com_", True),
("http://example.com/__/", True),
])
def test_identifies_wildcard_uri_is_valid(wildcard_uri, expected):
assert query.wildcard_uri_is_valid(wildcard_uri) == expected
Expand Down

0 comments on commit ed5c783

Please sign in to comment.