Skip to content

Commit

Permalink
Merge pull request #5275 from hypothesis/add-uri-wildcard-filter
Browse files Browse the repository at this point in the history
Add wildcard uri filter
  • Loading branch information
Hannah Stepanek committed Sep 18, 2018
2 parents 682764d + e975ca1 commit 0ef9193
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 15 deletions.
10 changes: 6 additions & 4 deletions h/activity/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
from h.models import Annotation, Group
from h.search import Search
from h.search import parser
from h.search import TopLevelAnnotationsFilter
from h.search import AuthorityFilter
from h.search import TagsAggregation
from h.search import UsersAggregation
from h.search import (TopLevelAnnotationsFilter,
AuthorityFilter,
TagsAggregation,
UsersAggregation,
UriFilter)


class ActivityResults(namedtuple('ActivityResults', [
Expand Down Expand Up @@ -167,6 +168,7 @@ def _execute_search(request, query, page_size):
search = Search(request, stats=request.stats)
search.append_modifier(AuthorityFilter(authority=request.default_authority))
search.append_modifier(TopLevelAnnotationsFilter())
search.append_modifier(UriFilter(request=request))
for agg in aggregations_for(query):
search.append_aggregation(agg)

Expand Down
12 changes: 8 additions & 4 deletions h/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,21 @@
from h.search.client import get_client
from h.search.config import init
from h.search.core import Search
from h.search.query import TopLevelAnnotationsFilter
from h.search.query import AuthorityFilter
from h.search.query import TagsAggregation
from h.search.query import UsersAggregation
from h.search.query import (TopLevelAnnotationsFilter,
AuthorityFilter,
TagsAggregation,
UsersAggregation,
UriCombinedWildcardFilter,
UriFilter)

__all__ = (
'Search',
'TopLevelAnnotationsFilter',
'AuthorityFilter',
'TagsAggregation',
'UsersAggregation',
'UriCombinedWildcardFilter',
'UriFilter',
'get_client',
'init',
)
Expand Down
3 changes: 1 addition & 2 deletions h/search/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def __init__(self, request, separate_replies=False, stats=None, _replies_limit=2
query.Limiter(),
query.DeletedFilter(),
query.AuthFilter(request),
query.UriFilter(request),
query.GroupFilter(),
query.GroupAuthFilter(request),
query.UserFilter(),
Expand Down Expand Up @@ -107,7 +106,7 @@ def _search(self, modifiers, aggregations, params):
return response

def _search_annotations(self, params):
# If seperate_replies is True, don't return any replies to annotations.
# If separate_replies is True, don't return any replies to annotations.
modifiers = self._modifiers
if self.separate_replies:
modifiers = [query.TopLevelAnnotationsFilter()] + modifiers
Expand Down
127 changes: 126 additions & 1 deletion h/search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from dateutil.parser import parse
from dateutil import tz
from datetime import datetime as dt
from h._compat import urlparse

from h import storage
from h.util import uri
from elasticsearch_dsl import Q
Expand All @@ -15,6 +17,40 @@
OFFSET_MAX = 9800


def wildcard_uri_is_valid(wildcard_uri):
"""
Return True if uri contains wildcards in appropriate places, return False otherwise.
Wildcards are not permitted in the scheme or netloc of the uri.
"""
if "*" not in wildcard_uri and "?" not in wildcard_uri:
return False
try:
normalized_uri = urlparse.urlparse(wildcard_uri)

# Remove all parts of the url except the scheme, netloc, and provide a substitute
# path value "p" so that uri's that only have a scheme and path are still valid.
uri_parts = (normalized_uri.scheme, normalized_uri.netloc, "p", "", "", "")

# Remove the "p" standing for path from the end of the uri.
begining_of_uri = urlparse.urlunparse(uri_parts)[:-1]

# If a wildcard was in the scheme the uri may come back as "" (a falsey value).
if begining_of_uri and wildcard_uri.startswith(begining_of_uri):
return True
except ValueError:
pass
return False


def _pop_param_values(params, key):
""" Pops and returns all values of the key in params"""
values = params.getall(key)
if values:
del params[key]
return values


class KeyValueMatcher(object):
"""
Adds any parameters as straightforward key-value matchers.
Expand Down Expand Up @@ -84,7 +120,7 @@ def __call__(self, search, params):
sort_by = "user_raw"

# Since search_after depends on the field that the annotations are
# being sorted by, it is set here rather than in a seperate class.
# being sorted by, it is set here rather than in a separate class.
search_after = params.pop("search_after", None)
if search_after:
if sort_by in ["updated", "created"]:
Expand Down Expand Up @@ -242,6 +278,95 @@ def __call__(self, search, params):
'terms', **{'target.scope': list(uris)})


class UriCombinedWildcardFilter(object):

"""
A filter that selects only annotations where the uri matches.
If separate_keys is True:
Wildcard searches are only performed on wildcard_uri's, and exact match searches
are performed on uri/url parameters.
If separate_keys is False:
uri/url parameters are expected to contain exact matches and wildcard matches.
Values containing wildcards are interpreted as wildcard searches.
If specifying a uri with wildcards:
* will match any character sequence (including an empty one), and a ? will match
any single character.
"""

def __init__(self, request, separate_keys=False):
"""Initialize a new UriFilter.
:param request: the pyramid.request object
:param separate_keys: if True will treat wildcard_uri as wildcards and uri/url
as exact match. If False will treat any values in uri/url containing wildcards
("?" or "*") as wildcard searches.
"""
self.request = request
self.separate_keys = separate_keys

def __call__(self, search, params):
# If there are no uris to search for there is nothing to do so return early.
if not ("uri" in params or "url" in params or "wildcard_uri" in params):
return search

if self.separate_keys:
uris = _pop_param_values(params, 'uri') + _pop_param_values(params, 'url')
wildcard_uris = _pop_param_values(params, 'wildcard_uri')
else:
uris = _pop_param_values(params, 'uri') + _pop_param_values(params, 'url')
# Split into wildcard uris and non wildcard uris.
wildcard_uris = [u for u in uris if "*" in u or "?" in u]
uris = [u for u in uris if "*" not in u and "?" not in u]

# Only add valid uri's to the search list.
wildcard_uris = self._normalize_uris(
[u for u in wildcard_uris if wildcard_uri_is_valid(u)],
normalize_method=self._wildcard_uri_normalized)
uris = self._normalize_uris(uris)

queries = []
if wildcard_uris:
queries = [Q('wildcard', **{"target.scope": u}) for u in wildcard_uris]
if uris:
queries.append(Q("terms", **{'target.scope': uris}))
return search.query('bool', should=queries)

def _normalize_uris(self, query_uris, normalize_method=uri.normalize):
uris = set()
for query_uri in query_uris:
expanded = storage.expand_uri(self.request.db, query_uri)

us = [normalize_method(u) for u in expanded]
uris.update(us)
return list(uris)

def _wildcard_uri_normalized(self, wildcard_uri):
"""
Same as uri.normalized but it doesn't strip ending `?` from uri's.
It's possible to have a wildcard at the end of a uri, however
uri.normalize strips `?`s from the end of uris and something like
http://foo.com/* will not be normalized to http://foo.com* without
removing the `*` before normalization. To compensate for this,
we check for an ending wildcard and add it back after normalization.
While it's possible to escape `?` and `*` using \\, the uri.normalize
converts \\ to encoded url format which does not behave the same in
elasticsearch. Thus, escaping wildcard characters is not currently
supported.
"""
trailing_wildcard = ""
if wildcard_uri.endswith("?") or wildcard_uri.endswith("*"):
trailing_wildcard = wildcard_uri[-1]
wildcard_uri = wildcard_uri[:-1]
wildcard_uri = uri.normalize(wildcard_uri)
wildcard_uri += trailing_wildcard
return wildcard_uri


class UserFilter(object):

"""
Expand Down
8 changes: 6 additions & 2 deletions h/views/api/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import newrelic.agent

from h import search as search_lib
from h.search import UriFilter
from h import storage
from h.exceptions import PayloadError
from h.events import AnnotationEvent
Expand Down Expand Up @@ -108,9 +109,12 @@ def search(request):
separate_replies = params.pop('_separate_replies', False)

stats = getattr(request, 'stats', None)
result = search_lib.Search(request,

search = search_lib.Search(request,
separate_replies=separate_replies,
stats=stats).run(params)
stats=stats)
search.append_modifier(UriFilter(request))
result = search.run(params)

svc = request.find_service(name='annotation_json_presentation')

Expand Down
4 changes: 3 additions & 1 deletion h/views/badge.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def badge(request):
count = 0
else:
query = {'uri': uri, 'limit': 0}
result = search.Search(request, stats=request.stats).run(query)
s = search.Search(request, stats=request.stats)
s.append_modifier(search.UriFilter(request))
result = s.run(query)
count = result.total

return {'total': count}
4 changes: 3 additions & 1 deletion h/views/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

def _annotations(request):
"""Return the annotations from the search API."""
result = search.Search(request, stats=request.stats).run(request.params)
s = search.Search(request, stats=request.stats)
s.append_modifier(search.UriFilter(request))
result = s.run(request.params)
return fetch_ordered_annotations(request.db, result.annotation_ids)


Expand Down
112 changes: 112 additions & 0 deletions tests/h/search/query_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,118 @@ def storage(self, patch):
return patch('h.search.query.storage')


class TestUriCombinedWildcardFilter():

@pytest.mark.parametrize('params,expected_ann_indexes,separate_keys', [
# Test with separate_keys = True (aka uri/url are exact match & wildcard_uri is wildcard match.)
(webob.multidict.MultiDict([("wildcard_uri", "http://bar.com/baz?45")]),
[2, 3],
True),
(webob.multidict.MultiDict([("uri", "urn:x-pdf:a34480f5dbed8c4482a3a921e0196d2a"),
("wildcard_uri", "http://bar.com/baz*45")]),
[2, 3, 4, 5],
True),
(webob.multidict.MultiDict([("uri", "urn:x-pdf:a34480f5dbed8c4482a3a921e0196d2a"),
("url", "http://bar.com/baz*45")]),
[3, 5],
True),
# Test with separate_keys = False (aka uri/url contain both exact & wildcard matches.)
(webob.multidict.MultiDict([("uri", "http://bar.com/baz-45?")]),
[1],
False),
(webob.multidict.MultiDict([("uri", "http://bar.com/*")]),
[0, 1, 2, 3, 4],
False),
(webob.multidict.MultiDict([("uri", "urn:x-pdf:a34480f5dbed8c4482a3a921e0196d2a"),
("uri", "http://bar.com/baz*45")]),
[2, 3, 4, 5],
False),
])
def test_matches(
self,
search,
pyramid_request,
Annotation,
params,
expected_ann_indexes,
separate_keys,
):
"""
All uri matches (wildcard and exact) are OR'd.
"""
search = self._get_search(search, pyramid_request, separate_keys)

ann_ids = [Annotation(target_uri="http://bar.com?foo").id,
Annotation(target_uri="http://bar.com/baz-457").id,
Annotation(target_uri="http://bar.com/baz-45").id,
Annotation(target_uri="http://bar.com/baz*45").id,
Annotation(target_uri="http://bar.com/baz/*/45").id,
Annotation(target_uri="urn:x-pdf:a34480f5dbed8c4482a3a921e0196d2a").id]

result = search.run(params)

assert sorted(result.annotation_ids) == sorted([ann_ids[ann] for ann in expected_ann_indexes])

@pytest.mark.parametrize('params,separate_keys', [
(webob.multidict.MultiDict([("wildcard_uri", "http?://bar.com")]), True),
(webob.multidict.MultiDict([("url", "ur*n:x-pdf:*")]), False),
])
def test_ignores_urls_with_wildcards_in_the_domain(self, pyramid_request, params, separate_keys):
urifilter = query.UriCombinedWildcardFilter(pyramid_request, separate_keys)
search = elasticsearch_dsl.Search(
using="default", index=pyramid_request.es.index
)

q = urifilter(search, params).to_dict()

assert "should" not in q['query']['bool']

@pytest.mark.parametrize('params,separate_keys', [
(webob.multidict.MultiDict([("wildcard_uri", "http?://bar.com"),
("uri", "http://bar.com"),
("url", "http://baz.com")]), True),
(webob.multidict.MultiDict([("uri", "http?://bar.com"),
("url", "http://baz.com")]), False),
])
def test_pops_params(self, pyramid_request, params, separate_keys):
urifilter = query.UriCombinedWildcardFilter(pyramid_request, separate_keys)
search = elasticsearch_dsl.Search(
using="default", index=pyramid_request.es.index
)

urifilter(search, params).to_dict()

assert "uri" not in params
assert "url" not in params
assert "wildcard_uri" not in params

def _get_search(self, search, pyramid_request, separate_keys):
search.append_modifier(query.UriCombinedWildcardFilter(
pyramid_request, separate_keys))
return search


@pytest.mark.parametrize('wildcard_uri,expected', [
("http?://bar.com", False),
("htt*://bar.com", False),
("http://localhost:3000*", False),
("http://bar*.com", False),
("http://bar?com", False),
("*?http://bar.com", False),
("file://*", False),
("https://foo.com", False),
("http://foo.com*", False),
("urn:*", True),
("urn:x-pdf:*", True),
("http://foo.com/*", True),
("doi:10.101?", True)
])
def test_identifies_wildcard_uri_is_valid(wildcard_uri, expected):
assert query.wildcard_uri_is_valid(wildcard_uri) == expected


class TestDeletedFilter(object):

def test_excludes_deleted_annotations(self, search, es_client, Annotation):
Expand Down

0 comments on commit 0ef9193

Please sign in to comment.