Disallow underscore wildcard in domain (#5372)

* Replace ? with _ in error message * Disallow _ wildcards in domain * Add note about spec support for _'s in domain * fixup: which spec * fixup: add example python url to doc string * fixup: remove trailing whitespaces * fixup: doc why wildcards at beginning of url aren't performant
hypothesis · Oct 16, 2018 · f895098 · f895098
1 parent 8860c58
commit f895098
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 27 deletions.
diff --git a/docs/_extra/api-reference/hypothesis.yaml b/docs/_extra/api-reference/hypothesis.yaml
@@ -343,17 +343,17 @@ paths:
           default: 20
         - name: search_after
           in: query
-          description: Returns results after the annotation whose sort field has this value. 
-            If specifying a date use the format yyyy-MM-dd'T'HH:mm:ss.SSX or time in 
-            milliseconds since the epoch. This is used for iteration through large collections 
+          description: Returns results after the annotation whose sort field has this value.
+            If specifying a date use the format yyyy-MM-dd'T'HH:mm:ss.SSX or time in
+            milliseconds since the epoch. This is used for iteration through large collections
             of results.
           required: false
           type: string
         - name: offset
           in: query
           description: >
-            The number of initial annotations to skip. This is used for pagination. Not 
-            suitable for paging through thousands of annotations-search_after should be 
+            The number of initial annotations to skip. This is used for pagination. Not
+            suitable for paging through thousands of annotations-search_after should be
             used instead.
           required: false
           type: integer
@@ -397,22 +397,21 @@ paths:
         - name: wildcard_uri
           in: query
           description: |
-            Limit the results to annotations matching the wildcard URI. 
-            URI can be a URL (a web page address) or a URN representing another 
-            kind of resource such as DOI (Digital Object Identifier) or a 
+            Limit the results to annotations matching the wildcard URI.
+            URI can be a URL (a web page address) or a URN representing another
+            kind of resource such as DOI (Digital Object Identifier) or a
             PDF fingerprint.
 
-            `*` will match any character sequence (including an empty one), 
-            and a `_` will match any single character. `*`s are only permitted 
-            within the path and query parts of the URI and `_`s are only permitted
-            within the domain, path, and query parts of the URI.
+            `*` will match any character sequence (including an empty one),
+            and a `_` will match any single character. Wildcards are only permitted
+            within the path and query parts of the URI.
 
             Escaping wildcards is not supported.
 
             Examples of valid uris: `http://foo.com/*` `urn:x-pdf:*` `file://localhost/_bc.pdf`
 
             Examples of invalid uris: `*foo.com` `u_n:*` `file://*` `http://foo.com*`
-            
+
             <mark>This feature is experimental and the API may change.</mark>
           required: false
           type: string

diff --git a/h/schemas/annotation.py b/h/schemas/annotation.py
@@ -20,7 +20,7 @@ def _validate_wildcard_uri(node, value):
         if not wildcard_uri_is_valid(val):
             raise colander.Invalid(
                 node,
-                """Wildcards (? and *) are not permitted within the
+                """Wildcards (_ and *) are not permitted within the
                 domain of wildcard_uri""")
 
 
@@ -439,9 +439,8 @@ class SearchParamsSchema(colander.Schema):
             PDF fingerprint.
 
             `*` will match any character sequence (including an empty one),
-            and a `_` will match any single character. `*`s are only permitted
-            within the path and query parts of the URI and `_`s are only permitted
-            within the domain, path, and query parts of the URI.
+            and a `_` will match any single character. Wildcards are only permitted
+            within the path and query parts of the URI.
 
             Escaping wildcards is not supported.
 

diff --git a/h/search/query.py b/h/search/query.py
@@ -22,21 +22,25 @@ def wildcard_uri_is_valid(wildcard_uri):
     """
     Return True if uri contains wildcards in appropriate places, return False otherwise.
 
-    *'s are not permitted in the scheme or netloc. _'s are not permitted in the scheme.
+    *'s and _'s are not permitted in the scheme or netloc aka:
+        scheme://netloc/path;parameters?query#fragment.
+
+    If a wildcard is near the beginning of a url, Elasticsearch will find a large portion of the
+    annotations because it is based on Lucene, which searches from left to right. In order to
+    avoid the performance implications of having such a large initial search space, wildcards are
+    not allowed in the beginning of the url.
     """
     if "*" not in wildcard_uri and "_" not in wildcard_uri:
         return False
 
+    # Note: according to the URL spec _'s are allowed in the domain so this may be
+    # something that needs to be supported at a later date.
     normalized_uri = urlparse.urlparse(wildcard_uri)
-    if not normalized_uri.scheme or "*" in normalized_uri.netloc:
+    if (not normalized_uri.scheme or
+            "*" in normalized_uri.netloc or
+            "_" in normalized_uri.netloc):
         return False
 
-    # If a wildcard comes before the port aka: http://localhost:_3000 the request for the
-    # port will fail with a ValueError: invalid literal for int() with base 10: '_3000'.
-    try:
-        normalized_uri.port
-    except ValueError:
-        return False
     return True
 
 

diff --git a/tests/h/search/query_test.py b/tests/h/search/query_test.py
@@ -572,15 +572,15 @@ def _get_search(self, search, pyramid_request, separate_keys):
     ("_http://bar.com", False),
     ("http://localhost:3000*", False),
     ("http://localhost:_3000", False),
+    ("http://bar.com_foo=baz", False),
+    ("http://example.com_", False),
     ("http://bar*.com", False),
     ("file://*", False),
     ("https://foo.com", False),
     ("http://foo.com*", False),
     ("http://foo.com/*", True),
     ("urn:*", True),
-    ("http://bar.com_foo=baz", True),
     ("doi:10.101_", True),
-    ("http://example.com_", True),
     ("http://example.com/__/", True),
 ])
 def test_identifies_wildcard_uri_is_valid(wildcard_uri, expected):