Skip to content

Commit

Permalink
visitor: use regex instead of wildcard in query string
Browse files Browse the repository at this point in the history
* Solves issues with unsupported combination of wildcard with escaped characters
* Solves issue with multiple report_numbers values in one string
* INSPIR-3614
  • Loading branch information
MJedr committed Jun 16, 2020
1 parent 7887fa9 commit 6f7ecc9
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 63 deletions.
36 changes: 30 additions & 6 deletions inspire_query_parser/visitors/elastic_search_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,23 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard):

return query

def _generate_query_string_regex_query(self, value, fieldnames):
    """Build a ``query_string`` query whose query text is a ``/.../`` regex.

    The value is first passed through
    ``escape_query_string_special_characters``; every ``*`` in the result is
    then rewritten to ``.*`` and the whole pattern is wrapped in slashes.

    Args:
        value: The raw value to turn into a regex pattern.
        fieldnames: A single field name, a list of field names, or a falsy
            value. A falsy value makes the query use ``default_field`` set
            to ``_all``; otherwise the query uses ``fields``, with a
            non-list argument wrapped into a one-element list.

    Returns:
        dict: ``{'query_string': {'query': <pattern>, <field key>: <target>}}``.
    """
    pattern = escape_query_string_special_characters(value).replace('*', '.*')

    if fieldnames:
        target_key = 'fields'
        if isinstance(fieldnames, list):
            target = fieldnames
        else:
            target = [fieldnames]
    else:
        # No explicit field: fall back to the catch-all default field.
        target_key, target = 'default_field', '_all'

    return {
        'query_string': {
            'query': "/" + pattern + "/",
            target_key: target,
        }
    }

# TODO Move it to visitor utils and write tests for it.
def _generate_term_query(self, fieldname, value, boost=None):
if not boost:
Expand Down Expand Up @@ -754,18 +771,25 @@ def handle_value_wildcard(self, node, fieldnames=None):
bai_field_variation=FieldVariations.search,
query_bai_field_if_dots_in_name=True
)
query = self._generate_query_string_query(
query = self._generate_query_string_regex_query(
node.value,
fieldnames=bai_fieldnames or fieldnames,
analyze_wildcard=True
fieldnames=bai_fieldnames or fieldnames
)
return self._generate_nested_author_query(query, fieldnames)

query = self._generate_query_string_query(
if self.KEYWORD_TO_ES_FIELDNAME["reportnumber"] == fieldnames:
value = \
('' if node.value.startswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + \
node.value + \
('' if node.value.endswith(ast.GenericValue.WILDCARD_TOKEN) else '*')

return self._generate_query_string_regex_query(value, fieldnames=fieldnames)

query = self._generate_query_string_regex_query(
node.value,
fieldnames=fieldnames,
analyze_wildcard=True
fieldnames=fieldnames
)

return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)

def handle_author_query(self, node, fieldnames=None):
Expand Down
130 changes: 73 additions & 57 deletions tests/test_elastic_search_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ def test_elastic_search_visitor_find_institution_partial_value_cer():
query_str = 'affautocomplete:cer*'
expected_es_query = {
"query_string": {
"query": "cer*",
"analyze_wildcard": True,
"query": "/cer.*/",
"fields": [
"affautocomplete"
]
Expand Down Expand Up @@ -780,11 +779,10 @@ def test_elastic_search_visitor_wildcard_support():
"path": "authors",
"query": {
"query_string": {
"query": "*alge",
"query": "/.*alge/",
"fields": [
"authors.full_name"
],
"analyze_wildcard": True
]
}
}
}
Expand Down Expand Up @@ -838,11 +836,10 @@ def test_elastic_search_visitor_first_author_wildcard_support():
"path": "first_author",
"query": {
"query_string": {
"query": "*alge",
"query": "/.*alge/",
"fields": [
"first_author.full_name"
],
"analyze_wildcard": True
]
}
}
}
Expand Down Expand Up @@ -1617,34 +1614,39 @@ def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_p
def test_elastic_search_visitor_handles_wildcard_simple_and_partial_bai_like_queries():
query_str = "a S.Mele* and 'S.Mel*'"
expected_es_query = {
"bool": {
"must": [
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"query": "S.Mele*",
"fields": ["authors.ids.value.search", "authors.full_name"],
"analyze_wildcard": True,
}
},
}
},
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"query": "*S.Mel*",
"fields": ["authors.ids.value.search", "authors.full_name"],
"analyze_wildcard": True,
}
},
}
},
]
}
"bool":{
"must":[
{
"nested":{
"path":"authors",
"query":{
"query_string":{
"query":"/S.Mele.*/",
"fields":[
"authors.ids.value.search",
"authors.full_name"
]
}
}
}
},
{
"nested":{
"path":"authors",
"query":{
"query_string":{
"query":"*S.Mel*",
"fields":[
"authors.ids.value.search",
"authors.full_name"
],
"analyze_wildcard":True
}
}
}
}
]
}
}

generated_es_query = _parse_query(query_str)
Expand Down Expand Up @@ -3203,11 +3205,12 @@ def test_elastic_search_visitor_find_journal_with_year():
def test_regression_wildcard_query_with_dot():
query_string = 'references.reference.dois:10.7483/OPENDATA.CMS*'
expected_es_query = {
'query_string': {
'query': '10.7483\\/OPENDATA.CMS*',
'fields': ['references.reference.dois'],
'analyze_wildcard': True
}
"query_string":{
"query":"/10.7483\\/OPENDATA.CMS.*/",
"fields":[
"references.reference.dois"
]
}
}

generated_es_query = _parse_query(query_string)
Expand Down Expand Up @@ -3375,16 +3378,15 @@ def test_check_texkey_doesnt_match_recid():
def test_wildcard_query_works_with_slash():
query_str = r"a S.M/ele*"
expected_es_query = {
"nested": {
"path": "authors",
"query": {
"query_string": {
"query": "S.M\\/ele*",
"fields": [
"nested":{
"path":"authors",
"query":{
"query_string":{
"query":"/S.M\\/ele.*/",
"fields":[
"authors.ids.value.search",
"authors.full_name"
],
"analyze_wildcard": True
]
}
}
}
Expand Down Expand Up @@ -3434,19 +3436,33 @@ def test_range_date_queries_are_nested():
def test_queries_with_wildcard_support_nested_fields():
query_str = 'publication_info.cnum:*'
expected_es_query = {
"nested": {
"path": "publication_info",
"query": {
"query_string": {
"query": "*",
"fields": [
"nested":{
"path":"publication_info",
"query":{
"query_string":{
"query":"/.*/",
"fields":[
"publication_info.cnum"
],
"analyze_wildcard": True
]
}
}
}
}

generated_es_query = _parse_query(query_str)
assert generated_es_query == expected_es_query


def test_wildcard_query_for_report_numbers_field():
    """Check the query generated for the wildcard query ``r VBSCAN-*``.

    The expected result is a ``query_string`` regex query on
    ``report_numbers.value.fuzzy`` whose pattern has ``.*`` on both sides
    of the escaped value.
    """
    expected_es_query = {
        "query_string": {
            "query": "/.*VBSCAN\\-.*/",
            "fields": ["report_numbers.value.fuzzy"],
        }
    }

    assert _parse_query('r VBSCAN-*') == expected_es_query

0 comments on commit 6f7ecc9

Please sign in to comment.