Skip to content

Commit

Permalink
visitor: fix wildcard search with dots
Browse files Browse the repository at this point in the history
* INSPIR-3417
  • Loading branch information
drjova committed Apr 16, 2020
1 parent 0de37ee commit fffb2e7
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 109 deletions.
165 changes: 84 additions & 81 deletions inspire_query_parser/visitors/elastic_search_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,7 @@ def _generate_fieldnames_if_bai_query(self, node_value, bai_field_variation, que
# Case of partial BAI, e.g. ``J.Smith``.
return [self.AUTHORS_BAI_FIELD + '.' + bai_field_variation] + \
force_list(self.KEYWORD_TO_ES_FIELDNAME['author'])

else:
return None
return None

def _generate_author_query(self, author_name):
"""Generates a query handling specifically authors.
Expand Down Expand Up @@ -705,95 +703,100 @@ def visit_keyword(self, node):
# If no keyword is found, return the original node value (case of an unknown keyword).
return self.KEYWORD_TO_ES_FIELDNAME.get(node.value, node.value)

def visit_value(self, node, fieldnames=None):
if not fieldnames:
fieldnames = '_all'

if node.contains_wildcard:
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
return self._generate_date_with_wildcard_query(node.value)
def visit_value_wildcard(self, node, fieldnames=None):
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
return self._generate_date_with_wildcard_query(node.value)

bai_fieldnames = None
if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
bai_fieldnames = self._generate_fieldnames_if_bai_query(
node.value,
bai_field_variation=FieldVariations.search,
query_bai_field_if_dots_in_name=True
)

query = self._generate_query_string_query(
node.value,
fieldnames=bai_fieldnames or fieldnames,
analyze_wildcard=True
)
query = self._generate_query_string_query(
node.value,
fieldnames=bai_fieldnames or fieldnames,
analyze_wildcard=True
)

if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
return query
else:
if isinstance(fieldnames, list):
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
# Date queries with simple values are transformed into range queries, among the given and the exact
# next date, according to the granularity of the given date.
return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})
if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
return query

def visit_value(self, node, fieldnames=None):
if not fieldnames:
fieldnames = '_all'

if self.KEYWORD_TO_ES_FIELDNAME['journal'] == fieldnames:
return self._generate_journal_nested_queries(node.value)
if node.contains_wildcard:
return self.visit_value_wildcard(node, fieldnames=fieldnames)

return {
'multi_match': {
'fields': fieldnames,
'query': node.value,
}
if isinstance(fieldnames, list):
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
# Date queries with simple values are transformed into range queries, among the given and the exact
# next date, according to the granularity of the given date.
return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})

if self.KEYWORD_TO_ES_FIELDNAME['journal'] == fieldnames:
return self._generate_journal_nested_queries(node.value)

return {
'multi_match': {
'fields': fieldnames,
'query': node.value,
}
else:
if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
bai_fieldnames = self._generate_fieldnames_if_bai_query(
node.value,
bai_field_variation=FieldVariations.search,
query_bai_field_if_dots_in_name=True
)
if bai_fieldnames:
if len(bai_fieldnames) == 1:
query = {"match": {bai_fieldnames[0]: node.value}}
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
else:
# Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
# e.g. `S.Mele`. In this case generate a partial match query.
return self.visit_partial_match_value(node, bai_fieldnames)

return self._generate_author_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames:
return self._generate_exact_author_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['irn'] == fieldnames:
return {'term': {fieldnames: ''.join(('SPIRES-', node.value))}}

elif self.KEYWORD_TO_ES_FIELDNAME['title'] == fieldnames:
return self._generate_title_queries(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['type-code'] == fieldnames:
return self._generate_type_code_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['affiliation'] == fieldnames:
query = generate_match_query(
self.KEYWORD_TO_ES_FIELDNAME['affiliation'],
node.value,
with_operator_and=True
)
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)

elif fieldnames not in self.KEYWORD_TO_ES_FIELDNAME.values():
colon_value = ':'.join([fieldnames, node.value])
given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True)
texkey_query = ''
if self.TEXKEY_REGEX.match(colon_value):
texkey_query = self._generate_term_query('texkeys.raw', colon_value, boost=2.0)
_all_field_query = generate_match_query('_all', colon_value, with_operator_and=True)
query = wrap_queries_in_bool_clauses_if_more_than_one(
[given_field_query, texkey_query, _all_field_query], use_must_clause=False)
return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)

return generate_match_query(fieldnames, node.value, with_operator_and=True)
}
else:
if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
bai_fieldnames = self._generate_fieldnames_if_bai_query(
node.value,
bai_field_variation=FieldVariations.search,
query_bai_field_if_dots_in_name=True
)
if bai_fieldnames:
if len(bai_fieldnames) == 1:
query = {"match": {bai_fieldnames[0]: node.value}}
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
else:
# Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
# e.g. `S.Mele`. In this case generate a partial match query.
return self.visit_partial_match_value(node, bai_fieldnames)

return self._generate_author_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames:
return self._generate_exact_author_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['irn'] == fieldnames:
return {'term': {fieldnames: ''.join(('SPIRES-', node.value))}}

elif self.KEYWORD_TO_ES_FIELDNAME['title'] == fieldnames:
return self._generate_title_queries(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['type-code'] == fieldnames:
return self._generate_type_code_query(node.value)

elif self.KEYWORD_TO_ES_FIELDNAME['affiliation'] == fieldnames:
query = generate_match_query(
self.KEYWORD_TO_ES_FIELDNAME['affiliation'],
node.value,
with_operator_and=True
)
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)

elif fieldnames not in self.KEYWORD_TO_ES_FIELDNAME.values():
colon_value = ':'.join([fieldnames, node.value])
given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True)
texkey_query = ''
if self.TEXKEY_REGEX.match(colon_value):
texkey_query = self._generate_term_query('texkeys.raw', colon_value, boost=2.0)
_all_field_query = generate_match_query('_all', colon_value, with_operator_and=True)
query = wrap_queries_in_bool_clauses_if_more_than_one(
[given_field_query, texkey_query, _all_field_query], use_must_clause=False)
return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)

return generate_match_query(fieldnames, node.value, with_operator_and=True)

def visit_exact_match_value(self, node, fieldnames=None):
"""Generates a term query (exact search in ElasticSearch)."""
Expand Down
98 changes: 70 additions & 28 deletions tests/test_elastic_search_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1526,37 +1526,36 @@ def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_p

def test_elastic_search_visitor_handles_wildcard_simple_and_partial_bai_like_queries():
query_str = "a S.Mele* and 'S.Mel*'"
expected_es_query = \
{
"bool": {
"must": [
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"analyze_wildcard": True,
"fields": ["authors.ids.value.search", "authors.full_name"],
"query": "S.Mele*"
}
expected_es_query = {
"bool": {
"must": [
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"query": "S.Mele*",
"fields": ["authors.ids.value.search", "authors.full_name"],
"analyze_wildcard": True,
}
}
},
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"analyze_wildcard": True,
"fields": ["authors.ids.value.search", "authors.full_name"],
"query": "*S.Mel*"
}
},
}
},
{
"nested": {
"path": "authors",
"query": {
"query_string": {
"query": "*S.Mel*",
"fields": ["authors.ids.value.search", "authors.full_name"],
"analyze_wildcard": True,
}
}
},
]
}
},
}
},
]
}
}

generated_es_query = _parse_query(query_str)
assert generated_es_query == expected_es_query
Expand Down Expand Up @@ -2649,3 +2648,46 @@ def test_elastic_search_visitor_find_journal_with_year():

generated_es_query = _parse_query(query_str)
assert generated_es_query == expected_es_query


def test_regression_wildcard_query_with_dot():
    """Regression: a wildcard value containing dots on a dotted field name.

    The parsed result is expected to be a single ``query_string`` query
    against the given field with ``analyze_wildcard`` enabled.
    """
    wildcard_query = 'references.reference.dois:10.7483/OPENDATA.CMS*'
    expected = {
        'query_string': {
            'query': '10.7483/OPENDATA.CMS*',
            'fields': ['references.reference.dois'],
            'analyze_wildcard': True,
        },
    }

    assert _parse_query(wildcard_query) == expected


def test_regression_query_with_multiple_dots():
    """Regression: a non-wildcard value containing several dots.

    The parsed result is expected to be a ``bool``/``should`` of two
    ``match`` queries: one on the given field with the bare value, and one
    on ``_all`` with the full ``field:value`` string.
    """
    field = "references.reference.dois"
    value = "10.7483/OPENDATA.CMS.ATLAS"

    # Match against the explicitly requested field.
    given_field_match = {
        "match": {
            field: {
                "query": value,
                "operator": "and",
            },
        },
    }
    # Match against ``_all`` using the original "field:value" text.
    all_field_match = {
        "match": {
            "_all": {
                "query": "references.reference.dois:10.7483/OPENDATA.CMS.ATLAS",
                "operator": "and",
            },
        },
    }
    expected = {"bool": {"should": [given_field_match, all_field_match]}}

    assert _parse_query("references.reference.dois:10.7483/OPENDATA.CMS.ATLAS") == expected

0 comments on commit fffb2e7

Please sign in to comment.