Skip to content

Commit

Permalink
fix nested handling in naming and match propagation
Browse files Browse the repository at this point in the history
  • Loading branch information
alexgarel committed Nov 1, 2020
1 parent 3fd5c5d commit 3e5e2eb
Show file tree
Hide file tree
Showing 6 changed files with 518 additions and 87 deletions.
4 changes: 2 additions & 2 deletions docs/source/quick_start.rst
Expand Up @@ -396,14 +396,14 @@ first identifying every matching element using :py:class:`MatchingPropagator`::

>>> from luqum.naming import MatchingPropagator, matching_from_names
>>> propagate_matching = MatchingPropagator()
>>> paths_ok, paths_ko = propagate_matching(tree, matching_from_names(matched_queries, names))
>>> paths_ok, paths_ko = propagate_matching(tree, *matching_from_names(matched_queries, names))

And then using :py:class:`HTMLMarker` to display it in html (you could make your own also)::

>>> from luqum.naming import HTMLMarker
>>> mark_html = HTMLMarker() # you can customize some parameters, refer to doc
>>> mark_html(tree, paths_ok, paths_ko)
'<span class="ok"><span class="ko">foo~2 </span>OR (bar AND baz)</span>'
'<span class="ok"><span class="ko">foo~2 </span>OR (<span class="ko"><span class="ok">bar </span>AND baz</span>)</span>'


__ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-queries-and-filters
95 changes: 95 additions & 0 deletions luqum/elasticsearch/nested.py
@@ -0,0 +1,95 @@
"""If you have a query with a nested query containing operations,
when using named queries, Elasticsearch won't report inner matching.
This is a problem if you extensively use it.
"""


def get_first_name(query):
    """Return the first ``_name`` found while walking an Elasticsearch query.

    The walk is depth first: values of a dict, then items of a list,
    each in their iteration order. A dict containing a ``"bool"`` key
    (and no ``"_name"`` of its own) is not explored.

    :param query: a query fragment (dict, list or any leaf value)
    :return: the value stored under the first ``"_name"`` key met,
      or ``None`` if no name is found
    """
    if isinstance(query, dict):
        if "_name" in query:
            return query["_name"]
        if "bool" in query:
            # do not go down bool
            return None
        children = query.values()
    elif isinstance(query, list):
        children = query
    else:
        # leaf value: nothing to explore
        return None
    # lazily explore children, so that we stop at the first name found
    names = (get_first_name(child) for child in children)
    return next((name for name in names if name is not None), None)


def extract_nested_queries(query, query_nester=None):
    """Given a query, extract all queries that are under a nested query
    and boolean operations, returning an atomic nested version of them.

    Each extracted query is named after the nearest inner name
    (see :py:func:`get_first_name`).

    This is useful because Elasticsearch won't go down explaining
    why a nested query is matching.

    :param dict query: elasticsearch query to analyze
    :param callable query_nester: this is the function called to nest sub queries,
      leave it default
    :return list: queries that you should run to get all matching

    .. note:: because we re-nest part of bool queries, results might not be accurate
      for::

        {"bool": {"must": [
            {"nested": {"path": "a", "query": {"match": {"x": "y"}}}},
            {"nested": {"path": "a", "query": {"match": {"x": "z"}}}}
        ]}}

      is not the same as::

        {"nested": {"path": "a", "query": {"bool": {"must": [
            {"match": {"x": "y"}}, {"match": {"x": "z"}}
        ]}}}}

      if x is multivalued.

      The first would match `{"a": [{"x": "y"}, {"x": "z"}]}`

      While the second would only match if `x` contains `"y z"` or `"z y"`
    """
    queries = []  # this contains our result
    in_nested = query_nester is not None
    sub_query_nester = query_nester
    if isinstance(query, dict):
        if "nested" in query:
            # parameters of the nested query (path, ...) are replicated on each extracted query
            # NOTE(review): this filters out "name", not "_name", so an outer "_name"
            # is kept in params when no inner name is found - confirm this is intended
            params = {k: v for k, v in query["nested"].items() if k not in ("query", "name")}

            def sub_query_nester(req, name):
                # wrap req in this nested, then in outer nested queries, if any
                nested = {"nested": {"query": req, **params}}
                if query_nester is not None:
                    nested = query_nester(nested, name)
                if name is not None:
                    nested["nested"]["_name"] = name
                return nested

        # a bool may hold several clauses at once, handle them all, in a stable order
        bool_ops = [op for op in ("must", "should", "must_not") if op in query]
        if bool_ops and in_nested:
            # we are in a list of operations in a bool inside a nested,
            # make a query with nested on sub arguments
            sub_queries = []
            for op in bool_ops:
                clause = query[op]
                # normalize to a list
                sub_queries.extend(clause if isinstance(clause, list) else [clause])
            # add nesting, those are queries we want to return
            queries.extend(
                query_nester(sub_query, get_first_name(sub_query)) for sub_query in sub_queries
            )
            # continue processing in each sub query
            # (before nesting, nesting is contained in query_nester)
            children = sub_queries
        else:
            children = query.values()
    elif isinstance(query, list):
        children = query
    else:
        # leaf: final recursivity
        children = []

    # recurse
    for child_query in children:
        queries.extend(
            extract_nested_queries(child_query, query_nester=sub_query_nester)
        )
    return queries
82 changes: 48 additions & 34 deletions luqum/naming.py
Expand Up @@ -98,9 +98,10 @@ def matching_from_names(names, name_to_path):
:param list names: list of names
:param dict name_to_path: association of names with path to children
:return set: corresponding list of matching path
:return tuple: (set of matching paths, set of other known paths)
"""
return {name_to_path[name] for name in names}
matching = {name_to_path[name] for name in names}
return (matching, set(name_to_path.values()) - matching)


def element_from_path(tree, path):
Expand Down Expand Up @@ -143,45 +144,57 @@ def __init__(self, default_operation=tree.OrOperation):
if default_operation is tree.OrOperation:
self.OR_NODES = self.OR_NODES + (tree.UnknownOperation,)

def _propagate(self, node, matching, path):
def _status_from_parent(self, path, matching, other):
"""Get status from nearest parent in hierarchie which had a name
"""
if path in matching:
return True
elif path in other:
return False
elif not path:
return False
else:
return self._status_from_parent(path[:-1], matching, other)

def _propagate(self, node, matching, other, path):
"""recursively propagate matching
return tuple: (
node is matching,
set of paths of matching sub nodes,
set of paths of non matching sub nodes)
"""
if path not in matching:
if node.children and not isinstance(node, self.NO_CHILDREN_PROPAGATE):
paths_ok = set() # path of nodes that are matching
paths_ko = set() # path of nodes that are not matching
children_status = [] # bool for each children, indicating if it matches or not
# children first, for our result may depend on them
for i, child in enumerate(node.children):
child_ok, sub_ok, sub_ko = self._propagate(child, matching, path + (i,))
paths_ok.update(sub_ok)
paths_ko.update(sub_ko)
children_status.append(child_ok)
# compute parent success from children
operator = any if isinstance(node, self.OR_NODES) else all
node_ok = operator(children_status)
# eventually negate result
if isinstance(node, self.NEGATION_NODES):
node_ok = not node_ok
paths_ok, paths_ko = paths_ko, paths_ok
# add path nod to the right set
target_set = paths_ok if node_ok else paths_ko
target_set.add(path)
# return result
return node_ok, paths_ok, paths_ko
else:
# non matching final node
return False, set(), {path}
paths_ok = set() # path of nodes that are matching
paths_ko = set() # path of nodes that are not matching
children_status = [] # bool for each children, indicating if it matches or not
# recurse children
if node.children and not isinstance(node, self.NO_CHILDREN_PROPAGATE):
for i, child in enumerate(node.children):
child_ok, sub_ok, sub_ko = self._propagate(
child, matching, other, path + (i,),
)
paths_ok.update(sub_ok)
paths_ko.update(sub_ko)
children_status.append(child_ok)
# resolve node status
if path in matching:
node_ok = True
elif children_status: # compute from children
# compute parent success from children
operator = any if isinstance(node, self.OR_NODES) else all
node_ok = operator(children_status)
else:
# single node matching
return True, {path}, set()

def __call__(self, tree, matching=None):
node_ok = self._status_from_parent(path, matching, other)
if isinstance(node, self.NEGATION_NODES):
# negate result
node_ok = not node_ok
# add node to the right set
target_set = paths_ok if node_ok else paths_ko
target_set.add(path)
# return result
return node_ok, paths_ok, paths_ko

def __call__(self, tree, matching, other=frozenset()):
"""
Given a list of paths that are known to match,
return all paths in the tree that match.
Expand All @@ -192,12 +205,13 @@ def __call__(self, tree, matching=None):
Descending would mean risking to give non consistent information.
:param list matching: list of path of matching nodes (each path is a tuple)
:param list other: list of other path that had a name, but were not reported as matching
:return tuple: (
set of matching path after propagation,
set of non matching paths after propagation)
"""
tree_ok, paths_ok, paths_ko = self._propagate(tree, matching, ())
tree_ok, paths_ok, paths_ko = self._propagate(tree, matching, other, ())
return paths_ok, paths_ko


Expand Down

0 comments on commit 3e5e2eb

Please sign in to comment.