From 5382ef06f0059270d4ee01b5bcfa0de8ba731973 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Wed, 21 Oct 2020 09:59:25 +0100 Subject: [PATCH 1/7] test re-organization --- luqum/naming.py | 109 +++++-------- luqum/visitor.py | 9 +- setup.py | 1 - tests/__init__.py | 0 tests/test_elasticsearch/__init__.py | 0 tests/{ => test_elasticsearch}/book.json | 0 .../es_integration_utils.py | 148 ++++++++++++++++++ .../test_es_integration.py | 127 +-------------- 8 files changed, 204 insertions(+), 190 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_elasticsearch/__init__.py rename tests/{ => test_elasticsearch}/book.json (100%) create mode 100644 tests/test_elasticsearch/es_integration_utils.py rename tests/{ => test_elasticsearch}/test_es_integration.py (56%) diff --git a/luqum/naming.py b/luqum/naming.py index ccd3981..21672c4 100644 --- a/luqum/naming.py +++ b/luqum/naming.py @@ -5,6 +5,7 @@ This module adds support for that. """ +from . import tree from .visitor import TreeVisitor @@ -23,82 +24,58 @@ def get_name(node): class TreeAutoNamer(TreeVisitor): # helper for :py:func:`tree_name_index` - def __init__(self): - super().__init__(track_parents=False) + DEFAULT_TARGETS = (tree.Range, tree.Term, tree.Fuzzy, tree.BaseApprox) + """by default targets are the one translated to a leaf term in elasticsearch - def visit_base_operation(self, node, context): - # operations are the splitting point, we name them and make children subnames - names = context["names"] - set_name(node, ("_".join(names))) - for i, c in enumerate(node.children): - yield from self.visit_iter(c, context={"names": names + [str(i)]}) + :param tuple targets: class of elements that should receive a name + :param bool all_names: shall we name children of named elements ? + """ - def visit_term(self, node, context=None): - names = context["names"] - set_name(node, ("_".join(names))) - return [] + LETTERS = "abcdefghilklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + _letter_pos = {i: l for i, l in enumerate(LETTERS)} + _pos_letter = {l: i for i, l in enumerate(LETTERS)} - def visit_range(self, node, context=None): - names = context["names"] - set_name(node, ("_".join(names))) - # no need to visit children - return [] + def __init__(self, targets=None, all_names=False): + self.targets = targets if targets is not None else self.DEFAULT_TARGETS + self.all_names = all_names + super().__init__(track_parents=False) + + def generic_visit(self, node, context): + visit_children = True + if isinstance(node, self.targets): + # add name + self.context["name"] = name = self.next_name(context["name"]) + set_name(node, name) + # track name / path correspondance + context["name_to_path"][name] = tuple(context["path"]) + visit_children = self.all_names + if visit_children: + # continue with path trackings + for i, child in enumerate(node.children): + child_context = dict(context, path=context["path"] + [i]) + yield from self.visit_iter(child, context=child_context) + else: + yield node def visit(self, node): - return list(self.visit_iter(node, context={"names": ["0"]})) + """visit the tree and add names to nodes while tracking their path + """ + context = {"path": [], "name": None, "name_to_path": {}} + list(self.visit_iter(node, context=context)) + return context["name_to_path"] -def auto_name(tree): +def auto_name(tree, targets=None, all_names=False): """Automatically add names to nodes of a parse tree. - We add them to terminal nodes : range, phrases and words, as this is where it is useful, - but also on operations, to easily grab the group. - """ - TreeAutoNamer().visit(tree) - + We add them to terminal nodes : range, phrases and words, as this is where it is useful. -class NameIndexer(TreeVisitor): - # helper for :py:func:`tree_name_index` + :return: a dict giving the path to a the children having this name + """ + return TreeAutoNamer().visit(tree, targets, all_names) - def __init__(self): - super().__init__(track_parents=True) - def generic_visit(self, node, context): - # visit children - sub_names = list(super().generic_visit(node, context)) - name = get_name(node) - root_node = not context.get("parents") - if name is not None or root_node: - str_repr = str(node) - # search for subnodes position - subnodes_pos = [] - idx = 0 - for (subname, sub_repr, sub_subnodes_pos) in sub_names: - pos = str_repr.find(sub_repr, idx) - if pos >= 0: # pragma: no branch - length = len(sub_repr) - subnodes_pos.append((subname, pos, length, sub_subnodes_pos)) - idx = pos + length - sub_names = [(name, str_repr, subnodes_pos)] - yield from sub_names - - def __call__(self, node): - subnames = self.visit(node) - # by construction, root node, only return one entry - name, str_repr, subnodes_pos = subnames[0] - if name is not None: - # resolve last level - subnodes_pos = [(name, 0, len(str_repr), subnodes_pos)] - return subnodes_pos - - -def _flatten_name_index(subnodes_pos, start_pos=0): - for name, pos, length, children in subnodes_pos: - yield name, start_pos + pos, length - yield from _flatten_name_index(children, start_pos + pos) - - -def name_index(tree): +def name_index(tree, name_to_path): """Given a tree with names, give the index of each group in the string representation. also gives the node type. @@ -111,10 +88,6 @@ def name_index(tree): :param tree: a luqum parse tree :return dict: mapping each name to a `(start position, length)` tuple """ - subnodes_pos = NameIndexer()(tree) - # flatten the hierarchy - result = {name: (pos, length) for name, pos, length in _flatten_name_index(subnodes_pos)} - return result def extract(expr, name, name_index): diff --git a/luqum/visitor.py b/luqum/visitor.py index 0edb1f3..568f3c6 100644 --- a/luqum/visitor.py +++ b/luqum/visitor.py @@ -115,6 +115,13 @@ def __init__(self, track_new_parents=False, **kwargs): self.track_new_parents = track_new_parents super().__init__(**kwargs) + def _clone_item(self, node): + """simply call node.clone_item + + Surcharge this method to add specific tweaks if needed (like copying special attributes) + """ + return node.clone_item() + def visit(self, tree, context=None): if context is None: context = {} @@ -137,7 +144,7 @@ def generic_visit(self, node, context): It simply clone node and children """ - new_node = node.clone_item() + new_node = self._clone_item(node) new_node.children = self.clone_children(node, new_node, context) yield new_node diff --git a/setup.py b/setup.py index d34ac04..a985874 100644 --- a/setup.py +++ b/setup.py @@ -35,5 +35,4 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - test_suite='luqum.tests' ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_elasticsearch/__init__.py b/tests/test_elasticsearch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/book.json b/tests/test_elasticsearch/book.json similarity index 100% rename from tests/book.json rename to tests/test_elasticsearch/book.json diff --git a/tests/test_elasticsearch/es_integration_utils.py b/tests/test_elasticsearch/es_integration_utils.py new file mode 100644 index 0000000..b682d07 --- /dev/null +++ b/tests/test_elasticsearch/es_integration_utils.py @@ -0,0 +1,148 @@ +import json +import os + +import elasticsearch_dsl +from elasticsearch.exceptions import ConnectionError +from elasticsearch.helpers import bulk +from elasticsearch_dsl import Date, Index, Integer, Nested, Object, Search, analyzer +from elasticsearch_dsl.connections import connections + +from luqum.elasticsearch import ElasticsearchQueryBuilder, SchemaAnalyzer + + +MAJOR_ES = elasticsearch_dsl.VERSION[0] +if MAJOR_ES > 2: + from elasticsearch_dsl import Keyword + +ES6 = False +if MAJOR_ES >= 6: + from elasticsearch_dsl import Text, Document, InnerDoc + + ES6 = True +else: + from elasticsearch_dsl import ( + String as Text, + DocType as Document, + InnerObjectWrapper as InnerDoc, + ) + + +def get_es(): + """Return an es connection or None if none seems available. + + Also wait for ES to be ready (yellow status) + """ + # you may use ES_HOST environment variable to configure Elasticsearch + # launching something like + # docker run --rm -p "127.0.0.1:9200:9200" -e "discovery.type=single-node" elasticsearch:7.8.0 + # is a simple way to get an instance + connections.configure(default=dict(hosts=os.environ.get("ES_HOST", "localhost"), timeout=20)) + client = connections.get_connection("default") + try: + # check ES running + client.cluster.health(wait_for_status='yellow') + except ConnectionError: + client = None + return client + + +if MAJOR_ES > 2: + + class Illustrator(InnerDoc): + """Inner object to be nested in Book, details on an illustrator + """ + name = Text() + birthdate = Date() + nationality = Keyword() + + +class Book(Document): + """An objects representing a book in ES + """ + title = Text(fields={ + "no_vowels": Text( + analyzer=analyzer("no_vowels", "pattern", pattern="[\Waeiouy]"), # noqa: W605 + search_analyzer="standard" + ) + }) + edition = Text() + author = Object(properties={"name": Text(), "birthdate": Date()}) + publication_date = Date() + n_pages = Integer() + + if ES6: + illustrators = Nested(Illustrator) + + class Index: + name = "bk" + + else: + illustrators = Nested( + properties={ + "name": Text(), + "birthdate": Date(), + "nationality": Keyword() + if MAJOR_ES > 2 + else Text(index="not_analyzed"), + } + ) + + class Meta: + index = "bk" + + +def add_book_data(es): + """Create a "bk" index and fill it with data + """ + Book.init() + with open(os.path.join(os.path.dirname(__file__), "book.json")) as f: + datas = json.load(f) + + actions = ( + {"_op_type": "index", "_id": i, "_source": d} + for i, d in enumerate(datas["books"]) + ) + if MAJOR_ES >= 7: + bulk(es, actions, index="bk", refresh=True) + else: + if ES6: + doc_type = "doc" + else: + doc_type = "book" + bulk(es, actions, index="bk", doc_type=doc_type, refresh=True) + + +def book_search(es): + """Return an elasticsearch_dsl search object + """ + return Search(using=es, index="bk") + + +def book_query_builder(es): + """Return an ElasticsearchQueryBuilder adapted for search in book. + + title is adapted to search the title.no_wowels field along with the title + """ + MESSAGES_SCHEMA = {"mappings": Book._doc_type.mapping.to_dict()} + schema_analizer = SchemaAnalyzer(MESSAGES_SCHEMA) + + builder_options = schema_analizer.query_builder_options() + builder_options['field_options'] = { + 'title.no_vowels': { + 'match_type': 'multi_match', + 'type': 'most_fields', + 'fields': ('title', 'title.no_vowels') + } + } + return ElasticsearchQueryBuilder(**builder_options) + + +def remove_book_index(es): + """clean "bk" index + """ + if es is None: + return + if ES6: + Book._index.delete() + else: + Index("bk").delete(ignore=404) diff --git a/tests/test_es_integration.py b/tests/test_elasticsearch/test_es_integration.py similarity index 56% rename from tests/test_es_integration.py rename to tests/test_elasticsearch/test_es_integration.py index 5d1a214..3cfbb6c 100644 --- a/tests/test_es_integration.py +++ b/tests/test_elasticsearch/test_es_integration.py @@ -1,107 +1,10 @@ -import json -import os from unittest import TestCase, skipIf -import elasticsearch_dsl -from elasticsearch.exceptions import ConnectionError -from elasticsearch.helpers import bulk -from elasticsearch_dsl import Date, Index, Integer, Nested, Object, Search, analyzer -from elasticsearch_dsl.connections import connections -from luqum.elasticsearch import ElasticsearchQueryBuilder, SchemaAnalyzer from luqum.parser import parser -MAJOR_ES = elasticsearch_dsl.VERSION[0] -if MAJOR_ES > 2: - from elasticsearch_dsl import Keyword - -ES6 = False -if MAJOR_ES >= 6: - from elasticsearch_dsl import Text, Document, InnerDoc - - ES6 = True -else: - from elasticsearch_dsl import ( - String as Text, - DocType as Document, - InnerObjectWrapper as InnerDoc, - ) - - -def get_es(): - # you may use ES_HOST environment variable to configure Elasticsearch - # launching something like - # docker run --rm -p "127.0.0.1:9200:9200" -e "discovery.type=single-node" elasticsearch:7.8.0 - # is a simple way to get an instance - connections.configure(default=dict(hosts=os.environ.get("ES_HOST", "localhost"), timeout=20)) - client = connections.get_connection("default") - try: - # check ES runnig - client.cluster.health(wait_for_status='yellow') - except ConnectionError: - client = None - return client - - -if MAJOR_ES > 2: - - class Illustrator(InnerDoc): - name = Text() - birthdate = Date() - nationality = Keyword() - - -class Book(Document): - title = Text(fields={ - "no_vowels": Text( - analyzer=analyzer("no_vowels", "pattern", pattern="[\Waeiouy]"), # noqa: W605 - search_analyzer="standard" - ) - }) - edition = Text() - author = Object(properties={"name": Text(), "birthdate": Date()}) - publication_date = Date() - n_pages = Integer() - - if ES6: - illustrators = Nested(Illustrator) - - class Index: - name = "bk" - - else: - illustrators = Nested( - properties={ - "name": Text(), - "birthdate": Date(), - "nationality": Keyword() - if MAJOR_ES > 2 - else Text(index="not_analyzed"), - } - ) - - class Meta: - index = "bk" - - -def add_data(): - search = connections.get_connection() - Book.init() - with open(os.path.join(os.path.dirname(__file__), "book.json")) as f: - datas = json.load(f) - - actions = ( - {"_op_type": "index", "_id": i, "_source": d} - for i, d in enumerate(datas["books"]) - ) - if MAJOR_ES >= 7: - bulk(search, actions, index="bk", refresh=True) - else: - if ES6: - doc_type = "doc" - else: - doc_type = "book" - - bulk(search, actions, index="bk", doc_type=doc_type, refresh=True) +from .es_integration_utils import ( + add_book_data, book_query_builder, book_search, get_es, remove_book_index, +) @skipIf(get_es() is None, "Skipping ES test as I can't reach ES") @@ -112,20 +15,9 @@ def setUpClass(cls): cls.es_client = get_es() if cls.es_client is None: return - cls.search = Search(using=cls.es_client, index="bk") - MESSAGES_SCHEMA = {"mappings": Book._doc_type.mapping.to_dict()} - schema_analizer = SchemaAnalyzer(MESSAGES_SCHEMA) - - builder_options = schema_analizer.query_builder_options() - builder_options['field_options'] = { - 'title.no_vowels': { - 'match_type': 'multi_match', - 'type': 'most_fields', - 'fields': ('title', 'title.no_vowels') - } - } - cls.es_builder = ElasticsearchQueryBuilder(**builder_options) - add_data() + cls.es_builder = book_query_builder(cls.es_client) + cls.search = book_search(cls.es_client) + add_book_data(cls.es_client) def _ask_luqum(self, req): tree = parser.parse(req) @@ -256,9 +148,4 @@ def test_subfield_multi_match_search(self): @classmethod def tearDownClass(cls): - if cls.es_client is None: - return - if ES6: - Book._index.delete() - else: - Index("bk").delete(ignore=404) + remove_book_index(cls.es_client) From a7e475f1e7cd8afbd3679a9c183e44701eb10ddd Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Thu, 22 Oct 2020 11:20:46 +0100 Subject: [PATCH 2/7] Naming queries and tools to explain matches --- .gitignore | 6 +- CHANGELOG.rst | 24 + docs/source/api.rst | 19 +- docs/source/quick_start.rst | 63 ++- luqum/elasticsearch/tree.py | 8 +- luqum/elasticsearch/visitor.py | 45 +- luqum/naming.py | 272 ++++++++-- luqum/tree.py | 6 +- luqum/visitor.py | 107 +++- tests/test_elasticsearch/book.json | 27 +- .../es_integration_utils.py | 10 +- .../test_elasticsearch/test_es_integration.py | 2 +- tests/test_elasticsearch/test_es_naming.py | 193 +++++++ tests/test_elasticsearch/test_naming.py | 250 +++++++++ tests/test_elasticsearch/tests.py | 120 +--- tests/test_naming.py | 513 ++++++++++++++---- tests/test_visitor.py | 77 ++- 17 files changed, 1378 insertions(+), 364 deletions(-) create mode 100644 tests/test_elasticsearch/test_es_naming.py create mode 100644 tests/test_elasticsearch/test_naming.py diff --git a/.gitignore b/.gitignore index d9e6d02..07fbf8e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,11 @@ __pycache__/ *$py.class # PLY -luqum/parser.out +parser.out +parsetab.py + +# coverage +cover/ # C extensions *.so diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 45fbf14..ff928b8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,30 @@ and this project tries to adhere to `Semantic Versioning`_. .. _`Keep a Changelog`: http://keepachangelog.com/en/1.0.0/ .. _`Semantic Versioning`: http://semver.org/spec/v2.0.0.html +Rolling +======= + +Changed +------- + +- completely modified the naming module and `auto_name` function, as it was not practical as is. + +Added +----- + +- added tools to build visual explanations about why a request matches a results + (leveraging `elasticsearch named queries`__. +- added a visitor and transformer that tracks path to element while visiting the tree. + +__ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-queries-and-filters + +Fixed +----- + +- fixed the handling of names when transforming luqum tree to elasticsearch queries + and added integration tests. + + 0.10.0 - 2020-09-22 =================== diff --git a/docs/source/api.rst b/docs/source/api.rst index 9d3c75c..524e9d7 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -40,6 +40,18 @@ luqum.elasticsearch :member-order: bysource +Naming and explaining matches +============================== + + +luqum.naming +------------ + +.. automodule:: luqum.naming + :members: + :member-order: bysource + + Utilities ========== @@ -52,13 +64,6 @@ luqum.visitor: Manipulating trees :member-order: bysource -luqum.naming: Naming query parts ---------------------------------- - -.. automodule:: luqum.naming - :members: - :member-order: bysource - luqum.auto_head_tail: Automatic addition of spaces -------------------------------------------------- diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst index 4aee8fe..dcb2375 100644 --- a/docs/source/quick_start.rst +++ b/docs/source/quick_start.rst @@ -3,6 +3,7 @@ Quick start >>> from unittest import TestCase >>> t = TestCase() + >>> t.maxDiff = None .. _tutorial-parsing: @@ -339,8 +340,8 @@ We can pretty print it:: .. _`elasticsearch_dsl`: https://pypi.python.org/pypi/elasticsearch-dsl .. _`Elasticsearch queries DSL`: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html -Named Queries --------------- +Named Queries: explaining a match +--------------------------------- .. py:currentmodule:: luqum.naming @@ -355,7 +356,18 @@ Say we have a query:: We can use :py:func:`auto_name` to automatically add names:: >>> from luqum.naming import auto_name - >>> auto_name(tree) + >>> names = auto_name(tree) + +names contains a dict association names to path in the luqum tree. +For example the first name "a" is associated with element "foo", +and we can retrieve it easly thanks to small utils for navigating the tree:: + + >>> from luqum.naming import element_from_path, element_from_name + >>> element_from_name(tree, "a", names) + Fuzzy(Word('foo'), 2) + >>> element_from_path(tree, (0, 0)) + Word('foo') + The generated elastic search queries use the names when building the query (see `elastic named queries`__):: @@ -364,16 +376,10 @@ when building the query (see `elastic named queries`__):: >>> t.assertDictEqual( ... es_query, ... {'bool': {'should': [ - ... {'fuzzy': {'text': {'_name': '0_0', 'fuzziness': 2.0, 'value': 'foo'}}}, + ... {'fuzzy': {'text': {'fuzziness': 2.0, 'value': 'foo', '_name': 'a'}}}, ... {'bool': {'must': [ - ... {'match': {'text': { - ... '_name': '0_1_0', - ... 'query': 'bar', - ... 'zero_terms_query': 'all'}}}, - ... {'match': {'text': { - ... '_name': '0_1_1', - ... 'query': 'baz', - ... 'zero_terms_query': 'all'}}} + ... {'match': {'text': {'query': 'bar', 'zero_terms_query': 'all', '_name': 'c'}}}, + ... {'match': {'text': {'query': 'baz', 'zero_terms_query': 'all', '_name': 'd'}}} ... ]}} ... ]}} ... ) @@ -381,18 +387,23 @@ when building the query (see `elastic named queries`__):: If you use this on elasticsearch, for each record, elastic will return the part of the queries matched by the record, using their names. -To display it to the user, we can find back which name refers to which part of the query, -using :py:func:`name_index` and :py:func:`extract`:: - - >>> from luqum.naming import name_index, extract - >>> index = name_index(tree) - >>> index["0_1_0"] # for each name, associate start index and length - (10, 3) - >>> extract(expr, "0_1_0", index) - 'bar' - >>> extract(expr, "0_1", index) - 'bar AND baz' - - +Imagine elasticsearch returned us we match on 'b' and 'c':: + + >>> matched_queries = ['b', 'c'] + +To display it to the user, we have two step to undergo: +first identifying every matching element using :py:class:`MatchingPropagator`:: + + >>> from luqum.naming import MatchingPropagator, matching_from_names + >>> propagate_matching = MatchingPropagator() + >>> paths_ok, paths_ko = propagate_matching(tree, matching_from_names(matched_queries, names)) + +And then using :py:class:`HTMLMarker` to display it in html (you could make your own also):: + + >>> from luqum.naming import HTMLMarker + >>> mark_html = HTMLMarker() # you can customize some parameters, refer to doc + >>> mark_html(tree, paths_ok, paths_ko) + 'foo~2 OR (bar AND baz)' + -__ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-named-queries-and-filters.html +__ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-queries-and-filters diff --git a/luqum/elasticsearch/tree.py b/luqum/elasticsearch/tree.py index ce0465f..b5064aa 100644 --- a/luqum/elasticsearch/tree.py +++ b/luqum/elasticsearch/tree.py @@ -233,10 +233,11 @@ class ENested(AbstractEOperation): Take care to remove ENested children """ - def __init__(self, nested_path, nested_fields, items, *args, **kwargs): + def __init__(self, nested_path, nested_fields, items, *args, _name=None, **kwargs): self._nested_path = [nested_path] self.items = self._exclude_nested_children(items) + self._name = _name @property def nested_path(self): @@ -289,7 +290,10 @@ def _exclude_nested_children(self, subtree): @property def json(self): - return {'nested': {'path': self.nested_path, 'query': self.items.json}} + data = {'nested': {'path': self.nested_path, 'query': self.items.json}} + if self._name: + data['nested']['_name'] = self._name + return data class EShould(EOperation): diff --git a/luqum/elasticsearch/visitor.py b/luqum/elasticsearch/visitor.py index 9c4bf6a..9bec382 100644 --- a/luqum/elasticsearch/visitor.py +++ b/luqum/elasticsearch/visitor.py @@ -242,6 +242,19 @@ def _is_should(self, operation): self.default_operator == ElasticsearchQueryBuilder.SHOULD ) + def _propagate_name(self, node, child_context): + """if node has a name, put it in child_context to propagate it + """ + name = get_name(node) + if name: + child_context["name"] = name + + def get_name(self, node, context): + """get node name or take it from context (inherited from upper layers) + """ + node_name = get_name(node) + return node_name if node_name is not None else context.get("name") + def _yield_nested_children(self, parent, children): """ Raise if a OR (should) is in a AND (must) without being in parenthesis:: @@ -273,11 +286,13 @@ def _yield_nested_children(self, parent, children): def _binary_operation(self, cls, node, context): children = self.simplify_if_same(node.children, node) children = self._yield_nested_children(node, children) - (visit_iter) = super().visit_iter # can't use super inside the comprehension expression + visit_iter = super().visit_iter # can't use super inside the comprehension expression + child_context = dict(context) + self._propagate_name(node, child_context) items = [ item for child in children - for item in visit_iter(child, context) + for item in visit_iter(child, child_context) ] yield self.es_item_factory.build(cls, items) @@ -300,16 +315,20 @@ def visit_search_field(self, node, context): child_context = dict(context, parents=context.get("parents", ()) + (node,)) child_context[self.CONTEXT_ANALYZE_MARKER] = name not in self._not_analyzed_fields child_context[self.CONTEXT_FIELD_PREFIX] = prefix + self._propagate_name(node, child_context) enode, = self.visit_iter(node.expr, child_context) nested_path = self._split_nested(node, context) skip_nesting = isinstance(enode, ENested) # no need to nest a nested if nested_path is not None and not skip_nesting: - enode = self.es_item_factory.build(ENested, nested_path=nested_path, items=enode) + enode = self.es_item_factory.build( + ENested, nested_path=nested_path, items=enode, _name=self.get_name(node, context), + ) yield enode def visit_not(self, node, context): children = self.simplify_if_same(node.children, node) child_context = dict(context, parents=context.get("parents", ()) + (node,)) + self._propagate_name(node, child_context) items = [ item for child in children @@ -330,17 +349,17 @@ def visit_unknown_operation(self, *args, **kwargs): yield from self._must_operation(*args, **kwargs) def visit_boost(self, node, context): - eword, = super().generic_visit(node, context) + eword, = self.generic_visit(node, context) eword.boost = float(node.force) yield eword def visit_fuzzy(self, node, context): - eword, = super().generic_visit(node, context) + eword, = self.generic_visit(node, context) eword.fuzziness = float(node.degree) yield eword def visit_proximity(self, node, context): - ephrase, = super().generic_visit(node, context) + ephrase, = self.generic_visit(node, context) if self._is_analyzed(context): ephrase.slop = float(node.degree) else: @@ -348,6 +367,12 @@ def visit_proximity(self, node, context): ephrase.fuzziness = float(node.degree) yield ephrase + def generic_visit(self, node, context): + # propagate name + child_context = dict(context) + self._propagate_name(node, child_context) + yield from super().generic_visit(node, child_context) + def visit_word(self, node, context): if self._is_analyzed(context): if self.match_word_as_phrase: @@ -361,7 +386,7 @@ def visit_word(self, node, context): q=node.value, method=method, fields=self._fields(context), - _name=get_name(node), + _name=self.get_name(node, context), ) def visit_phrase(self, node, context): @@ -370,7 +395,7 @@ def visit_phrase(self, node, context): EPhrase, phrase=node.value, fields=self._fields(context), - _name=get_name(node), + _name=self.get_name(node, context), ) else: # in the case of a term, parenthesis are just there to escape spaces or colons @@ -378,7 +403,7 @@ def visit_phrase(self, node, context): EWord, q=node.value[1:-1], # remove quotes fields=self._fields(context), - _name=get_name(node), + _name=self.get_name(node, context), ) def visit_range(self, node, context): @@ -388,7 +413,7 @@ def visit_range(self, node, context): } yield self.es_item_factory.build( ERange, - _name=get_name(node), + _name=self.get_name(node, context), fields=self._fields(context), **kwargs ) diff --git a/luqum/naming.py b/luqum/naming.py index 21672c4..9e4de7e 100644 --- a/luqum/naming.py +++ b/luqum/naming.py @@ -6,7 +6,7 @@ This module adds support for that. """ from . import tree -from .visitor import TreeVisitor +from .visitor import PathTrackingVisitor, PathTrackingTransformer #: Names are added to tree items via an attribute named `_luqum_name` @@ -21,80 +21,252 @@ def get_name(node): return getattr(node, NAME_ATTR, None) -class TreeAutoNamer(TreeVisitor): - # helper for :py:func:`tree_name_index` - - DEFAULT_TARGETS = (tree.Range, tree.Term, tree.Fuzzy, tree.BaseApprox) - """by default targets are the one translated to a leaf term in elasticsearch - - :param tuple targets: class of elements that should receive a name - :param bool all_names: shall we name children of named elements ? +class TreeAutoNamer(PathTrackingVisitor): + """Helper for :py:func:`auto_name` """ LETTERS = "abcdefghilklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" - _letter_pos = {i: l for i, l in enumerate(LETTERS)} _pos_letter = {l: i for i, l in enumerate(LETTERS)} - def __init__(self, targets=None, all_names=False): - self.targets = targets if targets is not None else self.DEFAULT_TARGETS - self.all_names = all_names - super().__init__(track_parents=False) + def next_name(self, name): + """Given name, return next name - def generic_visit(self, node, context): - visit_children = True - if isinstance(node, self.targets): - # add name - self.context["name"] = name = self.next_name(context["name"]) - set_name(node, name) - # track name / path correspondance - context["name_to_path"][name] = tuple(context["path"]) - visit_children = self.all_names - if visit_children: - # continue with path trackings - for i, child in enumerate(node.children): - child_context = dict(context, path=context["path"] + [i]) - yield from self.visit_iter(child, context=child_context) + :: + >>> tan = TreeAutoNamer() + >>> tan.next_name(None) + 'a' + >>> tan.next_name('aZ') + 'aZa' + >>> tan.next_name('azb') + 'azc' + """ + if name is None: + # bootstrap + return self.LETTERS[0] else: - yield node + actual_pos = self._pos_letter[name[-1]] + try: + # we want to increment last letter + return name[:-1] + self.LETTERS[actual_pos + 1] + except IndexError: + # we exhausts letters, add a new one instead + return name + self.LETTERS[0] + + def visit_base_operation(self, node, context): + """name is to be set on children of operations + """ + # put a _name on each children + name = context["global"]["name"] + for i, child in enumerate(node.children): + name = self.next_name(name) + set_name(child, name) + # remember name to path + context["global"]["name_to_path"][name] = context["path"] + (i,) + # put name back in global context + context["global"]["name"] = name + yield from self.generic_visit(node, context) def visit(self, node): """visit the tree and add names to nodes while tracking their path """ - context = {"path": [], "name": None, "name_to_path": {}} - list(self.visit_iter(node, context=context)) - return context["name_to_path"] + # trick: we use a "global" dict inside context dict so that when we copy context, + # we still track the same objects + context = {"global": {"name": None, "name_to_path": {}}} + super().visit(node, context) + name_to_path = context["global"]["name_to_path"] + # handle special case, if we have no name so far, put one on the root + if not name_to_path: + node_name = self.next_name(context["global"]["name"]) + set_name(node, node_name) + name_to_path[node_name] = () + return name_to_path def auto_name(tree, targets=None, all_names=False): - """Automatically add names to nodes of a parse tree. + """Automatically add names to nodes of a parse tree, in order to be able to track matching. + + We add them to top nodes under operations as this is where it is useful for ES named queries + + :return dict: association of name with the path (as a tuple) to a the corresponding children + """ + return TreeAutoNamer().visit(tree) + + +def matching_from_names(names, name_to_path): + """Utility to convert a list of name and the result of auto_name + to the matching parameter for :py:class:`MatchingPropagator` - We add them to terminal nodes : range, phrases and words, as this is where it is useful. + :param list names: list of names + :param dict name_to_path: association of names with path to children + :return set: corresponding list of matching path + """ + return {name_to_path[name] for name in names} + + +def element_from_path(tree, path): + """Given a tree, retrieve element corresponding to path + + :param luqum.tree.Item tree: luqum expression tree + :param tuple path: tuple representing top down access to a child + :return luqum.tree.Item: target item + """ + # python likes iterations over recursivity + node = tree + path = list(path) + while path: + node = node.children[path.pop(0)] + return node + + +def element_from_name(tree, name, name_to_path): + return element_from_path(tree, name_to_path[name]) + + +class MatchingPropagator: + """Class propagating matching to upper elements based on known base element matching + + :param luqum.tree.Item default_operation: tells how to treat UnknownOperation. + Choose between :py:class:`luqum.tree.OrOperation` and :py:class:`luqum.tree.AndOperation` + """ - :return: a dict giving the path to a the children having this name + OR_NODES = (tree.OrOperation,) + """A tuple of nodes types considered as OR operations """ - return TreeAutoNamer().visit(tree, targets, all_names) + NEGATION_NODES = (tree.Not, tree.Prohibit) + """A tuple of nodes types considered as NOT operations + """ + NO_CHILDREN_PROPAGATE = (tree.Range, tree.BaseApprox) + """A tuple of nodes for which propagation is of no use + """ + + def __init__(self, default_operation=tree.OrOperation): + if default_operation is tree.OrOperation: + self.OR_NODES = self.OR_NODES + (tree.UnknownOperation,) + def _propagate(self, node, matching, path): + """recursively propagate matching -def name_index(tree, name_to_path): - """Given a tree with names, give the index of each group in the string representation. - also gives the node type. + return tuple: ( + node is matching, + set of pathes of matching sub nodes, + set of pathes of non matching sub nodes) + """ + if path not in matching: + if node.children and not isinstance(node, self.NO_CHILDREN_PROPAGATE): + paths_ok = set() # path of nodes that are matching + paths_ko = set() # path of nodes that are not matching + children_status = [] # bool for each children, indicating if it matches or not + # children first, for our result may depend on them + for i, child in enumerate(node.children): + child_ok, sub_ok, sub_ko = self._propagate(child, matching, path + (i,)) + paths_ok.update(sub_ok) + paths_ko.update(sub_ko) + children_status.append(child_ok) + # compute parent success from children + operator = any if isinstance(node, self.OR_NODES) else all + node_ok = operator(children_status) + # eventually negate result + if isinstance(node, self.NEGATION_NODES): + node_ok = not node_ok + paths_ok, paths_ko = paths_ko, paths_ok + # add path nod to the right set + target_set = paths_ok if node_ok else paths_ko + target_set.add(path) + # return result + return node_ok, paths_ok, paths_ko + else: + # non matching final node + return False, set(), {path} + else: + # single node matching + return True, {path}, set() - .. warning:: this is not an efficient implementation, - It will call str representation several times on each item, and seek for substrings. + def __call__(self, tree, matching=None): + """ + Given a list of paths that are known to match, + return all pathes in the tree that are matches. - see :py:class:`TreeNameIndexer` + .. note:: we do not descend into nodes that are positive. + Normally matching just provides nodes at the right levels + for propagation to be effective. + Descending would mean risking to give non consistent information. + :param list matching: list of path of matching nodes (each path is a tuple) - :param tree: a luqum parse tree - :return dict: mapping each name to a `(start position, length)` tuple + :return tuple: ( + set of matching path after propagation, + set of non matching pathes after propagation) + """ + tree_ok, paths_ok, paths_ko = self._propagate(tree, matching, ()) + return paths_ok, paths_ko + + +class ExpressionMarker(PathTrackingTransformer): + """A visitor to mark a tree based on elements belonging to a path or not + + One intended usage is to add marker around nodes matching a request, + by altering tail and head of elements """ + def mark_node(self, node, path, *info): + """implement this in your own code, maybe altering the head / tail arguments + """ + return node + + def generic_visit(self, node, context): + # we simply generate new_node and mark it + new_node, = super().generic_visit(node, context) + yield self.mark_node(new_node, context["path"], *context["info"]) + + def __call__(self, tree, *info): + return self.visit(tree, context={"info": info}) -def extract(expr, name, name_index): - """extract named part of expression, using name_index - :param str expr: the lucene expression - :param str name: name of the part to extract - :param dict name_index: the dict obtained from :py:func:`name_index` +class HTMLMarker(ExpressionMarker): + """from paths that are ok or ko, add html elements with right class around elements + + :param str ok_class: class for elements in paths_ok + :param str ko_class: class for elements in paths_ko + :param str element: html element used to surround sub expressions """ - return expr[name_index[name][0]: name_index[name][0] + name_index[name][1]] + + def __init__(self, ok_class="ok", ko_class="ko", element="span"): + super().__init__() + self.ok_class = ok_class + self.ko_class = ko_class + self.element = element + + def css_class(self, path, paths_ok, paths_ko): + return self.ok_class if path in paths_ok else self.ko_class if path in paths_ko else None + + def mark_node(self, node, path, paths_ok, paths_ko, parcimonious): + node_class = self.css_class(path, paths_ok, paths_ko) + add_class = node_class is not None + if add_class and parcimonious: + # find nearest parent with a class + parent_class = None + parent_path = path + while parent_class is None and parent_path: + parent_path = parent_path[:-1] + parent_class = self.css_class(parent_path, paths_ok, paths_ko) + # only add class if different from parent + add_class = node_class != parent_class + if add_class: + node.head = f'<{self.element} class="{node_class}">{node.head}' + node.tail = f'{node.tail}' + return node + + def __call__(self, tree, paths_ok, paths_ko, parcimonious=True): + """representation of tree, adding html elements with right class around subexpressions + according to their presence in paths_ok or paths_ko + + :param tree: a luqum tree + :param paths_ok: set of path to nodes (express as tuple of int) that should get ok_class + :param paths_ko: set of path to nodes that should get ko_class + :param parcimonious: only add class when parent node does not have same class + + :return str: expression with html elements surrounding part of expression + with right class attribute according to paths_ok and paths_ko + """ + new_tree = super().__call__(tree, paths_ok, paths_ko, parcimonious) + return new_tree.__str__(head_tail=True) diff --git a/luqum/tree.py b/luqum/tree.py index 5a71965..2961338 100644 --- a/luqum/tree.py +++ b/luqum/tree.py @@ -53,7 +53,7 @@ def clone_item(self, **kwargs): This is particularly useful for the :py:class:`.visitor.TreeTransformer` pattern. - :param dict kwargs: those item will be added to `__init__` call. + :param dict kwargs: those item will be added to `__init__` call. It's a simple way to change some values of target item. """ return self._clone_item(cls=type(self), **kwargs) @@ -444,8 +444,8 @@ class AndOperation(BaseOperation): def create_operation(cls, a, b, op_tail=" "): """Create operation between a and b, merging if a or b is already an operation of same class - :param a: left operand - :param b: right operand + :param a: left operand + :param b: right operand :param op_tail: tail of operation token """ operands = [] diff --git a/luqum/visitor.py b/luqum/visitor.py index 568f3c6..7f66440 100644 --- a/luqum/visitor.py +++ b/luqum/visitor.py @@ -90,17 +90,37 @@ def visit_iter(self, node, context): method = self._get_method(node) yield from method(node, context) + def child_context(self, node, child, context, **kwargs): + """Generate a context for children. + + The context children is distinct from its parent context, + so that visit in a branch does not affect others. + + .. note:: If you need global parameters, + a trick is to put them in dict in a "global" entry + as we do a swallow copy of context, and not a deep one. + + :param luqum.tree.Item node: parent node + :param luqum.tree.Item child: child node + :param dict context: parent context + :return dict: child context + """ + child_context = dict(context) + if self.track_parents: + child_context["parents"] = context.get("parents", ()) + (node,) + return child_context + def generic_visit(self, node, context): """ Default visitor function, called if nothing matches the current node. It simply visit children. + + :param luqum.tree.Item node: current node + :param dict context: context (aka local parameters received from parents) """ - if self.track_parents: - child_context = dict(context, parents=context.get("parents", ()) + (node,)) - else: - child_context = context for child in node.children: + child_context = self.child_context(node, child, context) yield from self.visit_iter(child, context=child_context) @@ -108,7 +128,9 @@ class TreeTransformer(TreeVisitor): """A version of TreeVisitor that is aimed at obtaining a transformed copy of tree. .. note:: It is far better to build a transformed copy, - than to modify in place the original tree, as it is less error prone. + than to modify in place the original tree, as it is less error prone. + + :param bool track_new_parents: do we want to track new parents in the context ? """ def __init__(self, track_new_parents=False, **kwargs): @@ -123,6 +145,11 @@ def _clone_item(self, node): return node.clone_item() def visit(self, tree, context=None): + """Visit the tree, by default building a copy and returning it. + + :param luqum.tree.Item tree: luqum expression tree + :param context: optional initial context + """ if context is None: context = {} try: @@ -138,6 +165,12 @@ def visit(self, tree, context=None): else: raise + def child_context(self, node, child, context, **kwargs): + child_context = super().child_context(node, child, context, **kwargs) + if self.track_new_parents: + child_context["new_parents"] = context.get("new_parents", ()) + (kwargs["new_node"],) + return child_context + def generic_visit(self, node, context): """ Default visitor function, called if nothing matches the current node. @@ -145,7 +178,7 @@ def generic_visit(self, node, context): It simply clone node and children """ new_node = self._clone_item(node) - new_node.children = self.clone_children(node, new_node, context) + new_node.children = list(self.clone_children(node, new_node, context)) yield new_node def clone_children(self, node, new_node, context): @@ -154,16 +187,52 @@ def clone_children(self, node, new_node, context): .. note:: a children may generate more than one children or none, for flexibility but it's up to the transformer to ensure everything is ok """ - child_context = dict(context) - if self.track_parents: - child_context["parents"] = context.get("parents", ()) + (node,) - if self.track_new_parents: - child_context["new_parents"] = context.get("new_parents", ()) + (new_node,) - new_children = [ - new_child - for child in node.children - for new_child in self.visit_iter(child, context=child_context) - ] - # it's list, so we keep the iterator interface - # and it may be easier to debug - return new_children + for child in node.children: + child_context = self.child_context(node, child, context, new_node=new_node) + new_children = self.visit_iter(child, context=child_context) + for new_child in new_children: + yield new_child + + +class PathTrackingMixin: + """It can be useful to compute path of an elements (as tuple of index in parent children) + + This mixin provides base components + """ + + def child_context(self, node, child, context, **kwargs): + """Thanks to "path" and "position" in kwargs, we add the path of children + """ + child_context = super().child_context(node, child, context, **kwargs) + child_context["path"] = context["path"] + (kwargs["position"],) + return child_context + + def visit(self, node, context=None): + """visit the tree while tracking their path + """ + if context is None: + context = {} + context["path"] = () + return super().visit(node, context=context) + + +class PathTrackingVisitor(PathTrackingMixin, TreeVisitor): + """Path tracking version of TreeVisitor + """ + + def generic_visit(self, node, context): + for i, child in enumerate(node.children): + child_context = self.child_context(node, child, context, position=i) + yield from self.visit_iter(child, context=child_context) + + +class PathTrackingTransformer(PathTrackingMixin, TreeTransformer): + """Path tracking version of TreeTransformer + """ + + def clone_children(self, node, new_node, context): + for i, child in enumerate(node.children): + child_context = self.child_context(node, child, context, new_node=new_node, position=i) + new_children = self.visit_iter(child, context=child_context) + for new_child in new_children: + yield new_child diff --git a/tests/test_elasticsearch/book.json b/tests/test_elasticsearch/book.json index afe689c..28921b1 100644 --- a/tests/test_elasticsearch/book.json +++ b/tests/test_elasticsearch/book.json @@ -20,7 +20,8 @@ } ], "publication_date": "1997-06-26", - "n_pages": "223" + "n_pages": "223", + "ref": "HP1" }, { "title": "Harry Potter and the Chamber of Secrets", @@ -42,7 +43,8 @@ } ], "publication_date": "1998-07-02", - "n_pages": "251" + "n_pages": "251", + "ref": "HP2" }, { "title": "Harry Potter and the Prisoner of Azkaban", @@ -64,7 +66,8 @@ } ], "publication_date": "1999-07-08", - "n_pages": "317" + "n_pages": "317", + "ref": "HP3" }, { "title": "Harry Potter and the Goblet of Fire", @@ -85,7 +88,8 @@ } ], "publication_date": "2000-07-08", - "n_pages": "636" + "n_pages": "636", + "ref": "HP4" }, { "title": "Harry Potter and the Order of the Phoenix", @@ -106,7 +110,8 @@ } ], "publication_date": "2003-06-21", - "n_pages": "766" + "n_pages": "766", + "ref": "HP5" }, { "title": "Harry Potter and the Half-Blood Prince", @@ -127,7 +132,8 @@ } ], "publication_date": "2005-07-16", - "n_pages": "607" + "n_pages": "607", + "ref": "HP6" }, { "title": "Harry Potter and the Deathly Hallows", @@ -148,7 +154,8 @@ } ], "publication_date": "2007-07-21", - "n_pages": "607" + "n_pages": "607", + "ref": "HP7" }, { "title": "Harry Potter and the Cursed Child", @@ -159,7 +166,8 @@ }, "illustrators": [], "publication_date": "2016-07-30", - "n_pages": "360" + "n_pages": "360", + "ref": "HP8" }, { "title": "The Tales of Beedle the Bard", @@ -176,7 +184,8 @@ } ], "publication_date": "2008-12-04", - "n_pages": "157" + "n_pages": "157", + "ref": "BB1" } ] } diff --git a/tests/test_elasticsearch/es_integration_utils.py b/tests/test_elasticsearch/es_integration_utils.py index b682d07..232ec30 100644 --- a/tests/test_elasticsearch/es_integration_utils.py +++ b/tests/test_elasticsearch/es_integration_utils.py @@ -65,6 +65,7 @@ class Book(Document): search_analyzer="standard" ) }) + ref = Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed") edition = Text() author = Object(properties={"name": Text(), "birthdate": Date()}) publication_date = Date() @@ -81,9 +82,7 @@ class Index: properties={ "name": Text(), "birthdate": Date(), - "nationality": Keyword() - if MAJOR_ES > 2 - else Text(index="not_analyzed"), + "nationality": Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed"), } ) @@ -94,10 +93,10 @@ class Meta: def add_book_data(es): """Create a "bk" index and fill it with data """ + remove_book_index(es) Book.init() with open(os.path.join(os.path.dirname(__file__), "book.json")) as f: datas = json.load(f) - actions = ( {"_op_type": "index", "_id": i, "_source": d} for i, d in enumerate(datas["books"]) @@ -125,7 +124,6 @@ def book_query_builder(es): """ MESSAGES_SCHEMA = {"mappings": Book._doc_type.mapping.to_dict()} schema_analizer = SchemaAnalyzer(MESSAGES_SCHEMA) - builder_options = schema_analizer.query_builder_options() builder_options['field_options'] = { 'title.no_vowels': { @@ -143,6 +141,6 @@ def remove_book_index(es): if es is None: return if ES6: - Book._index.delete() + Book._index.delete(ignore=404) else: Index("bk").delete(ignore=404) diff --git a/tests/test_elasticsearch/test_es_integration.py b/tests/test_elasticsearch/test_es_integration.py index 3cfbb6c..58d1a74 100644 --- a/tests/test_elasticsearch/test_es_integration.py +++ b/tests/test_elasticsearch/test_es_integration.py @@ -7,7 +7,7 @@ ) -@skipIf(get_es() is None, "Skipping ES test as I can't reach ES") +@skipIf(get_es() is None, "Skipping ES test as ES seems unreachable") class LuqumRequestTestCase(TestCase): @classmethod diff --git a/tests/test_elasticsearch/test_es_naming.py b/tests/test_elasticsearch/test_es_naming.py new file mode 100644 index 0000000..d9611c0 --- /dev/null +++ b/tests/test_elasticsearch/test_es_naming.py @@ -0,0 +1,193 @@ +from unittest import TestCase, skipIf + +from elasticsearch_dsl import Q + +from luqum.naming import ( + auto_name, element_from_name, HTMLMarker, matching_from_names, MatchingPropagator, +) +from luqum.parser import parser + + +from .es_integration_utils import ( + add_book_data, book_query_builder, book_search, get_es, remove_book_index, +) + + +@skipIf(get_es() is None, "Skipping ES test as ES seems unreachable") +class LuqumNamingTestCase(TestCase): + """This test is testing naming of queries integration with ES + """ + + @classmethod + def setUpClass(cls): + cls.es_client = get_es() + if cls.es_client is None: + return + cls.es_builder = book_query_builder(cls.es_client) + cls.propagate_matching = MatchingPropagator() + cls.search = book_search(cls.es_client) + add_book_data(cls.es_client) + cls.make_html = HTMLMarker() + + def test_keyword_naming(self): + ltree = parser.parse("illustrators.nationality:UK") + names = auto_name(ltree) + query = self.es_builder(ltree) + results = list(self.search.filter(query).execute()) + book = results[0] + self.assertEqual(book.meta.matched_queries, ["a"]) + self.assertEqual( + element_from_name(ltree, book.meta.matched_queries[0], names), + ltree, + ) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'illustrators.nationality:UK', + ) + + def test_or_operation(self): + ltree = parser.parse("n_pages:360 OR edition:Lumos") + names = auto_name(ltree) + query = self.es_builder(ltree) + # the one matching Lumos + book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) + self.assertEqual(len(book.meta.matched_queries), 1) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'n_pages:360 OR edition:Lumos', + ) + # the one matching n_pages + book, = list(self.search.filter(query).filter("term", ref="HP8").execute()) + self.assertEqual(len(book.meta.matched_queries), 1) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'n_pages:360 OR edition:Lumos', + ) + # matching None + book, = list(self.search.filter(Q(query) | Q("match_all")).filter(Q("term", ref="HP7")).execute()) + self.assertFalse(hasattr(book.meta, "matched_queries")) + + def test_and_operation_matching(self): + ltree = parser.parse("n_pages:157 AND edition:Lumos") + names = auto_name(ltree) + query = self.es_builder(ltree) + # matching Lumos and n_pages + book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) + self.assertEqual(len(book.meta.matched_queries), 2) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'n_pages:157 AND edition:Lumos', + ) + + def test_and_operation_not_matching(self): + ltree = parser.parse("n_pages:360 AND edition:Lumos") + names = auto_name(ltree) + query = self.es_builder(ltree) + # matching only Lumos + book, = list(self.search.filter(Q(query) | Q("term", ref="BB1")).execute()) + self.assertEqual(len(book.meta.matched_queries), 1) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'n_pages:360 AND edition:Lumos', + ) + # matching None + book, = list(self.search.filter(Q(query) | Q("term", ref="HP7")).execute()) + self.assertFalse(hasattr(book.meta, "matched_queries")) + + def _negation_test(self, operator): + ltree = parser.parse(f"{operator}n_pages:360 AND edition:Lumos") + names = auto_name(ltree) + query = self.es_builder(ltree) + # matching Lumos + book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) + self.assertEqual(len(book.meta.matched_queries), 1) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + f'{operator}n_pages:360 AND edition:Lumos', + ) + # matching n_pages and not lumos + book, = list(self.search.filter(Q(query) | Q("term", ref="HP8")).exclude(Q("term", ref="BB1")).execute()) + self.assertEqual(len(book.meta.matched_queries), 1) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + f'{operator}n_pages:360 ' + f'AND edition:Lumos', + ) + # matching none + book, = list(self.search.filter(Q(query) | Q("term", ref="HP7")).exclude(Q("term", ref="BB1")).execute()) + self.assertFalse(hasattr(book.meta, "matched_queries")) + + def test_not(self): + self._negation_test("NOT ") + + def test_minus(self): + self._negation_test("-") + + def _simple_test(self, matching_query, ref, num_match=1): + """simple scenario + + :param str matching_query: the query that match the book + :param str ref: ref of expected matching book + """ + ltree = parser.parse(f"{matching_query} OR n_pages:1000") + names = auto_name(ltree) + query = self.es_builder(ltree) + book, = list(self.search.filter(query).execute()) + self.assertEqual(book.ref, ref) + self.assertEqual(len(book.meta.matched_queries), num_match) + paths_ok, paths_ko = self.propagate_matching( + ltree, matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + f'{matching_query} OR n_pages:1000', + ) + + def test_fuzzy(self): + self._simple_test("ref:BB~1", "BB1") + + def test_proximity(self): + self._simple_test('title:"Harry Potter Phoenix"~6', "HP5") + + def test_boost(self): + self._simple_test('title:"Phoenix"^4', "HP5") + + def test_plus(self): + self._simple_test('+title:"Phoenix"', "HP5") + + def test_range(self): + self._simple_test('publication_date:[2000-01-01 TO 2001-01-01]', "HP4") + + def test_field_group(self): + self._simple_test('title:(Phoenix AND Potter)', "HP5", num_match=2) + + def test_group(self): + self._simple_test('(title:Phoenix AND ref:HP5)', "HP5", num_match=2) + + def test_unknown_operation(self): + self._simple_test('(title:Phoenix ref:HP5)', "HP5", num_match=2) + + @classmethod + def tearDownClass(cls): + remove_book_index(cls.es_client) diff --git a/tests/test_elasticsearch/test_naming.py b/tests/test_elasticsearch/test_naming.py new file mode 100644 index 0000000..a78c218 --- /dev/null +++ b/tests/test_elasticsearch/test_naming.py @@ -0,0 +1,250 @@ +from unittest import TestCase + +from luqum.tree import ( + AndOperation, Word, Prohibit, OrOperation, Not, Phrase, SearchField, + UnknownOperation, Boost, Fuzzy, Proximity, Range, Group, FieldGroup, + Plus) +from luqum.naming import auto_name, set_name +from luqum.elasticsearch.visitor import ElasticsearchQueryBuilder + + +class ElasticsearchTreeTransformerTestCase(TestCase): + + @classmethod + def setUpClass(cls): + cls.transformer = ElasticsearchQueryBuilder( + default_field="text", + not_analyzed_fields=['not_analyzed_field', 'text', 'author.tag'], + nested_fields={ + 'author': ['name', 'tag'] + }, + object_fields=["book.title", "author.rewards.name"], + sub_fields=["book.title.raw"], + ) + + def test_named_queries_match(self): + tree = SearchField("spam", Word("bar")) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + { + "match": { + "spam": { + "query": "bar", + "_name": "a", + "zero_terms_query": "none", + }, + }, + }, + ) + + tree = SearchField("spam", Phrase('"foo bar"')) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + { + "match_phrase": { + "spam": { + "query": "foo bar", + "_name": "a", + }, + }, + }, + ) + + def test_named_queries_term(self): + tree = SearchField("text", Word("bar")) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + {"term": {"text": {"value": "bar", "_name": "a"}}}, + ) + + tree = SearchField("text", Phrase('"foo bar"')) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + {"term": {"text": {"value": "foo bar", "_name": "a"}}}, + ) + + def test_named_queries_fuzzy(self): + tree = SearchField("text", Fuzzy(Word('bar'))) + set_name(tree.children[0], "a") + result = self.transformer(tree) + self.assertEqual( + result, + {"fuzzy": {"text": {"value": "bar", "_name": "a", 'fuzziness': 0.5}}}, + ) + + def test_named_queries_proximity(self): + tree = SearchField("spam", Proximity(Phrase('"foo bar"'))) + set_name(tree.children[0], "a") + result = self.transformer(tree) + self.assertEqual( + result, + {"match_phrase": {"spam": {"query": "foo bar", "_name": "a", 'slop': 1.0}}}, + ) + + def test_named_queries_boost(self): + tree = SearchField("text", Boost(Phrase('"foo bar"'), force=2)) + set_name(tree.children[0], "a") + result = self.transformer(tree) + self.assertEqual( + result, + {"term": {"text": {"value": "foo bar", "_name": "a", 'boost': 2.0}}}, + ) + + def test_named_queries_or(self): + tree = OrOperation(SearchField("text", Word("foo")), SearchField("spam", Word("bar"))) + set_name(tree.operands[0], "a") + set_name(tree.operands[1], "b") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'should': [ + {'term': {'text': {'_name': 'a', 'value': 'foo'}}}, + {'match': {'spam': {'_name': 'b', 'query': 'bar', 'zero_terms_query': 'none'}}} + ]}} + ) + + def test_named_queries_and(self): + tree = AndOperation(SearchField("text", Word("foo")), SearchField("spam", Word("bar"))) + set_name(tree.operands[0], "a") + set_name(tree.operands[1], "b") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'must': [ + {'term': {'text': {'_name': 'a', 'value': 'foo'}}}, + {'match': {'spam': {'_name': 'b', 'query': 'bar', 'zero_terms_query': 'all'}}} + ]}} + ) + + def test_named_queries_unknown(self): + tree = UnknownOperation(SearchField("text", Word("foo")), SearchField("spam", Word("bar"))) + set_name(tree.operands[0], "a") + set_name(tree.operands[1], "b") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'should': [ + {'term': {'text': {'_name': 'a', 'value': 'foo'}}}, + {'match': {'spam': {'_name': 'b', 'query': 'bar', 'zero_terms_query': 'none'}}} + ]}} + ) + + def test_named_queries_not(self): + tree = Not(SearchField("text", Word("foo"))) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'must_not': [{'term': {'text': {'_name': 'a', 'value': 'foo'}}}]}} + ) + + tree = Prohibit(SearchField("text", Word("foo"))) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'must_not': [{'term': {'text': {'_name': 'a', 'value': 'foo'}}}]}} + ) + + def test_named_queries_plus(self): + tree = Plus(SearchField("text", Word("foo"))) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual( + result, + {'bool': {'must': [{'term': {'text': {'_name': 'a', 'value': 'foo'}}}]}} + ) + + def test_named_queries_range(self): + tree = SearchField("text", Range(Word("x"), Word("z"))) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual(result, {'range': {'text': {'_name': 'a', 'gte': 'x', 'lte': 'z'}}}) + + def test_named_queries_group(self): + tree = SearchField("text", FieldGroup(Word("bar"))) + set_name(tree.children[0], "a") + result = self.transformer(tree) + self.assertEqual(result, {"term": {"text": {"value": "bar", "_name": "a"}}},) + + tree = Group(SearchField("text", Word("bar"))) + set_name(tree, "a") + result = self.transformer(tree) + self.assertEqual(result, {"term": {"text": {"value": "bar", "_name": "a"}}},) + + def test_named_queries_complex(self): + tree = ( + AndOperation( + SearchField("text", Phrase('"foo bar"')), + Group( + OrOperation( + Word("bar"), + SearchField("spam", Word("baz")), + ), + ), + ) + ) + and_op = tree + search_text = and_op.operands[0] + or_op = and_op.operands[1].children[0] + bar = or_op.operands[0] + search_spam = or_op.operands[1] + set_name(search_text, "foo_bar") + set_name(bar, "bar") + set_name(search_spam, "baz") + + expected = { + 'bool': {'must': [ + {'term': {'text': {'_name': 'foo_bar', 'value': 'foo bar'}}}, + {'bool': {'should': [ + {'term': {'text': {'_name': 'bar', 'value': 'bar'}}}, + {'match': {'spam': { + '_name': 'baz', + 'query': 'baz', + 'zero_terms_query': 'none' + }}} + ]}} + ]} + } + + result = self.transformer(tree) + self.assertEqual(result, expected) + + def test_auto_name_integration(self): + tree = ( + AndOperation( + SearchField("text", Phrase('"foo bar"')), + Group( + OrOperation( + Word("bar"), + SearchField("spam", Word("baz")), + ), + ), + ) + ) + auto_name(tree) + + expected = { + 'bool': {'must': [ + {'term': {'text': {'_name': 'a', 'value': 'foo bar'}}}, + {'bool': {'should': [ + {'term': {'text': {'_name': 'c', 'value': 'bar'}}}, + {'match': {'spam': { + '_name': 'd', + 'query': 'baz', + 'zero_terms_query': 'none' + }}} + ]}} + ]} + } + + result = self.transformer(tree) + self.assertEqual(result, expected) diff --git a/tests/test_elasticsearch/tests.py b/tests/test_elasticsearch/tests.py index a00a346..286f623 100644 --- a/tests/test_elasticsearch/tests.py +++ b/tests/test_elasticsearch/tests.py @@ -7,15 +7,15 @@ AndOperation, Word, Prohibit, OrOperation, Not, Phrase, SearchField, UnknownOperation, Boost, Fuzzy, Proximity, Range, Group, FieldGroup, Plus) -from luqum.naming import set_name from luqum.elasticsearch.tree import ElasticSearchItemFactory from luqum.elasticsearch.visitor import EWord, ElasticsearchQueryBuilder class ElasticsearchTreeTransformerTestCase(TestCase): - def setUp(self): - self.transformer = ElasticsearchQueryBuilder( + @classmethod + def setUpClass(cls): + cls.transformer = ElasticsearchQueryBuilder( default_field="text", not_analyzed_fields=['not_analyzed_field', 'text', 'author.tag'], nested_fields={ @@ -484,110 +484,6 @@ def test_no_analyze_should_follow_nested(self): result = self.transformer(tree) self.assertDictEqual(result, expected) - def test_named_queries_simple(self): - tree = SearchField("spam", Word("bar")) - set_name(tree.children[0], "0") - result = self.transformer(tree) - self.assertEqual( - result, - { - "match": { - "spam": { - "query": "bar", - "_name": "0", - "zero_terms_query": "none", - }, - }, - }, - ) - - tree = SearchField("spam", Phrase('"foo bar"')) - set_name(tree.children[0], "0") - result = self.transformer(tree) - self.assertEqual( - result, - { - "match_phrase": { - "spam": { - "query": "foo bar", - "_name": "0", - }, - }, - }, - ) - - tree = SearchField("text", Word("bar")) - set_name(tree.children[0], "0") - result = self.transformer(tree) - self.assertEqual( - result, - {"term": {"text": {"value": "bar", "_name": "0"}}}, - ) - - tree = SearchField("text", Phrase('"foo bar"')) - set_name(tree.children[0], "0") - result = self.transformer(tree) - self.assertEqual( - result, - {"term": {"text": {"value": "foo bar", "_name": "0"}}}, - ) - - tree = SearchField("text", Fuzzy(Word('bar'))) - set_name(tree.expr.term, "0") - result = self.transformer(tree) - self.assertEqual( - result, - {"fuzzy": {"text": {"value": "bar", "_name": "0", 'fuzziness': 0.5}}}, - ) - - tree = SearchField("text", Fuzzy(Phrase('"foo bar"'))) - set_name(tree.expr.term, "0") - result = self.transformer(tree) - self.assertEqual( - result, - {"fuzzy": {"text": {"value": "foo bar", "_name": "0", 'fuzziness': 0.5}}}, - ) - - def test_named_queries_nested_auto_name(self): - tree = ( - AndOperation( - SearchField("text", Phrase('"foo bar"')), - Group( - OrOperation( - Word("bar"), - SearchField("spam", Word("baz")), - ), - ), - ) - ) - and_op = tree - foo_bar = and_op.operands[0].expr - or_op = and_op.operands[1].expr - bar = or_op.operands[0] - baz = or_op.operands[1].expr - set_name(and_op, "and_op") - set_name(foo_bar, "foo_bar") - set_name(or_op, "or_op") - set_name(bar, "bar") - set_name(baz, "baz") - - expected = { - 'bool': {'must': [ - {'term': {'text': {'_name': 'foo_bar', 'value': 'foo bar'}}}, - {'bool': {'should': [ - {'term': {'text': {'_name': 'bar', 'value': 'bar'}}}, - {'match': {'spam': { - '_name': 'baz', - 'query': 'baz', - 'zero_terms_query': 'none' - }}} - ]}} - ]} - } - - result = self.transformer(tree) - self.assertEqual(result, expected) - def test_match_word_as_phrase_option(self): tree = AndOperation( SearchField("foo", Word("bar")), @@ -763,7 +659,8 @@ class ElasticsearchTreeTransformerRealQueriesTestCase(TestCase): """Those are tests issued from bugs found thanks to jurismarches requests """ - def setUp(self): + @classmethod + def setUpClass(cls): NO_ANALYZE = [ "type", "statut", "pays", "pays_acheteur", "pays_acheteur_display", @@ -775,7 +672,7 @@ def setUp(self): "profils_en_cours", "profils_exclus", "profils_historiques" ] - self.transformer = ElasticsearchQueryBuilder( + cls.transformer = ElasticsearchQueryBuilder( default_field="text", not_analyzed_fields=NO_ANALYZE, default_operator=ElasticsearchQueryBuilder.MUST, @@ -928,7 +825,8 @@ class NestedAndObjectFieldsTestCase(TestCase): """Test around nested fields and object fields """ - def setUp(self): + @classmethod + def setUpClass(cls): NO_ANALYZE = [ 'author.book.format.type', @@ -972,7 +870,7 @@ def setUp(self): 'author.book.isbn.ref.lower', ] - self.transformer = ElasticsearchQueryBuilder( + cls.transformer = ElasticsearchQueryBuilder( default_field="text", not_analyzed_fields=NO_ANALYZE, nested_fields=NESTED_FIELDS, diff --git a/tests/test_naming.py b/tests/test_naming.py index 4376bf6..b80466c 100644 --- a/tests/test_naming.py +++ b/tests/test_naming.py @@ -1,27 +1,78 @@ # -*- coding: utf-8 -*- from unittest import TestCase -from luqum.naming import get_name, set_name, auto_name, name_index, extract +from luqum.naming import ( + auto_name, element_from_name, element_from_path, ExpressionMarker, get_name, + HTMLMarker, matching_from_names, MatchingPropagator, set_name, +) from luqum.parser import parser from luqum.tree import ( AndOperation, OrOperation, UnknownOperation, SearchField, - Fuzzy, Proximity, Word, Phrase, Range, Group) + Fuzzy, Proximity, Word, Phrase, Range, Regex, Group, FieldGroup, + Plus, Not, Prohibit, Boost, Term, +) + + +def names_to_path(node, path=()): + names = {} + node_name = get_name(node) + if node_name: + names[node_name] = path + for i, child in enumerate(node.children): + names.update(names_to_path(child, path=path + (i,))) + return names + + +def simple_naming(node, names=None, path=()): + """utility to name a big tree, using the node class name or its content if it's a term + + If a name is repeated, it will add numbers. For example : `"or", "or2", "or3, …`. + + return dict: mapping names to path + """ + if names is None: + names = {} + if isinstance(node, (Term)): + node_name = node.value.strip('"').strip("/").lower() + else: + node_name = type(node).__name__.lower() + if node_name.endswith("operation"): + node_name = node_name[:-9] + if node_name in names: + node_name += str(1 + sum(1 for n in names if n.startswith(node_name))) + set_name(node, node_name) + names[node_name] = path + for i, child in enumerate(node.children): + simple_naming(child, names, path=path + (i,)) + return names + + +def paths_to_names(tree, paths): + return {get_name(element_from_path(tree, path)) for path in paths} class AutoNameTestCase(TestCase): def test_auto_name_one_term(self): tree = Word("test") - auto_name(tree) - self.assertEqual(get_name(tree), "0") + names = auto_name(tree) + self.assertEqual(get_name(tree), "a") + self.assertEqual(names, {"a": ()}) tree = Phrase('"test"') - auto_name(tree) - self.assertEqual(get_name(tree), "0") + names = auto_name(tree) + self.assertEqual(get_name(tree), "a") + self.assertEqual(names, {"a": ()}) + + tree = Range(Word("test"), Word("*")) + names = auto_name(tree) + self.assertEqual(get_name(tree), "a") + self.assertEqual(names, {"a": ()}) - tree = Range("test", "*") - auto_name(tree) - self.assertEqual(get_name(tree), "0") + tree = Regex("/test/") + names = auto_name(tree) + self.assertEqual(get_name(tree), "a") + self.assertEqual(names, {"a": ()}) def test_auto_name_simple_op(self): for OpCls in AndOperation, OrOperation, UnknownOperation: @@ -30,10 +81,11 @@ def test_auto_name_simple_op(self): Word("test"), Phrase('"test"'), ) - auto_name(tree) - self.assertEqual(get_name(tree), "0") - self.assertEqual(get_name(tree.children[0]), "0_0") - self.assertEqual(get_name(tree.children[1]), "0_1") + names = auto_name(tree) + self.assertEqual(get_name(tree), None) + self.assertEqual(get_name(tree.children[0]), "a") + self.assertEqual(get_name(tree.children[1]), "b") + self.assertEqual(names, {"a": (0,), "b": (1,)}) def test_auto_name_nested(self): tree = AndOperation( @@ -51,118 +103,343 @@ def test_auto_name_nested(self): ), ), ) - - auto_name(tree) + names = auto_name(tree) + self.assertEqual(sorted(names.keys()), list("abcdefgh")) # and and1 = tree - self.assertEqual(get_name(and1), "0") + self.assertEqual(get_name(and1), None) # - or or1 = and1.children[0] - self.assertEqual(get_name(or1), "0_0") - # --- search field word + self.assertEqual(get_name(or1), "a") + self.assertEqual(names["a"], (0,)) + # -- search field sfield1 = or1.children[0] - self.assertFalse(get_name(sfield1)) - self.assertEqual(get_name(sfield1.expr), "0_0_0") - # --- and + self.assertEqual(get_name(sfield1), "c") + self.assertEqual(names["c"], (0, 0)) + self.assertEqual(get_name(sfield1.expr), None) + # -- and and2 = or1.children[1] - self.assertEqual(get_name(and2), "0_0_1") - # ----- proximity phrase - self.assertEqual(get_name(and2.children[0].term), "0_0_1_0") - # ----- search field word + self.assertEqual(get_name(and2), "d") + self.assertEqual(names["d"], (0, 1)) + # --- proximity phrase + self.assertEqual(get_name(and2.children[0]), "e") + self.assertEqual(names["e"], (0, 1, 0)) + self.assertEqual(get_name(and2.children[0].term), None) + # --- search field sfield2 = and2.children[1] - self.assertFalse(get_name(sfield2)) - self.assertEqual(get_name(sfield2.expr), "0_0_1_1") + self.assertEqual(get_name(sfield2), "f") + self.assertEqual(names["f"], (0, 1, 1)) + self.assertEqual(get_name(sfield2.expr), None) # - group group1 = and1.children[1] - self.assertEqual(get_name(group1), None) - # --- unknown op + self.assertEqual(get_name(group1), "b") + self.assertEqual(names["b"], (1,)) + # -- unknown op unknownop1 = group1.children[0] - self.assertEqual(get_name(unknownop1), "0_1") - # ----- fuzzy word - self.assertEqual(get_name(unknownop1.children[0].term), "0_1_0") - # ----- phrase - self.assertEqual(get_name(unknownop1.children[1]), "0_1_1") - - -class NameIndexTestCase(TestCase): - - def test_name_index_simple_term(self): - tree = Word("bar") - set_name(tree, "0") - self.assertEqual(name_index(tree), {"0": (0, len(str(tree)))}) - - phrase = Phrase('"baz"') - tree = Group(phrase) - set_name(phrase, "0") - self.assertEqual(name_index(tree), {"0": (1, len(str(phrase)))}) - set_name(phrase, "funny") - self.assertEqual(name_index(tree), {"funny": (1, len(str(phrase)))}) - - def test_name_index_nested(self): - # we use parsing for this way, it's more evident to see index is right - expr = 'bar:baz OR (foo~2 AND "foo bar") OR spam:(bazz dazz)' - - tree = parser.parse(expr) - self.assertEqual(str(tree), expr) # needs to be sure - - root_or = tree - bar_search_field = tree.children[0] - baz = bar_search_field.expr - and_op = tree.children[1].children[0] - foo = and_op.children[0].term - foo_bar = and_op.children[1] - spam_search_field = tree.children[2] - unknown_op = spam_search_field.expr.children[0] - bazz = unknown_op.children[0] - dazz = unknown_op.children[1] - set_name(root_or, "root_or") - set_name(baz, "baz") - set_name(and_op, "and_op") - set_name(foo, "foo") - set_name(foo_bar, "foo_bar") - set_name(unknown_op, "unknown_op") - set_name(bazz, "bazz") - set_name(dazz, "dazz") - - result = name_index(tree) - - def _extract(name): - return extract(expr, name, result) - - self.assertEqual(_extract("root_or"), expr) - self.assertEqual(_extract("baz"), "baz") - self.assertEqual(_extract("and_op"), 'foo~2 AND "foo bar"') - self.assertEqual(_extract("foo"), "foo") - self.assertEqual(_extract("foo_bar"), '"foo bar"') - self.assertEqual(_extract("unknown_op"), "bazz dazz") - self.assertEqual(_extract("bazz"), "bazz") - self.assertEqual(_extract("dazz"), "dazz") - - def test_name_index_nested2(self): - # an expression where outer node does not have a name - expr = '(objet:(bar OR (foo AND "foo bar")))' - - tree = parser.parse(expr) - self.assertEqual(str(tree), expr) # needs to be sure - - or_op = tree.expr.expr.expr # group, field, fieldgroup - bar = or_op.operands[0] - and_op = or_op.operands[1].expr - foo = and_op.operands[0] - foo_bar = and_op.operands[1] - set_name(or_op, "or_op") - set_name(bar, "bar") - set_name(and_op, "and_op") - set_name(foo, "foo") - set_name(foo_bar, "foo_bar") - - result = name_index(tree) - - def _extract(name): - return extract(expr, name, result) - - self.assertEqual(_extract("or_op"), 'bar OR (foo AND "foo bar")') - self.assertEqual(_extract("bar"), "bar") - self.assertEqual(_extract("and_op"), 'foo AND "foo bar"') - self.assertEqual(_extract("foo"), "foo") - self.assertEqual(_extract("foo_bar"), '"foo bar"') + self.assertEqual(get_name(unknownop1), None) + # --- fuzzy word + self.assertEqual(get_name(unknownop1.children[0]), "g") + self.assertEqual(names["g"], (1, 0, 0)) + self.assertEqual(get_name(unknownop1.children[0].term), None) + # --- phrase + self.assertEqual(get_name(unknownop1.children[1]), "h") + self.assertEqual(names["h"], (1, 0, 1)) + + +class UtilitiesTestCase(TestCase): + + def test_matching_from_name(self): + names = {"a": (0,), "b": (1,), "c": (0, 0), "d": (0, 1), "e": (1, 0, 1)} + self.assertEqual(matching_from_names([], names), set()) + self.assertEqual(matching_from_names(["a", "b"], names), {(0,), (1,)}) + self.assertEqual(matching_from_names(["a", "e"], names), {(0,), (1, 0, 1)}) + self.assertEqual(matching_from_names(["c"], names), {(0, 0)}) + with self.assertRaises(KeyError): + matching_from_names(["x"], names) + + def test_element_from_path(self): + tree = AndOperation( + OrOperation( + SearchField("bar", Word("test")), + Group( + AndOperation( + Proximity(Phrase('"test"'), 2), + SearchField("baz", Word("test")), + Fuzzy(Word("test")), + Phrase('"test"'), + ), + ), + ), + ) + names = {"a": (), "b": (0, 1), "c": (0, 1, 0, 2), "d": (0, 1, 0, 2, 0), "e": (0, 1, 0, 3)} + self.assertEqual(element_from_path(tree, ()), tree) + self.assertEqual(element_from_name(tree, "a", names), tree) + self.assertEqual(element_from_path(tree, (0, 1)), tree.children[0].children[1]) + self.assertEqual(element_from_name(tree, "b", names), tree.children[0].children[1]) + self.assertEqual(element_from_path(tree, (0, 1, 0, 2)), Fuzzy(Word("test"))) + self.assertEqual(element_from_name(tree, "c", names), Fuzzy(Word("test"))) + self.assertEqual(element_from_path(tree, (0, 1, 0, 2, 0)), Word("test")) + self.assertEqual(element_from_name(tree, "d", names), Word("test")) + self.assertEqual(element_from_path(tree, (0, 1, 0, 3)), Phrase('"test"')) + self.assertEqual(element_from_name(tree, "e", names), Phrase('"test"')) + with self.assertRaises(IndexError): + element_from_path(tree, (1,)) + + +class PropagateMatchingTestCase(TestCase): + + @classmethod + def setUpClass(cls): + cls.propagate_matching = MatchingPropagator() + + def test_or_operation(self): + tree = OrOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + + paths_ok, paths_ko = self.propagate_matching(tree, set()) + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {(), (0,), (1,), (2,), }) + + paths_ok, paths_ko = self.propagate_matching(tree, {(2, )}) + self.assertEqual(paths_ok, {(), (2, )}) + self.assertEqual(paths_ko, {(0,), (1,)}) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (2, )}) + self.assertEqual(paths_ok, {(), (0, ), (2,)}) + self.assertEqual(paths_ko, {(1,)}) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (1, ), (2, )}) + self.assertEqual(paths_ok, {(), (0,), (1,), (2, )}) + self.assertEqual(paths_ko, set()) + + def test_and_operation(self): + tree = AndOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + + paths_ok, paths_ko = self.propagate_matching(tree, set()) + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {(), (0,), (1,), (2,), }) + + paths_ok, paths_ko = self.propagate_matching(tree, {(2, )}) + self.assertEqual(paths_ok, {(2, )}) + self.assertEqual(paths_ko, {(), (0,), (1,)}) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (2, )}) + self.assertEqual(paths_ok, {(0, ), (2,)}) + self.assertEqual(paths_ko, {(), (1,)}) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (1, ), (2, )}) + self.assertEqual(paths_ok, {(), (0,), (1,), (2, )}) + self.assertEqual(paths_ko, set()) + + def test_unknown_operation(self): + tree = UnknownOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + + tree_or = OrOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + tree_and = AndOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + propagate_or = self.propagate_matching + propagate_and = MatchingPropagator(default_operation=AndOperation) + + for matching in [set(), {(2, )}, {(0, ), (2, )}, {(0, ), (1, ), (2, )}]: + self.assertEqual(propagate_or(tree, matching), self.propagate_matching(tree_or, matching)) + self.assertEqual(propagate_and(tree, matching), self.propagate_matching(tree_and, matching)) + + def test_negation(self): + for tree in [Prohibit(Word("foo")), Not(Word("foo"))]: + with self.subTest("%r" % type(tree)): + paths_ok, paths_ko = self.propagate_matching(tree, set()) + self.assertEqual(paths_ok, {(), (0,)}) + self.assertEqual(paths_ko, set()) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, )}) + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {(), (0,)}) + + def test_single_element(self): + for tree in [Word("a"), Phrase('"a"'), Regex("/a/")]: + with self.subTest("%r" % type(tree)): + paths_ok, paths_ko = self.propagate_matching(tree, set()) + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {()}) + + paths_ok, paths_ko = self.propagate_matching(tree, {()}) + self.assertEqual(paths_ok, {()}) + self.assertEqual(paths_ko, set()) + + def test_no_propagation(self): + for tree in [Range(Word("a"), Word("b")), Fuzzy(Word("foo")), Proximity('"bar baz"', 2)]: + with self.subTest("%r" % type(tree)): + paths_ok, paths_ko = self.propagate_matching(tree, set()) + + # no down propagation + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {()}) + + paths_ok, paths_ko = self.propagate_matching(tree, {()}) + self.assertEqual(paths_ok, {()}) + self.assertEqual(paths_ko, set()) + + def test_simple_propagation(self): + # propagation in nodes with only one children + tree = Boost(Group(SearchField("foo", FieldGroup(Plus(Word("bar"))))), force=2) + + paths_ok, paths_ko = self.propagate_matching(tree, set()) + self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ko, {(), (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0)}) + + paths_ok, paths_ko = self.propagate_matching(tree, {(0, 0, 0, 0, 0)}) + self.assertEqual(paths_ok, {(), (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0)}) + self.assertEqual(paths_ko, set()) + + def test_combination(self): + tree = AndOperation( + OrOperation( + SearchField( + "mine", + FieldGroup( + Plus( + AndOperation( + Word("foo"), + Regex("/fizz/"), + ), + ), + ), + ), + Boost( + Group( + AndOperation( + Phrase('"ham"'), + Word("spam"), + Prohibit(Fuzzy(Word("fuzz"))), + ), + ), + force=2, + ), + ), + Not( + OrOperation( + Word('"bar"'), + Word('"baz"'), + ), + ), + ) + to_path = simple_naming(tree) + + paths_ok, paths_ko = self.propagate_matching(tree, set()) + # prohibit and not make non matching parts to match + self.assertEqual( + paths_to_names(tree, paths_ok), + {"prohibit", "fuzzy", "not", "or2", "bar", "baz"}, + ) + self.assertEqual( + paths_to_names(tree, paths_ko), + { + "and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", + "boost", "group", "and3", "ham", "spam", + }, + ) + + # adding matching just enough positive expressions, so that complete expression matches + paths_ok, paths_ko = self.propagate_matching( + tree, {to_path["foo"], to_path["fizz"], to_path["ham"]}, + ) + self.assertEqual( + paths_to_names(tree, paths_ok), + {"and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", "ham", + "prohibit", "fuzzy", "not", "or2", "bar", "baz", } + ) + self.assertEqual(paths_to_names(tree, paths_ko), {"boost", "group", "and3", "spam", }) + + # making everything match + paths_ok, paths_ko = self.propagate_matching( + tree, {to_path["foo"], to_path["fizz"], to_path["ham"], to_path["spam"]}, + ) + self.assertEqual( + paths_to_names(tree, paths_ok), + {"and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", "ham", + "prohibit", "fuzzy", "boost", "group", "and3", "spam", "not", "or2", "bar", "baz", } + ) + self.assertEqual(paths_to_names(tree, paths_ko), set()) + + # making everything match, but some negative expression + paths_ok, paths_ko = self.propagate_matching( + tree, + { + to_path["foo"], to_path["fizz"], to_path["ham"], to_path["spam"], + to_path["fuzzy"], to_path["bar"], + }, + ) + self.assertEqual( + paths_to_names(tree, paths_ok), + { + "or", "searchfield", "fieldgroup", "plus", "and2", + "foo", "fizz", "ham", "spam", "baz", + }, + ) + self.assertEqual( + paths_to_names(tree, paths_ko), + { + "and", "boost", "group", "and3", "prohibit", "fuzzy", + "boost", "group", "and3", "not", "or2", "bar" + }, + ) + + +class HTMLMarkerTestCase(TestCase): + + mark_html = HTMLMarker() + + def test_single_element(self): + ltree = parser.parse('"b"') + out = self.mark_html(ltree, {()}, set()) + self.assertEqual(out, '"b"') + out = self.mark_html(ltree, {()}, set(), parcimonious=False) + self.assertEqual(out, '"b"') + out = self.mark_html(ltree, set(), {()}) + self.assertEqual(out, '"b"') + out = self.mark_html(ltree, set(), {()}, parcimonious=False) + self.assertEqual(out, '"b"') + + def test_multiple_elements(self): + ltree = parser.parse('(foo OR bar~2 OR baz^2) AND NOT spam') + + names = simple_naming(ltree) + foo, bar, baz, spam = names["foo"], names["fuzzy"], names["boost"], names["spam"] + or_, and_, not_ = names["or"], names["and"], names["not"] + + out = self.mark_html(ltree, {foo, bar, baz, or_, and_, not_}, {spam}) + self.assertEqual( + out, + '(foo OR bar~2 OR baz^2) AND NOT spam', + ) + out = self.mark_html(ltree, {foo, bar, baz, or_, and_, not_}, {spam}, parcimonious=False) + self.assertEqual( + out, + '' + '(foo OR bar~2 OR' + ' baz^2) ' + 'AND' + ' NOT spam' + '', + ) + + out = self.mark_html(ltree, {not_}, {foo, bar, baz, or_, and_, spam}) + self.assertEqual( + out, + '(foo OR bar~2 OR baz^2) AND' + ' NOT spam', + ) + + # changing class name and element name + mark = HTMLMarker(ok_class="success", ko_class="failure", element="li") + out = mark(ltree, {not_}, {foo, bar, baz, or_, and_, spam}) + self.assertEqual( + out, + '
  • (foo OR bar~2 OR baz^2) AND' + '
  • NOT
  • spam
  • ', + ) + + def test_expression_marker(self): + # only for coverage ! + ltree = parser.parse("foo AND bar") + mark = ExpressionMarker() + out = mark(ltree, {(), (0,), (1,)}, {}) + self.assertEqual(out, ltree) diff --git a/tests/test_visitor.py b/tests/test_visitor.py index b7523b4..d2a855d 100644 --- a/tests/test_visitor.py +++ b/tests/test_visitor.py @@ -4,8 +4,11 @@ from luqum.tree import ( NONE_ITEM, Group, Word, Phrase, AndOperation, OrOperation, Proximity, SearchField, + Boost, Fuzzy, Regex, +) +from luqum.visitor import ( + PathTrackingTransformer, PathTrackingVisitor, TreeTransformer, TreeVisitor, ) -from luqum.visitor import TreeTransformer, TreeVisitor class TreeVisitorTestCase(TestCase): @@ -217,3 +220,75 @@ def test_value_error_pass_through(self): with self.assertRaises(ValueError) as raised: self.RaisingTreeTransformer2().visit(tree) self.assertEqual("Random error", str(raised.exception)) + + +class PathTrackingVisitorTestCase(TestCase): + + class TermPathVisitor(PathTrackingVisitor): + + def visit_term(self, node, context): + yield (context["path"], node.value) + + @classmethod + def setUpClass(cls): + cls.visit = cls.TermPathVisitor().visit + + def test_visit_simple_term(self): + paths = self.visit(Word("foo")) + self.assertEqual(paths, [((), "foo")]) + + def test_visit_complex(self): + tree = AndOperation( + Group(OrOperation(Word("foo"), Word("bar"), Boost(Fuzzy(Word("baz")), force=2))), + Proximity(Phrase('"spam ham"')), + SearchField("fizz", Regex("/fuzz/")), + ) + paths = self.visit(tree) + self.assertEqual( + sorted(paths), + [ + ((0, 0, 0), "foo"), + ((0, 0, 1), "bar"), + ((0, 0, 2, 0, 0), "baz"), + ((1, 0), '"spam ham"'), + ((2, 0), '/fuzz/'), + ] + ) + + +class PathTrackingTransformerTestCase(TestCase): + + class TermPathTransformer(PathTrackingTransformer): + + def visit_term(self, node, context): + path = '-'.join(str(i) for i in context['path']) + quote = '"' if isinstance(node, Phrase) else "/" if isinstance(node, Regex) else "" + value = node.value.strip(quote) + new_node = node.clone_item(value=f"{quote}{value}@{path}{quote}") + yield new_node + + @classmethod + def setUpClass(cls): + cls.transform = cls.TermPathTransformer().visit + + def test_visit_simple_term(self): + tree = self.transform(Word("foo")) + self.assertEqual(tree, Word("foo@")) + + def test_visit_complex(self): + tree = AndOperation( + Group(OrOperation(Word("foo"), Word("bar"), Boost(Fuzzy(Word("baz")), force=2))), + Proximity(Phrase('"spam ham"')), + SearchField("fizz", Regex("/fuzz/")), + ) + transformed = self.transform(tree) + expected = AndOperation( + Group(OrOperation( + Word("foo@0-0-0"), + Word("bar@0-0-1"), + Boost(Fuzzy(Word("baz@0-0-2-0-0")), force=2), + )), + Proximity(Phrase('"spam ham@1-0"')), + SearchField("fizz", Regex("/fuzz@2-0/")), + ) + self.assertEqual(transformed, expected) From 6963fe555184c52a07078698fbd34995be6038b9 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 27 Oct 2020 18:22:26 +0100 Subject: [PATCH 3/7] some more tests --- docs/source/quick_start.rst | 2 +- luqum/visitor.py | 2 +- tests/test_elasticsearch/test_es_naming.py | 6 ++++ tests/test_elasticsearch/test_naming.py | 32 ++++++++++++++++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst index dcb2375..a791489 100644 --- a/docs/source/quick_start.rst +++ b/docs/source/quick_start.rst @@ -360,7 +360,7 @@ We can use :py:func:`auto_name` to automatically add names:: names contains a dict association names to path in the luqum tree. For example the first name "a" is associated with element "foo", -and we can retrieve it easly thanks to small utils for navigating the tree:: +and we can retrieve it easily thanks to small utils for navigating the tree:: >>> from luqum.naming import element_from_path, element_from_name >>> element_from_name(tree, "a", names) diff --git a/luqum/visitor.py b/luqum/visitor.py index 7f66440..8d5fa92 100644 --- a/luqum/visitor.py +++ b/luqum/visitor.py @@ -195,7 +195,7 @@ def clone_children(self, node, new_node, context): class PathTrackingMixin: - """It can be useful to compute path of an elements (as tuple of index in parent children) + """It can be useful to compute path of an element (as tuple of index in parent children) This mixin provides base components """ diff --git a/tests/test_elasticsearch/test_es_naming.py b/tests/test_elasticsearch/test_es_naming.py index d9611c0..21e04b1 100644 --- a/tests/test_elasticsearch/test_es_naming.py +++ b/tests/test_elasticsearch/test_es_naming.py @@ -188,6 +188,12 @@ def test_group(self): def test_unknown_operation(self): self._simple_test('(title:Phoenix ref:HP5)', "HP5", num_match=2) + def test_nested(self): + self._simple_test('(illustrators.name:Greenfield)', "HP4") + + def test_object(self): + self._simple_test('(author.name:Rowling AND ref:HP4)', "HP4", num_match=2) + @classmethod def tearDownClass(cls): remove_book_index(cls.es_client) diff --git a/tests/test_elasticsearch/test_naming.py b/tests/test_elasticsearch/test_naming.py index a78c218..2a73eff 100644 --- a/tests/test_elasticsearch/test_naming.py +++ b/tests/test_elasticsearch/test_naming.py @@ -169,6 +169,38 @@ def test_named_queries_range(self): result = self.transformer(tree) self.assertEqual(result, {'range': {'text': {'_name': 'a', 'gte': 'x', 'lte': 'z'}}}) + def test_named_queries_nested(self): + tree = SearchField("author.name", Word("Monthy")) + set_name(tree, "a") + result = self.transformer(tree) + # name is repeated on query, but it's not a big deal… + self.assertEqual( + result, + { + 'nested': { + '_name': 'a', + 'path': 'author', + 'query': {'match': {'author.name': { + '_name': 'a', 'query': 'Monthy', 'zero_terms_query':'none', + }}}, + }, + } + ) + + def test_named_queries_object(self): + tree = SearchField("book.title", Word("Circus")) + set_name(tree, "a") + result = self.transformer(tree) + # name is repeated on query, but it's not a big deal… + self.assertEqual( + result, + { + 'match': {'book.title': { + '_name': 'a', 'query': 'Circus', 'zero_terms_query': 'none' + }} + } + ) + def test_named_queries_group(self): tree = SearchField("text", FieldGroup(Word("bar"))) set_name(tree.children[0], "a") From 3fd5c5d567bb4a79ea6e4aabea372ecb45469454 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Thu, 29 Oct 2020 11:45:26 +0100 Subject: [PATCH 4/7] fix naming on exists query --- luqum/elasticsearch/tree.py | 6 +++++- tests/test_elasticsearch/test_es_naming.py | 15 +++++++++++---- tests/test_elasticsearch/test_naming.py | 8 +++++++- tests/test_naming.py | 10 ++++++++-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/luqum/elasticsearch/tree.py b/luqum/elasticsearch/tree.py index b5064aa..7926308 100644 --- a/luqum/elasticsearch/tree.py +++ b/luqum/elasticsearch/tree.py @@ -126,7 +126,11 @@ def __init__(self, q, *args, **kwargs): def json(self): # field:* is transformed to exists query if self.q == '*': - return {"exists": {"field": self.field}} + query = {"exists": {"field": self.field}} + name = getattr(self, "_name", None) + if name is not None: + query["exists"]["_name"] = name + return query return super().json diff --git a/tests/test_elasticsearch/test_es_naming.py b/tests/test_elasticsearch/test_es_naming.py index 21e04b1..66e7b5e 100644 --- a/tests/test_elasticsearch/test_es_naming.py +++ b/tests/test_elasticsearch/test_es_naming.py @@ -73,7 +73,9 @@ def test_or_operation(self): 'n_pages:360 OR edition:Lumos', ) # matching None - book, = list(self.search.filter(Q(query) | Q("match_all")).filter(Q("term", ref="HP7")).execute()) + book, = list( + self.search.filter(Q(query) | Q("match_all")).filter(Q("term", ref="HP7")).execute() + ) self.assertFalse(hasattr(book.meta, "matched_queries")) def test_and_operation_matching(self): @@ -124,7 +126,8 @@ def _negation_test(self, operator): f'{operator}n_pages:360 AND edition:Lumos', ) # matching n_pages and not lumos - book, = list(self.search.filter(Q(query) | Q("term", ref="HP8")).exclude(Q("term", ref="BB1")).execute()) + search = self.search.filter(Q(query) | Q("term", ref="HP8")).exclude(Q("term", ref="BB1")) + book, = list(search.execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( ltree, matching_from_names(book.meta.matched_queries, names), @@ -135,7 +138,8 @@ def _negation_test(self, operator): f'AND edition:Lumos', ) # matching none - book, = list(self.search.filter(Q(query) | Q("term", ref="HP7")).exclude(Q("term", ref="BB1")).execute()) + search = self.search.filter(Q(query) | Q("term", ref="HP7")).exclude(Q("term", ref="BB1")) + book, = list(search.execute()) self.assertFalse(hasattr(book.meta, "matched_queries")) def test_not(self): @@ -143,7 +147,7 @@ def test_not(self): def test_minus(self): self._negation_test("-") - + def _simple_test(self, matching_query, ref, num_match=1): """simple scenario @@ -176,6 +180,9 @@ def test_boost(self): def test_plus(self): self._simple_test('+title:"Phoenix"', "HP5") + def test_exists(self): + self._simple_test('(ref:* AND title:Phoenix)', "HP5", num_match=2) + def test_range(self): self._simple_test('publication_date:[2000-01-01 TO 2001-01-01]', "HP4") diff --git a/tests/test_elasticsearch/test_naming.py b/tests/test_elasticsearch/test_naming.py index 2a73eff..afaf1c9 100644 --- a/tests/test_elasticsearch/test_naming.py +++ b/tests/test_elasticsearch/test_naming.py @@ -181,7 +181,7 @@ def test_named_queries_nested(self): '_name': 'a', 'path': 'author', 'query': {'match': {'author.name': { - '_name': 'a', 'query': 'Monthy', 'zero_terms_query':'none', + '_name': 'a', 'query': 'Monthy', 'zero_terms_query': 'none', }}}, }, } @@ -212,6 +212,12 @@ def test_named_queries_group(self): result = self.transformer(tree) self.assertEqual(result, {"term": {"text": {"value": "bar", "_name": "a"}}},) + def test_named_queries_exists(self): + tree = SearchField("text", Word("*")) + set_name(tree.children[0], "a") + result = self.transformer(tree) + self.assertEqual(result, {"exists": {"field": "text", "_name": "a"}},) + def test_named_queries_complex(self): tree = ( AndOperation( diff --git a/tests/test_naming.py b/tests/test_naming.py index b80466c..840c3fb 100644 --- a/tests/test_naming.py +++ b/tests/test_naming.py @@ -239,8 +239,14 @@ def test_unknown_operation(self): propagate_and = MatchingPropagator(default_operation=AndOperation) for matching in [set(), {(2, )}, {(0, ), (2, )}, {(0, ), (1, ), (2, )}]: - self.assertEqual(propagate_or(tree, matching), self.propagate_matching(tree_or, matching)) - self.assertEqual(propagate_and(tree, matching), self.propagate_matching(tree_and, matching)) + self.assertEqual( + propagate_or(tree, matching), + self.propagate_matching(tree_or, matching), + ) + self.assertEqual( + propagate_and(tree, matching), + self.propagate_matching(tree_and, matching), + ) def test_negation(self): for tree in [Prohibit(Word("foo")), Not(Word("foo"))]: From 3e5e2eb3da7ddbbdca5f7b2b6ca19154da2b894d Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Sun, 1 Nov 2020 23:02:36 +0100 Subject: [PATCH 5/7] fix nested handling in naming and match propagation --- docs/source/quick_start.rst | 4 +- luqum/elasticsearch/nested.py | 95 +++++++++++++ luqum/naming.py | 82 ++++++----- tests/test_elasticsearch/test_es_naming.py | 156 +++++++++++++++++++-- tests/test_elasticsearch/test_nested.py | 138 ++++++++++++++++++ tests/test_naming.py | 130 ++++++++++++----- 6 files changed, 518 insertions(+), 87 deletions(-) create mode 100644 luqum/elasticsearch/nested.py create mode 100644 tests/test_elasticsearch/test_nested.py diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst index a791489..86831b5 100644 --- a/docs/source/quick_start.rst +++ b/docs/source/quick_start.rst @@ -396,14 +396,14 @@ first identifying every matching element using :py:class:`MatchingPropagator`:: >>> from luqum.naming import MatchingPropagator, matching_from_names >>> propagate_matching = MatchingPropagator() - >>> paths_ok, paths_ko = propagate_matching(tree, matching_from_names(matched_queries, names)) + >>> paths_ok, paths_ko = propagate_matching(tree, *matching_from_names(matched_queries, names)) And then using :py:class:`HTMLMarker` to display it in html (you could make your own also):: >>> from luqum.naming import HTMLMarker >>> mark_html = HTMLMarker() # you can customize some parameters, refer to doc >>> mark_html(tree, paths_ok, paths_ko) - 'foo~2 OR (bar AND baz)' + 'foo~2 OR (bar AND baz)' __ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-queries-and-filters diff --git a/luqum/elasticsearch/nested.py b/luqum/elasticsearch/nested.py new file mode 100644 index 0000000..0f047c9 --- /dev/null +++ b/luqum/elasticsearch/nested.py @@ -0,0 +1,95 @@ +"""If you have a query with a nested query containing operations, +when using named queries, Elasticsearch won't report inner matching. + +This is a problem if you extensively use it. +""" + + +def get_first_name(query): + if isinstance(query, dict): + if "_name" in query: + return query["_name"] + elif "bool" in query: + # do not go down bool + return None + else: + children = query.values() + elif isinstance(query, list): + children = query + else: + return None + iter_candidates = (get_first_name(child) for child in children) + candidates = [candidate for candidate in iter_candidates if candidate is not None] + return candidates[0] if candidates else None + + +def extract_nested_queries(query, query_nester=None): + """given a query, + extract all queries that are under a nested query and boolean operations, + returning an atomic nested version of them. + Those nested queries, also take care of changing the name to the nearest inner name, + + This is useful for Elasticsearch won't go down explaining why a nested query is matching. + + :param dict query: elasticsearch query to analyze + :param callable query_nester: this is the function called to nest sub queries, leave it default + :return list: queries that you should run to get all matching + + .. note:: because we re-nest part of bool queries, results might not be accurate + for:: + {"bool": "must" : [ + {"nested": {"path": "a", "match": {"x": "y"}}}, + {"nested": {"path": "a", "match": {"x": "z"}}} + ]} + is not the same as:: + {"nested": {"path": "a", "bool": "must": [{"match": {"x": "y"}}, {"match": {"x": "z"}}]}} + + if x is multivalued. + The first would match `{"a": [{"x": "y"}, {"x": "z"}]}` + While the second would only match if `x` contains `"y z"` or `"z y"` + """ + queries = [] # this contains our result + in_nested = query_nester is not None + sub_query_nester = query_nester + if isinstance(query, dict): + if "nested" in query: + params = {k: v for k, v in query["nested"].items() if k not in ("query", "name")} + + def sub_query_nester(req, name): + nested = {"nested": {"query": req, **params}} + if query_nester is not None: + nested = query_nester(nested, name) + if name is not None: + nested["nested"]["_name"] = name + return nested + + bool_param = {"must", "should", "must_not"} & set(query.keys()) + if bool_param and in_nested: + # we are in a list of operations in a bool inside a nested, + # make a query with nested on sub arguments + op, = bool_param # must or should or must_not + # normalize to a list + sub_queries = query[op] if isinstance(query[op], list) else [query[op]] + # add nesting + nested_sub_queries = [ + query_nester(sub_query, get_first_name(sub_query)) for sub_query in sub_queries + ] + # those are queries we want to return + queries.extend(nested_sub_queries) + # continue processing in each sub query + # (before nesting, nesting is contained in query_nester) + children = sub_queries + else: + children = query.values() + elif isinstance(query, list): + children = query + else: + # leaf: final recursivity + children = [] + + # recurse + for child_query in children: + queries.extend( + extract_nested_queries(child_query, query_nester=sub_query_nester) + ) + return queries diff --git a/luqum/naming.py b/luqum/naming.py index 9e4de7e..f38c71d 100644 --- a/luqum/naming.py +++ b/luqum/naming.py @@ -98,9 +98,10 @@ def matching_from_names(names, name_to_path): :param list names: list of names :param dict name_to_path: association of names with path to children - :return set: corresponding list of matching path + :return tuple: (set of matching paths, set of other known paths) """ - return {name_to_path[name] for name in names} + matching = {name_to_path[name] for name in names} + return (matching, set(name_to_path.values()) - matching) def element_from_path(tree, path): @@ -143,7 +144,19 @@ def __init__(self, default_operation=tree.OrOperation): if default_operation is tree.OrOperation: self.OR_NODES = self.OR_NODES + (tree.UnknownOperation,) - def _propagate(self, node, matching, path): + def _status_from_parent(self, path, matching, other): + """Get status from nearest parent in hierarchie which had a name + """ + if path in matching: + return True + elif path in other: + return False + elif not path: + return False + else: + return self._status_from_parent(path[:-1], matching, other) + + def _propagate(self, node, matching, other, path): """recursively propagate matching return tuple: ( @@ -151,37 +164,37 @@ def _propagate(self, node, matching, path): set of pathes of matching sub nodes, set of pathes of non matching sub nodes) """ - if path not in matching: - if node.children and not isinstance(node, self.NO_CHILDREN_PROPAGATE): - paths_ok = set() # path of nodes that are matching - paths_ko = set() # path of nodes that are not matching - children_status = [] # bool for each children, indicating if it matches or not - # children first, for our result may depend on them - for i, child in enumerate(node.children): - child_ok, sub_ok, sub_ko = self._propagate(child, matching, path + (i,)) - paths_ok.update(sub_ok) - paths_ko.update(sub_ko) - children_status.append(child_ok) - # compute parent success from children - operator = any if isinstance(node, self.OR_NODES) else all - node_ok = operator(children_status) - # eventually negate result - if isinstance(node, self.NEGATION_NODES): - node_ok = not node_ok - paths_ok, paths_ko = paths_ko, paths_ok - # add path nod to the right set - target_set = paths_ok if node_ok else paths_ko - target_set.add(path) - # return result - return node_ok, paths_ok, paths_ko - else: - # non matching final node - return False, set(), {path} + paths_ok = set() # path of nodes that are matching + paths_ko = set() # path of nodes that are not matching + children_status = [] # bool for each children, indicating if it matches or not + # recurse children + if node.children and not isinstance(node, self.NO_CHILDREN_PROPAGATE): + for i, child in enumerate(node.children): + child_ok, sub_ok, sub_ko = self._propagate( + child, matching, other, path + (i,), + ) + paths_ok.update(sub_ok) + paths_ko.update(sub_ko) + children_status.append(child_ok) + # resolve node status + if path in matching: + node_ok = True + elif children_status: # compute from children + # compute parent success from children + operator = any if isinstance(node, self.OR_NODES) else all + node_ok = operator(children_status) else: - # single node matching - return True, {path}, set() - - def __call__(self, tree, matching=None): + node_ok = self._status_from_parent(path, matching, other) + if isinstance(node, self.NEGATION_NODES): + # negate result + node_ok = not node_ok + # add node to the right set + target_set = paths_ok if node_ok else paths_ko + target_set.add(path) + # return result + return node_ok, paths_ok, paths_ko + + def __call__(self, tree, matching, other=frozenset()): """ Given a list of paths that are known to match, return all pathes in the tree that are matches. @@ -192,12 +205,13 @@ def __call__(self, tree, matching=None): Descending would mean risking to give non consistent information. :param list matching: list of path of matching nodes (each path is a tuple) + :param list other: list of other path that had a name, but were not reported as matching :return tuple: ( set of matching path after propagation, set of non matching pathes after propagation) """ - tree_ok, paths_ok, paths_ko = self._propagate(tree, matching, ()) + tree_ok, paths_ok, paths_ko = self._propagate(tree, matching, other, ()) return paths_ok, paths_ko diff --git a/tests/test_elasticsearch/test_es_naming.py b/tests/test_elasticsearch/test_es_naming.py index 66e7b5e..484202d 100644 --- a/tests/test_elasticsearch/test_es_naming.py +++ b/tests/test_elasticsearch/test_es_naming.py @@ -2,6 +2,7 @@ from elasticsearch_dsl import Q +from luqum.elasticsearch.nested import extract_nested_queries from luqum.naming import ( auto_name, element_from_name, HTMLMarker, matching_from_names, MatchingPropagator, ) @@ -41,7 +42,7 @@ def test_keyword_naming(self): ltree, ) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), @@ -56,7 +57,7 @@ def test_or_operation(self): book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), @@ -66,7 +67,7 @@ def test_or_operation(self): book, = list(self.search.filter(query).filter("term", ref="HP8").execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), @@ -86,7 +87,7 @@ def test_and_operation_matching(self): book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) self.assertEqual(len(book.meta.matched_queries), 2) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), @@ -101,7 +102,7 @@ def test_and_operation_not_matching(self): book, = list(self.search.filter(Q(query) | Q("term", ref="BB1")).execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), @@ -110,44 +111,90 @@ def test_and_operation_not_matching(self): # matching None book, = list(self.search.filter(Q(query) | Q("term", ref="HP7")).execute()) self.assertFalse(hasattr(book.meta, "matched_queries")) + paths_ok, paths_ko = self.propagate_matching(ltree, *matching_from_names([], names)) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'n_pages:360 AND edition:Lumos', + ) def _negation_test(self, operator): - ltree = parser.parse(f"{operator}n_pages:360 AND edition:Lumos") + ltree = parser.parse(f"{operator} n_pages:360 AND edition:Lumos") names = auto_name(ltree) query = self.es_builder(ltree) # matching Lumos book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), - f'{operator}n_pages:360 AND edition:Lumos', + f'{operator} n_pages:360 AND' + ' edition:Lumos', ) # matching n_pages and not lumos - search = self.search.filter(Q(query) | Q("term", ref="HP8")).exclude(Q("term", ref="BB1")) + search = self.search.filter(Q(query) | Q("term", ref="HP8")).filter(Q("term", ref="HP8")) book, = list(search.execute()) self.assertEqual(len(book.meta.matched_queries), 1) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names), ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), - f'{operator}n_pages:360 ' + f'{operator} n_pages:360 ' f'AND edition:Lumos', ) # matching none - search = self.search.filter(Q(query) | Q("term", ref="HP7")).exclude(Q("term", ref="BB1")) + search = self.search.filter(Q(query) | Q("term", ref="HP7")).filter(Q("term", ref="HP7")) book, = list(search.execute()) self.assertFalse(hasattr(book.meta, "matched_queries")) + paths_ok, paths_ko = self.propagate_matching( + ltree, *matching_from_names([], names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + f'{operator}' + ' n_pages:360 ' + 'AND edition:Lumos', + ) def test_not(self): - self._negation_test("NOT ") + self._negation_test("NOT") def test_minus(self): self._negation_test("-") + def test_double_negation(self): + ltree = parser.parse("NOT (n_pages:360 AND - edition:Lumos) AND ref:*") + names = auto_name(ltree) + query = self.es_builder(ltree) + # matching Lumos double negation + book, = list(self.search.filter(query).filter("term", ref="BB1").execute()) + self.assertEqual(len(book.meta.matched_queries), 2) + paths_ok, paths_ko = self.propagate_matching( + ltree, *matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'NOT' + ' (n_pages:360 AND - edition:Lumos) ' + 'AND ref:*', + ) + # not matching Lumos double negation + book, = list( + self.search.filter(Q(query) | Q("term", ref="HP8")).filter("term", ref="HP8").execute() + ) + self.assertEqual(len(book.meta.matched_queries), 2) + paths_ok, paths_ko = self.propagate_matching( + ltree, *matching_from_names(book.meta.matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + 'NOT' + ' (n_pages:360 AND - edition:Lumos) ' + 'AND ref:*', + ) + def _simple_test(self, matching_query, ref, num_match=1): """simple scenario @@ -161,13 +208,40 @@ def _simple_test(self, matching_query, ref, num_match=1): self.assertEqual(book.ref, ref) self.assertEqual(len(book.meta.matched_queries), num_match) paths_ok, paths_ko = self.propagate_matching( - ltree, matching_from_names(book.meta.matched_queries, names), + ltree, *matching_from_names(book.meta.matched_queries, names) ) self.assertEqual( self.make_html(ltree, paths_ok, paths_ko), f'{matching_query} OR n_pages:1000', ) + def _nested_test(self, query, html, ref, num_match=1): + """scenario taking into account nested + + :param str matching_query: the query that match the book + :param str ref: ref of expected matching book + """ + ltree = parser.parse(query) + names = auto_name(ltree) + query = self.es_builder(ltree) + queries = [query] + extract_nested_queries(query) + matched_queries = [] + # we have to force book matching by adding condition + for sub_query in queries: + search = self.search.filter(Q(sub_query) | Q("term", ref=ref)).filter("term", ref=ref) + book, = list(search.execute()) + self.assertEqual(book.ref, ref) + matched_queries.extend(getattr(book.meta, "matched_queries", [])) + self.assertEqual(len(matched_queries), num_match) + paths_ok, paths_ko = self.propagate_matching( + ltree, *matching_from_names(matched_queries, names), + ) + self.assertEqual( + self.make_html(ltree, paths_ok, paths_ko), + html, + ) + return matched_queries, html + def test_fuzzy(self): self._simple_test("ref:BB~1", "BB1") @@ -201,6 +275,60 @@ def test_nested(self): def test_object(self): self._simple_test('(author.name:Rowling AND ref:HP4)', "HP4", num_match=2) + def test_nested_operation(self): + self._nested_test( + '(illustrators:(name:Greenfield AND nationality:UK))', + '(illustrators:(name:Greenfield AND nationality:UK))', + "HP4", + num_match=2, + ) + + def test_nested_operation_not_all_matching(self): + self._nested_test( + '(illustrators:(name:Greenfield OR nationality:FR) AND n_pages:636)', + '(illustrators:(name:Greenfield OR' + ' nationality:FR) AND n_pages:636)', + "HP4", + num_match=3, + ) + + def test_nested_operation_with_not(self): + self._nested_test( + '(NOT illustrators:(-name:Greenfield AND nationality:FR))', + '(NOT illustrators:' + '(-name:Greenfield AND nationality:FR)' + ')', + "HP4", + num_match=1, + ) + + def test_nested_operation_not_perfect(self): + # this test, test a known limitation, that is it test the bug is there + # this is because ES do not propagate name matching in subfield + self._nested_test( + '(illustrators:(name:Greenfield AND nationality:US) AND n_pages:1000)', + # see contradiction beetween the fact that it should not match, + # yet our analyzer sees inner illustrators query as matching + '' + '(illustrators:(name:Greenfield AND nationality:US) ' + 'AND n_pages:1000)' + '', + "HP4", + num_match=2, + ) + # but global result is weel kept + self._nested_test( + '(illustrators:(name:Greenfield AND nationality:US) AND n_pages:636)', + # see contradiction beetween the fact that it should not match, + # yet our analyzer sees inner illustrators query as matching + '' + '(illustrators:(name:Greenfield AND nationality:US) ' + 'AND n_pages:636)' + '', + "HP4", + num_match=3, + ) + @classmethod def tearDownClass(cls): remove_book_index(cls.es_client) diff --git a/tests/test_elasticsearch/test_nested.py b/tests/test_elasticsearch/test_nested.py new file mode 100644 index 0000000..47a9f6f --- /dev/null +++ b/tests/test_elasticsearch/test_nested.py @@ -0,0 +1,138 @@ +from unittest import TestCase + +from luqum.elasticsearch.nested import extract_nested_queries + + +class NestedQueriesTestCase(TestCase): + + def test_no_nested(self): + queries = extract_nested_queries({"term": {"text": {"value": "spam", "_name": "spam"}}}) + self.assertEqual(queries, []) + + queries = extract_nested_queries( + {"bool": {"must": [ + {"term": {"text": {"value": "spam", "_name": "spam"}}}, + {"term": {"text": {"value": "ham", "_name": "ham"}}}, + ]}} + ) + self.assertEqual(queries, []) + + def test_nested_no_bool_inside(self): + queries = extract_nested_queries( + {"nested": { + "path": "my", + "query": {"term": {"text": {"value": "spam", "_name": "spam"}}} + }} + ) + self.assertEqual(queries, []) + + def test_nested_bool_inside(self): + term1 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + term2 = {"term": {"text": {"value": "ham", "_name": "ham"}}} + bool_query = {"bool": {"must": [term1, term2]}} + queries = extract_nested_queries({"nested": {"path": "my", "query": bool_query}}) + self.assertEqual( + queries, + [ + {"nested": {"path": "my", "query": term1, "_name": "spam"}}, + {"nested": {"path": "my", "query": term2, "_name": "ham"}}, + ], + ) + + def test_nested_in_bool_with_bool_inside(self): + term1 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + term2 = {"term": {"text": {"value": "ham", "_name": "ham"}}} + term3 = {"term": {"text": {"value": "foo", "_name": "foo"}}} + bool_query = {"bool": {"must": [term1, term2]}} + queries = extract_nested_queries( + {"bool": {"should": [term3, {"nested": {"path": "my", "query": bool_query}}]}} + ) + self.assertEqual( + queries, + [ + {"nested": {"path": "my", "query": term1, "_name": "spam"}}, + {"nested": {"path": "my", "query": term2, "_name": "ham"}}, + ], + ) + + def test_nested_bool_inside_bool(self): + term1 = {"term": {"text": {"value": "bar", "_name": "bar"}}} + term2 = {"term": {"text": {"value": "baz", "_name": "baz"}}} + term3 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + bool_query1 = {"bool": {"should": [term1, term2]}} + bool_query2 = {"bool": {"must": [term3, bool_query1]}} + queries = extract_nested_queries({"nested": {"path": "my", "query": bool_query2}}) + self.assertEqual(queries, [ + {"nested": {"path": "my", "query": term3, "_name": "spam"}}, + {"nested": {"path": "my", "query": bool_query1}}, + {"nested": {"path": "my", "query": term1, "_name": "bar"}}, + {"nested": {"path": "my", "query": term2, "_name": "baz"}}, + ]) + + def test_nested_inside_nested(self): + term1 = {"term": {"text": {"value": "bar", "_name": "bar"}}} + term2 = {"term": {"text": {"value": "baz", "_name": "baz"}}} + term3 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + bool_query1 = {"bool": {"should": [term1, term2]}} + inner_nested = {"nested": {"path": "my.your", "query": bool_query1}} + bool_query2 = {"bool": {"must": [term3, inner_nested]}} + queries = extract_nested_queries({"nested": {"path": "my", "query": bool_query2}}) + self.assertEqual(queries, [ + {"nested": {"path": "my", "query": term3, "_name": "spam"}}, + {"nested": {"path": "my", "query": inner_nested}}, + {"nested": {"path": "my", "_name": "bar", "query": {"nested": { + "path": "my.your", "query": term1, + }}}}, + {"nested": {"path": "my", "_name": "baz", "query": {"nested": { + "path": "my.your", "query": term2, + }}}}, + ]) + + def test_nested_inside_nested_with_nested_bool(self): + term1 = {"term": {"text": {"value": "bar", "_name": "bar"}}} + term2 = {"term": {"text": {"value": "foo", "_name": "foo"}}} + term3 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + bool_query1 = {"bool": {"must_not": [term1]}} + bool_query2 = {"bool": {"should": [term2, bool_query1]}} + inner_nested = {"nested": {"path": "my.your", "query": bool_query2}} + bool_query3 = {"bool": {"must_not": [inner_nested]}} + bool_query4 = {"bool": {"must": [term3, bool_query3]}} + queries = extract_nested_queries({"nested": {"path": "my", "query": bool_query4}}) + self.assertEqual(queries, [ + {"nested": {"path": "my", "query": term3, "_name": "spam"}}, + {"nested": {"path": "my", "query": bool_query3}}, + {"nested": {"path": "my", "query": inner_nested}}, + {"nested": {"path": "my", "_name": "foo", "query": { + "nested": {"path": "my.your", "query": term2} + }}}, + {"nested": { + "path": "my", "query": {"nested": {"path": "my.your", "query": bool_query1}}, + }}, + {"nested": {"path": "my", "_name": "bar", "query": { + "nested": {"path": "my.your", "query": term1} + }}}, + ]) + + def test_multiple_parallel_nested(self): + term1 = {"term": {"text": {"value": "bar", "_name": "bar"}}} + term2 = {"term": {"text": {"value": "foo", "_name": "foo"}}} + term3 = {"term": {"text": {"value": "spam", "_name": "spam"}}} + bool_query1 = {"bool": {"should": [term1]}} + bool_query2 = {"bool": {"must_not": [term2]}} + nested1 = {"nested": {"path": "my.your", "query": bool_query1}} + nested2 = {"nested": {"path": "my.his", "query": bool_query2}} + bool_query3 = {"bool": {"should": [nested2, nested1]}} + bool_query4 = {"bool": {"must": [term3, bool_query3]}} + queries = extract_nested_queries({"nested": {"path": "my", "query": bool_query4}}) + self.assertEqual(queries, [ + {"nested": {"path": "my", "query": term3, "_name": "spam"}}, + {"nested": {"path": "my", "query": bool_query3}}, + {"nested": {"path": "my", "query": nested2}}, + {"nested": {"path": "my", "query": nested1}}, + {"nested": {"path": "my", "_name": "foo", "query": { + "nested": {"path": "my.his", "query": term2} + }}}, + {"nested": {"path": "my", "_name": "bar", "query": { + "nested": {"path": "my.your", "query": term1} + }}}, + ]) diff --git a/tests/test_naming.py b/tests/test_naming.py index 840c3fb..c327534 100644 --- a/tests/test_naming.py +++ b/tests/test_naming.py @@ -150,10 +150,18 @@ class UtilitiesTestCase(TestCase): def test_matching_from_name(self): names = {"a": (0,), "b": (1,), "c": (0, 0), "d": (0, 1), "e": (1, 0, 1)} - self.assertEqual(matching_from_names([], names), set()) - self.assertEqual(matching_from_names(["a", "b"], names), {(0,), (1,)}) - self.assertEqual(matching_from_names(["a", "e"], names), {(0,), (1, 0, 1)}) - self.assertEqual(matching_from_names(["c"], names), {(0, 0)}) + self.assertEqual( + matching_from_names([], names), (set(), {(0,), (1,), (0, 0), (0, 1), (1, 0, 1)}) + ) + self.assertEqual( + matching_from_names(["a", "b"], names), ({(0,), (1,)}, {(0, 0), (0, 1), (1, 0, 1)}) + ) + self.assertEqual( + matching_from_names(["a", "e"], names), ({(0,), (1, 0, 1)}, {(1,), (0, 0), (0, 1)}) + ) + self.assertEqual( + matching_from_names(["c"], names), ({(0, 0)}, {(0,), (1,), (0, 1), (1, 0, 1)}) + ) with self.assertRaises(KeyError): matching_from_names(["x"], names) @@ -194,39 +202,49 @@ def setUpClass(cls): def test_or_operation(self): tree = OrOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + all_paths = {(0,), (1,), (2,)} - paths_ok, paths_ko = self.propagate_matching(tree, set()) + matching = set() + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, set()) self.assertEqual(paths_ko, {(), (0,), (1,), (2,), }) - paths_ok, paths_ko = self.propagate_matching(tree, {(2, )}) + matching = {(2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(), (2, )}) self.assertEqual(paths_ko, {(0,), (1,)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (2, )}) + matching = {(0, ), (2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(), (0, ), (2,)}) self.assertEqual(paths_ko, {(1,)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (1, ), (2, )}) + matching = {(0, ), (1, ), (2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(), (0,), (1,), (2, )}) self.assertEqual(paths_ko, set()) def test_and_operation(self): tree = AndOperation(Word("foo"), Phrase('"bar"'), Word("baz")) + all_paths = {(0,), (1,), (2,)} - paths_ok, paths_ko = self.propagate_matching(tree, set()) + matching = set() + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, set()) self.assertEqual(paths_ko, {(), (0,), (1,), (2,), }) - paths_ok, paths_ko = self.propagate_matching(tree, {(2, )}) + matching = {(2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(2, )}) self.assertEqual(paths_ko, {(), (0,), (1,)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (2, )}) + matching = {(0, ), (2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(0, ), (2,)}) self.assertEqual(paths_ko, {(), (1,)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, ), (1, ), (2, )}) + matching = {(0, ), (1, ), (2, )} + paths_ok, paths_ko = self.propagate_matching(tree, matching, all_paths - matching) self.assertEqual(paths_ok, {(), (0,), (1,), (2, )}) self.assertEqual(paths_ko, set()) @@ -237,49 +255,83 @@ def test_unknown_operation(self): tree_and = AndOperation(Word("foo"), Phrase('"bar"'), Word("baz")) propagate_or = self.propagate_matching propagate_and = MatchingPropagator(default_operation=AndOperation) + all_paths = {(0,), (1,), (2,)} for matching in [set(), {(2, )}, {(0, ), (2, )}, {(0, ), (1, ), (2, )}]: self.assertEqual( propagate_or(tree, matching), - self.propagate_matching(tree_or, matching), + self.propagate_matching(tree_or, matching, matching - all_paths), ) self.assertEqual( propagate_and(tree, matching), - self.propagate_matching(tree_and, matching), + self.propagate_matching(tree_and, matching, matching - all_paths), ) def test_negation(self): for tree in [Prohibit(Word("foo")), Not(Word("foo"))]: with self.subTest("%r" % type(tree)): - paths_ok, paths_ko = self.propagate_matching(tree, set()) - self.assertEqual(paths_ok, {(), (0,)}) - self.assertEqual(paths_ko, set()) + paths_ok, paths_ko = self.propagate_matching(tree, set(), {(0, )}) + self.assertEqual(paths_ok, {()}) + self.assertEqual(paths_ko, {(0,)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, )}) - self.assertEqual(paths_ok, set()) - self.assertEqual(paths_ko, {(), (0,)}) + paths_ok, paths_ko = self.propagate_matching(tree, {(0, )}, set()) + self.assertEqual(paths_ok, {(0,)}) + self.assertEqual(paths_ko, {()}) + + def test_nested_negation(self): + for NegClass in (Prohibit, Not): + with self.subTest("%r" % NegClass): + tree = AndOperation( + NegClass(OrOperation( + NegClass(AndOperation( + NegClass(Word("a")), + Word("b"), + )), + Word("c"), + )), + Word("d"), + ) + a, b, c, d = (0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 1), (0, 0, 1), (1,) + not_a, ab, not_ab = (0, 0, 0, 0, 0), (0, 0, 0, 0), (0, 0, 0) + abc, not_abc, abcd = (0, 0), (0,), () + + paths_ok, paths_ko = self.propagate_matching(tree, set(), {a, b, c, d}) + self.assertEqual(paths_ok, {not_a, not_ab, abc}) + self.assertEqual(paths_ko, {a, b, ab, c, not_abc, d, abcd}) + + paths_ok, paths_ko = self.propagate_matching(tree, {b, d}, {a, c}) + self.assertEqual(paths_ok, {not_a, b, ab, not_abc, d, abcd}) + self.assertEqual(paths_ko, {a, not_ab, c, abc}) + + paths_ok, paths_ko = self.propagate_matching(tree, {a, b, c}, {d}) + self.assertEqual(paths_ok, {a, b, not_ab, c, abc}) + self.assertEqual(paths_ko, {not_a, ab, not_abc, d, abcd}) + + paths_ok, paths_ko = self.propagate_matching(tree, {a, b, c, d}, set()) + self.assertEqual(paths_ok, {a, b, not_ab, c, abc, d}) + self.assertEqual(paths_ko, {not_a, ab, not_abc, abcd}) def test_single_element(self): for tree in [Word("a"), Phrase('"a"'), Regex("/a/")]: with self.subTest("%r" % type(tree)): paths_ok, paths_ko = self.propagate_matching(tree, set()) - self.assertEqual(paths_ok, set()) + self.assertEqual(paths_ok, set(), {()}) self.assertEqual(paths_ko, {()}) paths_ok, paths_ko = self.propagate_matching(tree, {()}) - self.assertEqual(paths_ok, {()}) + self.assertEqual(paths_ok, {()}, set()) self.assertEqual(paths_ko, set()) def test_no_propagation(self): for tree in [Range(Word("a"), Word("b")), Fuzzy(Word("foo")), Proximity('"bar baz"', 2)]: with self.subTest("%r" % type(tree)): - paths_ok, paths_ko = self.propagate_matching(tree, set()) + paths_ok, paths_ko = self.propagate_matching(tree, set(), {()}) # no down propagation self.assertEqual(paths_ok, set()) self.assertEqual(paths_ko, {()}) - paths_ok, paths_ko = self.propagate_matching(tree, {()}) + paths_ok, paths_ko = self.propagate_matching(tree, {()}, set()) self.assertEqual(paths_ok, {()}) self.assertEqual(paths_ko, set()) @@ -287,11 +339,11 @@ def test_simple_propagation(self): # propagation in nodes with only one children tree = Boost(Group(SearchField("foo", FieldGroup(Plus(Word("bar"))))), force=2) - paths_ok, paths_ko = self.propagate_matching(tree, set()) + paths_ok, paths_ko = self.propagate_matching(tree, set(), {()}) self.assertEqual(paths_ok, set()) self.assertEqual(paths_ko, {(), (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0)}) - paths_ok, paths_ko = self.propagate_matching(tree, {(0, 0, 0, 0, 0)}) + paths_ok, paths_ko = self.propagate_matching(tree, {()}, set()) self.assertEqual(paths_ok, {(), (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0)}) self.assertEqual(paths_ko, set()) @@ -330,16 +382,15 @@ def test_combination(self): to_path = simple_naming(tree) paths_ok, paths_ko = self.propagate_matching(tree, set()) - # prohibit and not make non matching parts to match self.assertEqual( paths_to_names(tree, paths_ok), - {"prohibit", "fuzzy", "not", "or2", "bar", "baz"}, + {"prohibit", "not"}, ) self.assertEqual( paths_to_names(tree, paths_ko), { "and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", - "boost", "group", "and3", "ham", "spam", + "boost", "group", "and3", "ham", "spam", "fuzzy", "or2", "bar", "baz" }, ) @@ -349,10 +400,15 @@ def test_combination(self): ) self.assertEqual( paths_to_names(tree, paths_ok), - {"and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", "ham", - "prohibit", "fuzzy", "not", "or2", "bar", "baz", } + { + "and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", "ham", + "prohibit", "not" + }, + ) + self.assertEqual( + paths_to_names(tree, paths_ko), + {"boost", "group", "and3", "spam", "fuzzy", "or2", "bar", "baz"}, ) - self.assertEqual(paths_to_names(tree, paths_ko), {"boost", "group", "and3", "spam", }) # making everything match paths_ok, paths_ko = self.propagate_matching( @@ -361,9 +417,9 @@ def test_combination(self): self.assertEqual( paths_to_names(tree, paths_ok), {"and", "or", "searchfield", "fieldgroup", "plus", "and2", "foo", "fizz", "ham", - "prohibit", "fuzzy", "boost", "group", "and3", "spam", "not", "or2", "bar", "baz", } + "prohibit", "boost", "group", "and3", "spam", "not"} ) - self.assertEqual(paths_to_names(tree, paths_ko), set()) + self.assertEqual(paths_to_names(tree, paths_ko), {"fuzzy", "or2", "bar", "baz"}) # making everything match, but some negative expression paths_ok, paths_ko = self.propagate_matching( @@ -377,14 +433,14 @@ def test_combination(self): paths_to_names(tree, paths_ok), { "or", "searchfield", "fieldgroup", "plus", "and2", - "foo", "fizz", "ham", "spam", "baz", + "foo", "fizz", "ham", "spam", "fuzzy", "or2", "bar", }, ) self.assertEqual( paths_to_names(tree, paths_ko), { - "and", "boost", "group", "and3", "prohibit", "fuzzy", - "boost", "group", "and3", "not", "or2", "bar" + "and", "boost", "group", "and3", "prohibit", + "boost", "group", "and3", "not", "baz", }, ) From aab0d5809da9d000af106d4a989dbd8e5805db68 Mon Sep 17 00:00:00 2001 From: pierre chene Date: Tue, 5 Jan 2021 15:09:35 +0100 Subject: [PATCH 6/7] Fix coverage --- .gitignore | 1 + luqum/elasticsearch/nested.py | 1 + tests/test_elasticsearch/test_nested.py | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 07fbf8e..19d7924 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ htmlcov/ .coverage .coverage.* .cache +.venv nosetests.xml coverage.xml *,cover diff --git a/luqum/elasticsearch/nested.py b/luqum/elasticsearch/nested.py index 0f047c9..35073dd 100644 --- a/luqum/elasticsearch/nested.py +++ b/luqum/elasticsearch/nested.py @@ -51,6 +51,7 @@ def extract_nested_queries(query, query_nester=None): queries = [] # this contains our result in_nested = query_nester is not None sub_query_nester = query_nester + if isinstance(query, dict): if "nested" in query: params = {k: v for k, v in query["nested"].items() if k not in ("query", "name")} diff --git a/tests/test_elasticsearch/test_nested.py b/tests/test_elasticsearch/test_nested.py index 47a9f6f..9e1ed69 100644 --- a/tests/test_elasticsearch/test_nested.py +++ b/tests/test_elasticsearch/test_nested.py @@ -1,6 +1,6 @@ from unittest import TestCase -from luqum.elasticsearch.nested import extract_nested_queries +from luqum.elasticsearch.nested import extract_nested_queries, get_first_name class NestedQueriesTestCase(TestCase): @@ -136,3 +136,9 @@ def test_multiple_parallel_nested(self): "nested": {"path": "my.your", "query": term1} }}}, ]) + + def test_get_first_name(self): + term = {"term": {"text": {"value": "bar", "_name": "bar"}}} + query = [{"query": term, "_name": "spam"}, {"query": term, "_name": "beurre"}] + name = get_first_name(query) + self.assertEqual(name, "spam") From 6ec12bb243eb2e6ed01f3781ed6460a2956f0d0d Mon Sep 17 00:00:00 2001 From: pierre chene Date: Tue, 5 Jan 2021 15:35:22 +0100 Subject: [PATCH 7/7] Fix style --- luqum/elasticsearch/nested.py | 1 - tests/test_elasticsearch/test_nested.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/luqum/elasticsearch/nested.py b/luqum/elasticsearch/nested.py index 35073dd..0f047c9 100644 --- a/luqum/elasticsearch/nested.py +++ b/luqum/elasticsearch/nested.py @@ -51,7 +51,6 @@ def extract_nested_queries(query, query_nester=None): queries = [] # this contains our result in_nested = query_nester is not None sub_query_nester = query_nester - if isinstance(query, dict): if "nested" in query: params = {k: v for k, v in query["nested"].items() if k not in ("query", "name")} diff --git a/tests/test_elasticsearch/test_nested.py b/tests/test_elasticsearch/test_nested.py index 9e1ed69..140b8fa 100644 --- a/tests/test_elasticsearch/test_nested.py +++ b/tests/test_elasticsearch/test_nested.py @@ -136,7 +136,7 @@ def test_multiple_parallel_nested(self): "nested": {"path": "my.your", "query": term1} }}}, ]) - + def test_get_first_name(self): term = {"term": {"text": {"value": "bar", "_name": "bar"}}} query = [{"query": term, "_name": "spam"}, {"query": term, "_name": "beurre"}]