diff --git a/documentation/src/main/asciidoc/backend-elasticsearch.asciidoc b/documentation/src/main/asciidoc/backend-elasticsearch.asciidoc index 8198b3fb596..a8c6df91355 100644 --- a/documentation/src/main/asciidoc/backend-elasticsearch.asciidoc +++ b/documentation/src/main/asciidoc/backend-elasticsearch.asciidoc @@ -421,21 +421,72 @@ The Elasticsearch `date` type does not support the whole range of years that can // Search 5 anchors backward compatibility [[elasticsearch-mapping-analyzer]] -[IMPORTANT] +<> is the text processing performed by analyzers, +both when indexing (document processing) +and when searching (query processing). + +All built-in Elasticsearch analyzers can be used transparently, +without any configuration in Hibernate Search: +just use their name wherever Hibernate Search expects an analyzer name. +However, in order to define custom analyzers, +analysis must be configured explicitly. + +[CAUTION] ==== -This section is currently incomplete. -A decent introduction is included in the getting started guide: see <>. +Elasticsearch analysis configuration is not applied immediately on startup: +it needs to be pushed to the Elasticsearch cluster. +Hibernate Search will only push the configuration to the cluster if specific conditions are met, +and only if instructed to do so +through the <>. ==== To configure analysis in an Elasticsearch backend, you will need to: -* Implement a bean that implements the `org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer` interface. -* Configure your backend to use that bean by setting the configuration property +* Define a class that implements the `org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer` interface. +* Configure the backend to use that implementation by setting the configuration property `hibernate.search.backends..analysis.configurer` -to a <> pointing to your bean. +to a <> pointing to the implementation. + +Hibernate Search will call the `configure` method of this implementation on startup, +and the configurer will be able to take advantage of a DSL to define analyzers: + +.Implementing and using an analysis configurer with the Elasticsearch backend +==== +[source, JAVA, indent=0, subs="+callouts"] +---- +include::{sourcedir}/org/hibernate/search/documentation/analysis/MyElasticsearchAnalysisConfigurer.java[tags=include] +---- +<1> Define a custom analyzer named "english", because it will be used to analyze English text such as book titles. +<2> Set the tokenizer to a standard tokenizer. +<3> Set the char filters. Char filters are applied in the order they are given, before the tokenizer. +<4> Set the token filters. Token filters are applied in the order they are given, after the tokenizer. +<5> Note that, for Elasticsearch, any parameterized char filter, tokenizer or token filter +must be defined separately and assigned a name. +<6> Set the value of a parameter for the char filter/tokenizer/token filter being defined. +<7> Normalizers are defined in a similar way, the only difference being that they cannot use a tokenizer. +<8> Multiple analyzers/normalizers can be defined in the same configurer. + +[source, XML, indent=0, subs="+callouts"] +---- +include::{resourcesdir}/analysis/elasticsearch-simple.properties[] +---- +<1> Assign the configurer to the backend `myBackend` using a Hibernate Search configuration property. 
+==== -// TODO add a simple example: configurer implementation + settings +It is also possible to assign a name to a parameterized built-in analyzer: +.Naming a parameterized built-in analyzer in the Elasticsearch backend +==== +[source, JAVA, indent=0, subs="+callouts"] +---- +include::{sourcedir}/org/hibernate/search/documentation/analysis/AdvancedElasticsearchAnalysisConfigurer.java[tags=type] +---- +<1> Define an analyzer with the given name and type. +<2> Set the value of a parameter for the analyzer being defined. +==== + +[TIP] +==== To know which character filters, tokenizers and token filters are available, refer to the documentation: @@ -445,9 +496,7 @@ refer to the documentation: {elasticsearchDocUrl}/analysis-charfilters.html[character filters], {elasticsearchDocUrl}/analysis-tokenizers.html[tokenizers], {elasticsearchDocUrl}/analysis-tokenfilters.html[token filters]. - - -// TODO add detailed description of each use case: normalizer, analyzer, custom, builtin type, ... +==== [[backend-elasticsearch-multi-tenancy]] == Multi-tenancy diff --git a/documentation/src/main/asciidoc/backend-lucene.asciidoc b/documentation/src/main/asciidoc/backend-lucene.asciidoc index 46d1f2350c0..34168b3987d 100644 --- a/documentation/src/main/asciidoc/backend-lucene.asciidoc +++ b/documentation/src/main/asciidoc/backend-lucene.asciidoc @@ -367,37 +367,60 @@ Date/time types do not support the whole range of years that can be represented [[backend-lucene-analysis]] == Analysis -[IMPORTANT] -==== -This section is currently incomplete. -A decent introduction is included in the getting started guide: see <>. -==== +<> is the text processing performed by analyzers, +both when indexing (document processing) +and when searching (query processing). To configure analysis in a Lucene backend, you will need to: -* Implement a bean that implements the `org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer` interface. -* Configure your backend to use that bean by setting the configuration property +* Define a class that implements the `org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer` interface. +* Configure the backend to use that implementation by setting the configuration property `hibernate.search.backends..analysis.configurer` -to a <> pointing to your bean. +to a <> pointing to the implementation. -// TODO add a simple example: configurer implementation + settings +Hibernate Search will call the `configure` method of this implementation on startup, +and the configurer will be able to take advantage of a DSL to define analyzers: -To know which character filters, tokenizers and token filters are available, -either browse the Lucene Javadoc or read the corresponding section on the -link:http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters[Solr Wiki]. - -[NOTE] +.Implementing and using an analysis configurer with the Lucene backend ==== -Why the reference to the Apache Solr wiki for Lucene? +[source, JAVA, indent=0, subs="+callouts"] +---- +include::{sourcedir}/org/hibernate/search/documentation/analysis/MyLuceneAnalysisConfigurer.java[tags=include] +---- +<1> Define a custom analyzer named "english", because it will be used to analyze English text such as book titles. +<2> Set the tokenizer to a standard tokenizer: components are referenced by their factory class. +<3> Set the char filters. Char filters are applied in the order they are given, before the tokenizer. +<4> Set the token filters. 
Token filters are applied in the order they are given, after the tokenizer. +<5> Set the value of a parameter for the last added char filter/tokenizer/token filter. +<6> Normalizers are defined in a similar way, the only difference being that they cannot use a tokenizer. +<7> Multiple analyzers/normalizers can be defined in the same configurer. + +[source, XML, indent=0, subs="+callouts"] +---- +include::{resourcesdir}/analysis/lucene-simple.properties[] +---- +<1> Assign the configurer to the backend `myBackend` using a Hibernate Search configuration property. +==== + +It is also possible to assign a name to a built-in analyzer, +or a custom analyzer implementation: -The analyzer factory framework was originally created in the Apache Solr project. -Most of these implementations have been moved to Apache Lucene, but the -documentation for these additional analyzers can still be found in the Solr Wiki. You might find -other documentation referring to the "Solr Analyzer Framework"; just remember you don't need to -depend on Apache Solr anymore: the required classes are part of the core Lucene distribution. +.Naming an analyzer instance in the Lucene backend +==== +[source, JAVA, indent=0, subs="+callouts"] +---- +include::{sourcedir}/org/hibernate/search/documentation/analysis/AdvancedLuceneAnalysisConfigurer.java[tags=instance] +---- ==== -// TODO add detailed description of each use case: normalizer, analyzer, by instance, by factory, ... +[TIP] +==== +To know which analyzers, character filters, tokenizers and token filters are available, +either browse the Lucene Javadoc or read the corresponding section on the +link:http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters[Solr Wiki] +(you don't need Solr to use these analyzers, +it's just that there is no documentation page for Lucene proper). +==== [[backend-lucene-multi-tenancy]] == Multi-tenancy diff --git a/documentation/src/main/asciidoc/concepts.asciidoc b/documentation/src/main/asciidoc/concepts.asciidoc index eb0309bbf52..cecf8c017bb 100644 --- a/documentation/src/main/asciidoc/concepts.asciidoc +++ b/documentation/src/main/asciidoc/concepts.asciidoc @@ -3,37 +3,156 @@ [[concepts-full-text]] == Full-text search -include::todo-placeholder.asciidoc[] +Full-text search is a set of techniques for searching, +in a corpus of text documents, +the documents that best match a given query. -// TODO maybe give a short introduction to full-text search and full-text indexes? +The main difference with traditional search -- for example in an SQL database -- +is that the stored text is not considered as a single block of text, +but as a collection of tokens (words). + +Hibernate Search relies on either http://lucene.apache.org/[Apache Lucene] +or https://www.elastic.co/products/elasticsearch[Elasticsearch] +to implement full-text search. +Since Elasticsearch uses Lucene internally, +they share a lot of characteristics and their general approach to full-text search. + +To simplify, these search engines are based on the concept of inverted indexes: +a dictionary where the key is a token (word) found in a document, +and the value is the list of identifiers of every document containing this token. + +Still simplifying, once all documents are indexed, +searching for documents involves three steps: + +. extracting tokens (words) from the query; +. looking up these tokens in the index to find matching documents; +. aggregating the results of the lookups to produce a list of matching documents. 
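+
+To make the idea concrete, here is a minimal, purely illustrative sketch
+of an inverted index in plain Java -- it is not part of any Hibernate Search or Lucene API,
+and real engines use far more sophisticated data structures:
+
+[source, JAVA, indent=0]
+----
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+// Hypothetical sketch of an inverted index, for illustration only.
+class NaiveInvertedIndex {
+    private final Map<String, Set<Integer>> tokenToDocumentIds = new HashMap<>();
+
+    // Indexing: associate each token of the document with the document identifier.
+    void index(int documentId, String text) {
+        for ( String token : text.toLowerCase( Locale.ROOT ).split( "\\s+" ) ) {
+            tokenToDocumentIds.computeIfAbsent( token, ignored -> new HashSet<>() )
+                    .add( documentId );
+        }
+    }
+
+    // Searching: extract tokens from the query, look each one up,
+    // and aggregate the results into a set of matching document identifiers.
+    Set<Integer> search(String query) {
+        Set<Integer> matches = new HashSet<>();
+        for ( String token : query.toLowerCase( Locale.ROOT ).split( "\\s+" ) ) {
+            matches.addAll( tokenToDocumentIds.getOrDefault( token, Collections.emptySet() ) );
+        }
+        return matches;
+    }
+}
+----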
+
+[NOTE]
+====
+Lucene and Elasticsearch are not limited to just text search: numeric data is also supported,
+including integers, doubles, longs, dates, etc.
+These types are indexed and queried using a slightly different approach,
+which obviously does not involve text processing.
+====
 
 [[concepts-mapping]]
 == Mapping
 
-include::todo-placeholder.asciidoc[]
+Applications targeted by Hibernate Search generally use an entity-based model to represent data.
+In this model, each entity is a single object with a few properties of atomic type
+(`String`, `Integer`, `LocalDate`, ...).
+Each entity can have multiple associations to one or even many other entities.
+
+Entities are thus organized as a graph,
+where each node is an entity and each association is an edge.
+
+By contrast, Lucene and Elasticsearch work with documents.
+Each document is a collection of "fields",
+each field being assigned a name -- a unique string --
+and a value -- which can be text, but also numeric data such as an integer or a date.
+Fields also have a type, which not only determines the type of values (text/numeric),
+but more importantly the way this value will be stored: indexed, stored, with doc values, etc.
+It is possible to introduce nested documents, but not real associations.
+
+Documents are thus organized, at best, as a collection of trees,
+where each tree is a document, optionally with nested documents.
 
-// TODO maybe explain what we mean by "mapping"?
-// TODO explain what an "entity" is and what it implies
+There are multiple mismatches between the entity model and the document model:
+properties vs. fields, associations vs. nested documents, graph vs. collection of trees.
+
+The goal of _mapping_, in Hibernate Search, is to resolve these mismatches
+by defining how to transform one or more entities into a document,
+and how to resolve a search hit back into the original entity.
+This is the main added value of Hibernate Search,
+the basis for everything else from automatic indexing to the various search DSLs.
+
+Mapping is usually configured using annotations in the entity model,
+but this can also be achieved using a programmatic API.
+To learn more about how to configure mapping, see <>.
+
+To learn how to index the resulting documents, see <>
+(hint: it's automatic).
+
+To learn how to search with an API
+that takes advantage of the mapping to be closer to the entity model,
+in particular by returning hits as entities instead of just document identifiers,
+see <>.
 
 [[concepts-analysis]]
 == Analysis
 
 // Search 5 anchors backward compatibility
 [[analyzer]]
-[IMPORTANT]
+As mentioned in <>,
+the full-text engine works on tokens,
+which means text has to be processed
+both when indexing (document processing, to build the token -> document index)
+and when searching (query processing, to generate a list of tokens to look up).
+
+However, the processing is not *just* about "tokenizing".
+Index lookups are *exact* lookups,
+which means that looking up `Great` (capitalized) will not return documents containing only `great` (all lowercase).
+An extra step is performed when processing text to address this caveat:
+token filtering, which normalizes tokens.
+Thanks to that "normalization",
+`Great` will be indexed as `great`,
+so that an index lookup for the query `great` will match as expected.
+
+In the Lucene world (Lucene, Elasticsearch, Solr, ...),
+text processing during both the indexing and searching phases
+is called "analysis" and is performed by an "analyzer".
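+
+As a purely illustrative aside, the effect of an analyzer can be observed
+with a few lines of standalone code using the Lucene APIs directly
+(this is not how Hibernate Search is configured, merely a way to see the produced tokens):
+
+[source, JAVA, indent=0]
+----
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+public class AnalysisDemo {
+    public static void main(String[] args) throws IOException {
+        try ( Analyzer analyzer = new EnglishAnalyzer();
+                TokenStream stream = analyzer.tokenStream( "myField", "The GREAT resumes" ) ) {
+            CharTermAttribute term = stream.addAttribute( CharTermAttribute.class );
+            stream.reset();
+            while ( stream.incrementToken() ) {
+                // Prints the normalized tokens, e.g. lowercased and stemmed,
+                // with stopwords such as "The" removed.
+                System.out.println( term.toString() );
+            }
+            stream.end();
+        }
+    }
+}
+----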
+
+The analyzer is made up of three types of components,
+which will each process the text successively in the following order:
+
+. Character filter: transforms the input characters. Replaces, adds or removes characters.
+. Tokenizer: splits the text into several words, called "tokens".
+. Token filter: transforms the tokens. Replaces, adds or removes characters in a token,
+derives new tokens from the existing ones, removes tokens based on some condition, ...
+
+The tokenizer usually splits on whitespace (though there are other options).
+Token filters are usually where customization takes place.
+They can remove accented characters,
+remove meaningless suffixes (`-ing`, `-s`, ...)
+or tokens (`a`, `the`, ...),
+replace tokens with a chosen spelling (`wi-fi` => `wifi`),
+etc.
+
+[TIP]
 ====
-This section is currently incomplete.
-A decent introduction is included in the getting started guide: see <>.
+Character filters, though useful, are rarely used,
+because they have no knowledge of token boundaries.
+
+Unless you know what you are doing,
+you should generally favor token filters.
 ====
 
-////
-TODO The getting started section has a link pointing here and expects the section to
-include a detailed explanation of analysis, how it works and how to configure it in HSearch.
-We also need to explain the difference between analyzer and normalizer.
-////
+In some cases, it is necessary to index text in one block,
+without any tokenization:
+
+* For some types of text, such as SKUs or other business codes,
+tokenization simply does not make sense: the text is a single "keyword".
+* For sorts by field value, tokenization is not necessary.
+It is also forbidden in Hibernate Search due to performance issues;
+only non-tokenized fields can be sorted on.
+
+To address these use cases,
+a special type of analyzer, called "normalizer", is available.
+Normalizers are simply analyzers that are guaranteed not to use a tokenizer:
+they can only use character filters and token filters.
+
+In Hibernate Search, analyzers and normalizers are referenced by their name,
+for example <>.
+Analyzers and normalizers have two separate namespaces.
+
+Some names are already assigned to built-in analyzers (in Elasticsearch in particular),
+but it is possible (and recommended) to assign names to custom analyzers and normalizers,
+assembled using built-in components (tokenizers, filters) to address your specific needs.
 
-For more information about how to configure analysis,
-see the documentation of each backend:
+Each backend exposes its own APIs to define analyzers and normalizers,
+and generally to configure analysis.
+See the documentation of each backend for more information:
 
 * <>
 * <>
diff --git a/documentation/src/main/asciidoc/getting-started.asciidoc b/documentation/src/main/asciidoc/getting-started.asciidoc
index f0de21e8728..f974349b0d3 100644
--- a/documentation/src/main/asciidoc/getting-started.asciidoc
+++ b/documentation/src/main/asciidoc/getting-started.asciidoc
@@ -338,33 +338,29 @@ See <> for more information.
 
 Exact matches are well and good,
 but obviously not what you would expect from a full-text search engine.
-For non-exact matches, you will need to configure *analysis*.
-
-=== Concept
-
-In the Lucene world (Lucene, Elasticsearch, Solr, ...),
-non-exact matches can be achieved by applying what is called an "analyzer"
-to *both* documents (when indexing) and search terms (when querying).
-
-The analyzer will perform three steps, delegated to the following components, in the following order:
-
-. Character filter: transforms the input text: replaces, adds or removes characters.
-This step is rarely used, generally text is transformed in the third step.
-. Tokenizer: splits the text into several words, called "tokens".
-. Token filter: transforms the tokens: replaces, add or removes characters in a token,
-derives new tokens from the existing ones, removes tokens based on some condition, ...
-
-In order to perform non-exact matches, you will need to either pick a pre-defined analyzer,
-or define your own by combining character filters, a tokenizer, and token filters.
-
-The following section will give a reasonable example of a general-purpose analyzer.
-For more advanced use cases, refer to the <> section.
-
-=== Configuration
-
-Once you know what analysis is and which analyzer you want to apply,
-you will need to define it, or at least give it a name in Hibernate Search.
-This is done though analysis configurers, which are defined per backend:
+For non-exact matches, you will need to configure *analysis*,
+i.e. how text is supposed to be processed when indexing and searching.
+This involves _analyzers_,
+which are made up of three types of components, applied one after the other:
+
+* (rarely) zero or more character filters, to clean up the input text:
+`A <strong>GREAT</strong> résume` => `A GREAT résume`.
+* a tokenizer, to split the input text into words, called "tokens":
+`A GREAT résume` => `[A, GREAT, résume]`.
+* zero or more token filters, to normalize the tokens and remove meaningless tokens:
+`[A, GREAT, résume]` => `[great, resume]`.
+
+There are built-in analyzers,
+but it is generally better to build your own
+by picking the filters and tokenizer most suited to your specific needs.
+
+The following paragraphs will explain how to configure and use
+a simple yet reasonably useful analyzer.
+For more information about analysis and how to configure it,
+refer to the <> section.
+
+Each custom analyzer needs to be given a name in Hibernate Search.
+This is done through analysis configurers, which are defined per backend:
 
 . First, you need to implement an analysis configurer,
 a Java class that implements a backend-specific interface:
 `LuceneAnalysisConfigurer` or `ElasticsearchAnalysisConfigurer`.
diff --git a/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedElasticsearchAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedElasticsearchAnalysisConfigurer.java
new file mode 100644
index 00000000000..1b086d5e7cb
--- /dev/null
+++ b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedElasticsearchAnalysisConfigurer.java
@@ -0,0 +1,21 @@
+/*
+ * Hibernate Search, full-text search for your domain model
+ *
+ * License: GNU Lesser General Public License (LGPL), version 2.1 or later
+ * See the lgpl.txt file in the root directory or .
+ */ +// tag::include[] +package org.hibernate.search.documentation.analysis; + +import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext; +import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer; + +public class AdvancedElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisConfigurer { + @Override + public void configure(ElasticsearchAnalysisConfigurationContext context) { + // tag::type[] + context.analyzer( "english_stopwords" ).type( "standard" ) // <1> + .param( "stopwords", "_english_" ); // <2> + // end::type[] + } +} diff --git a/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedLuceneAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedLuceneAnalysisConfigurer.java new file mode 100644 index 00000000000..8c479214c3b --- /dev/null +++ b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AdvancedLuceneAnalysisConfigurer.java @@ -0,0 +1,21 @@ +/* + * Hibernate Search, full-text search for your domain model + * + * License: GNU Lesser General Public License (LGPL), version 2.1 or later + * See the lgpl.txt file in the root directory or . + */ +package org.hibernate.search.documentation.analysis; + +import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext; +import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +public class AdvancedLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer { + @Override + public void configure(LuceneAnalysisConfigurationContext context) { + // tag::instance[] + context.analyzer( "standard" ).instance( new StandardAnalyzer() ); + // end::instance[] + } +} diff --git a/documentation/src/test/java/org/hibernate/search/documentation/analysis/AnalysisIT.java b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AnalysisIT.java new file mode 100644 index 00000000000..22d39d7456f --- /dev/null +++ b/documentation/src/test/java/org/hibernate/search/documentation/analysis/AnalysisIT.java @@ -0,0 +1,228 @@ +/* + * Hibernate Search, full-text search for your domain model + * + * License: GNU Lesser General Public License (LGPL), version 2.1 or later + * See the lgpl.txt file in the root directory or . 
+ */ +package org.hibernate.search.documentation.analysis; + +import static org.assertj.core.api.Assertions.assertThat; + +import javax.persistence.Entity; +import javax.persistence.EntityManagerFactory; +import javax.persistence.GeneratedValue; +import javax.persistence.Id; + +import org.hibernate.search.backend.elasticsearch.cfg.ElasticsearchBackendSettings; +import org.hibernate.search.backend.lucene.cfg.LuceneBackendSettings; +import org.hibernate.search.documentation.testsupport.BackendConfigurations; +import org.hibernate.search.documentation.testsupport.ElasticsearchBackendConfiguration; +import org.hibernate.search.documentation.testsupport.LuceneBackendConfiguration; +import org.hibernate.search.mapper.orm.Search; +import org.hibernate.search.mapper.orm.automaticindexing.AutomaticIndexingSynchronizationStrategyName; +import org.hibernate.search.mapper.orm.cfg.HibernateOrmMapperSettings; +import org.hibernate.search.mapper.orm.mapping.HibernateOrmSearchMappingConfigurer; +import org.hibernate.search.mapper.orm.session.SearchSession; +import org.hibernate.search.mapper.pojo.mapping.definition.annotation.Indexed; +import org.hibernate.search.util.impl.integrationtest.common.rule.BackendConfiguration; +import org.hibernate.search.util.impl.integrationtest.orm.OrmSetupHelper; +import org.hibernate.search.util.impl.integrationtest.orm.OrmUtils; + +import org.junit.Assume; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class AnalysisIT { + + private static final String BACKEND_NAME = "myBackend"; // Don't change, the same name is used in property files + + @Parameterized.Parameters(name = "{0}") + public static Object[] backendConfigurations() { + return BackendConfigurations.simple().toArray(); + } + + @Rule + public OrmSetupHelper setupHelper; + + private final BackendConfiguration backendConfiguration; + + public AnalysisIT(BackendConfiguration backendConfiguration) { + this.setupHelper = OrmSetupHelper.withSingleBackend( BACKEND_NAME, backendConfiguration ); + this.backendConfiguration = backendConfiguration; + } + + @Test + public void simple() { + EntityManagerFactory entityManagerFactory = setupHelper.start() + .withProperty( + HibernateOrmMapperSettings.AUTOMATIC_INDEXING_SYNCHRONIZATION_STRATEGY, + AutomaticIndexingSynchronizationStrategyName.SEARCHABLE + ) + .withProperties( + backendConfiguration instanceof LuceneBackendConfiguration + ? 
"/analysis/lucene-simple.properties" + : "/analysis/elasticsearch-simple.properties" + ) + .withProperty( + HibernateOrmMapperSettings.MAPPING_CONFIGURER, + (HibernateOrmSearchMappingConfigurer) context -> context.programmaticMapping() + .type( IndexedEntity.class ) + .property( "text" ) + .fullTextField( "english" ).analyzer( "english" ) + .fullTextField( "french" ).analyzer( "french" ) + .keywordField( "lowercase" ).normalizer( "lowercase" ) + ) + .setup( IndexedEntity.class ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + IndexedEntity entity = new IndexedEntity(); + // Mix French and English to test multiple analyzers with different stemmers + entity.setText( "THE châtié wording" ); + entityManager.persist( entity ); + } ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + SearchSession searchSession = Search.session( entityManager ); + + assertThat( + searchSession.search( IndexedEntity.class ) + .predicate( factory -> factory.match() + .field( "english" ) + .matching( "worded" ) + ) + .fetchHits( 20 ) + ) + .hasSize( 1 ); + + assertThat( + searchSession.search( IndexedEntity.class ) + .predicate( factory -> factory.match() + .field( "french" ) + .matching( "châtier" ) + ) + .fetchHits( 20 ) + ) + .hasSize( 1 ); + + assertThat( + searchSession.search( IndexedEntity.class ) + .predicate( factory -> factory.match() + .field( "lowercase" ) + .matching( "the châtié WORDING" ) + ) + .fetchHits( 20 ) + ) + .hasSize( 1 ); + } ); + } + + @Test + public void lucene_advanced() { + Assume.assumeTrue( backendConfiguration instanceof LuceneBackendConfiguration ); + + EntityManagerFactory entityManagerFactory = setupHelper.start() + .withProperty( + HibernateOrmMapperSettings.AUTOMATIC_INDEXING_SYNCHRONIZATION_STRATEGY, + AutomaticIndexingSynchronizationStrategyName.SEARCHABLE + ) + .withBackendProperty( + BACKEND_NAME, LuceneBackendSettings.ANALYSIS_CONFIGURER, + new AdvancedLuceneAnalysisConfigurer() + ) + .withProperty( + HibernateOrmMapperSettings.MAPPING_CONFIGURER, + (HibernateOrmSearchMappingConfigurer) context -> context.programmaticMapping() + .type( IndexedEntity.class ) + .property( "text" ) + .fullTextField( "standard" ).analyzer( "standard" ) + ) + .setup( IndexedEntity.class ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + IndexedEntity entity = new IndexedEntity(); + entity.setText( "the Wording" ); + entityManager.persist( entity ); + } ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + SearchSession searchSession = Search.session( entityManager ); + + assertThat( + searchSession.search( IndexedEntity.class ) + .predicate( factory -> factory.match() + .field( "standard" ) + .matching( "wording" ) + ) + .fetchHits( 20 ) + ) + .hasSize( 1 ); + } ); + } + + @Test + public void elasticsearch_advanced() { + Assume.assumeTrue( backendConfiguration instanceof ElasticsearchBackendConfiguration ); + + EntityManagerFactory entityManagerFactory = setupHelper.start() + .withProperty( + HibernateOrmMapperSettings.AUTOMATIC_INDEXING_SYNCHRONIZATION_STRATEGY, + AutomaticIndexingSynchronizationStrategyName.SEARCHABLE + ) + .withBackendProperty( + BACKEND_NAME, ElasticsearchBackendSettings.ANALYSIS_CONFIGURER, + new AdvancedElasticsearchAnalysisConfigurer() + ) + .withProperty( + HibernateOrmMapperSettings.MAPPING_CONFIGURER, + (HibernateOrmSearchMappingConfigurer) context -> context.programmaticMapping() + .type( IndexedEntity.class ) + .property( "text" ) + .fullTextField( 
"standard" ).analyzer( "standard" ) + ) + .setup( IndexedEntity.class ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + IndexedEntity entity = new IndexedEntity(); + entity.setText( "the Wording" ); + entityManager.persist( entity ); + } ); + + OrmUtils.withinJPATransaction( entityManagerFactory, entityManager -> { + SearchSession searchSession = Search.session( entityManager ); + + assertThat( + searchSession.search( IndexedEntity.class ) + .predicate( factory -> factory.match() + .field( "standard" ) + .matching( "wording" ) + ) + .fetchHits( 20 ) + ) + .hasSize( 1 ); + } ); + } + + @Entity(name = IndexedEntity.NAME) + @Indexed(index = IndexedEntity.NAME) + static class IndexedEntity { + + static final String NAME = "indexed"; + + @Id + @GeneratedValue + private Integer id; + + private String text; + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + } +} diff --git a/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyElasticsearchAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyElasticsearchAnalysisConfigurer.java new file mode 100644 index 00000000000..3ad3cc39e0c --- /dev/null +++ b/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyElasticsearchAnalysisConfigurer.java @@ -0,0 +1,37 @@ +/* + * Hibernate Search, full-text search for your domain model + * + * License: GNU Lesser General Public License (LGPL), version 2.1 or later + * See the lgpl.txt file in the root directory or . + */ +// tag::include[] +package org.hibernate.search.documentation.analysis; + +import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext; +import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer; + +public class MyElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisConfigurer { + @Override + public void configure(ElasticsearchAnalysisConfigurationContext context) { + context.analyzer( "english" ).custom() // <1> + .tokenizer( "standard" ) // <2> + .charFilters( "html_strip" ) // <3> + .tokenFilters( "lowercase", "snowball_english", "asciifolding" ); // <4> + + context.tokenFilter( "snowball_english" ) // <5> + .type( "snowball" ) + .param( "language", "English" ); // <6> + + context.normalizer( "lowercase" ).custom() // <7> + .tokenFilters( "lowercase", "asciifolding" ); + + context.analyzer( "french" ).custom() // <8> + .tokenizer( "standard" ) + .tokenFilters( "lowercase", "snowball_french", "asciifolding" ); + + context.tokenFilter( "snowball_french" ) + .type( "snowball" ) + .param( "language", "French" ); + } +} +// end::include[] diff --git a/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyLuceneAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyLuceneAnalysisConfigurer.java new file mode 100644 index 00000000000..d96981c0ac2 --- /dev/null +++ b/documentation/src/test/java/org/hibernate/search/documentation/analysis/MyLuceneAnalysisConfigurer.java @@ -0,0 +1,43 @@ +/* + * Hibernate Search, full-text search for your domain model + * + * License: GNU Lesser General Public License (LGPL), version 2.1 or later + * See the lgpl.txt file in the root directory or . 
+ */ +// tag::include[] +package org.hibernate.search.documentation.analysis; + +import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext; +import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer; + +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; +import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; +import org.apache.lucene.analysis.standard.StandardTokenizerFactory; + +public class MyLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer { + @Override + public void configure(LuceneAnalysisConfigurationContext context) { + context.analyzer( "english" ).custom() // <1> + .tokenizer( StandardTokenizerFactory.class ) // <2> + .charFilter( HTMLStripCharFilterFactory.class ) // <3> + .tokenFilter( LowerCaseFilterFactory.class ) // <4> + .tokenFilter( SnowballPorterFilterFactory.class ) // <4> + .param( "language", "English" ) // <5> + .tokenFilter( ASCIIFoldingFilterFactory.class ); + + context.normalizer( "lowercase" ).custom() // <6> + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); + + context.analyzer( "french" ).custom() // <7> + .tokenizer( StandardTokenizerFactory.class ) + .charFilter( HTMLStripCharFilterFactory.class ) + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( SnowballPorterFilterFactory.class ) + .param( "language", "French" ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); + } +} +// end::include[] diff --git a/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyElasticsearchAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyElasticsearchAnalysisConfigurer.java index f2b27a3e657..e3fee49f603 100644 --- a/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyElasticsearchAnalysisConfigurer.java +++ b/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyElasticsearchAnalysisConfigurer.java @@ -15,7 +15,7 @@ public class MyElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisC public void configure(ElasticsearchAnalysisConfigurationContext context) { context.analyzer( "english" ).custom() // <1> .tokenizer( "standard" ) // <2> - .tokenFilters( "asciifolding", "lowercase", "snowball_english" ); // <3> + .tokenFilters( "lowercase", "snowball_english", "asciifolding" ); // <3> context.tokenFilter( "snowball_english" ) // <4> .type( "snowball" ) @@ -23,7 +23,7 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { context.analyzer( "name" ).custom() // <6> .tokenizer( "standard" ) - .tokenFilters( "asciifolding", "lowercase" ); + .tokenFilters( "lowercase", "asciifolding" ); } } // end::include[] diff --git a/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyLuceneAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyLuceneAnalysisConfigurer.java index 52f36edfc8c..7c88dd312e6 100644 --- a/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyLuceneAnalysisConfigurer.java +++ 
b/documentation/src/test/java/org/hibernate/search/documentation/gettingstarted/withhsearch/withanalysis/MyLuceneAnalysisConfigurer.java @@ -20,15 +20,15 @@ public class MyLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer { public void configure(LuceneAnalysisConfigurationContext context) { context.analyzer( "english" ).custom() // <1> .tokenizer( StandardTokenizerFactory.class ) // <2> - .tokenFilter( ASCIIFoldingFilterFactory.class ) // <3> .tokenFilter( LowerCaseFilterFactory.class ) // <3> .tokenFilter( SnowballPorterFilterFactory.class ) // <3> - .param( "language", "English" ); // <4> + .param( "language", "English" ) // <4> + .tokenFilter( ASCIIFoldingFilterFactory.class ); context.analyzer( "name" ).custom() // <5> .tokenizer( StandardTokenizerFactory.class ) - .tokenFilter( ASCIIFoldingFilterFactory.class ) - .tokenFilter( LowerCaseFilterFactory.class ); + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); } } // end::include[] diff --git a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchBackendConfiguration.java b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchBackendConfiguration.java index 9311b0e40cb..7fe652fee8f 100644 --- a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchBackendConfiguration.java +++ b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchBackendConfiguration.java @@ -17,7 +17,6 @@ public class ElasticsearchBackendConfiguration extends AbstractDocumentationBackendConfiguration { - // This will be used in a later commit protected final TestElasticsearchClient testElasticsearchClient = new TestElasticsearchClient(); @Override diff --git a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchSimpleMappingAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchSimpleMappingAnalysisConfigurer.java index 71fb89de244..18aba34e5c4 100644 --- a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchSimpleMappingAnalysisConfigurer.java +++ b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/ElasticsearchSimpleMappingAnalysisConfigurer.java @@ -14,7 +14,7 @@ class ElasticsearchSimpleMappingAnalysisConfigurer implements ElasticsearchAnaly public void configure(ElasticsearchAnalysisConfigurationContext context) { context.analyzer( "english" ).custom() .tokenizer( "standard" ) - .tokenFilters( "asciifolding", "lowercase", "snowball_english" ); + .tokenFilters( "lowercase", "snowball_english", "asciifolding" ); context.tokenFilter( "snowball_english" ) .type( "snowball" ) @@ -22,11 +22,11 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { context.analyzer( "name" ).custom() .tokenizer( "standard" ) - .tokenFilters( "asciifolding", "lowercase" ); + .tokenFilters( "lowercase", "asciifolding" ); context.analyzer( "autocomplete_indexing" ).custom() .tokenizer( "standard" ) - .tokenFilters( "asciifolding", "lowercase", "snowball_english", "autocomplete_edge_ngram" ); + .tokenFilters( "lowercase", "snowball_english", "asciifolding", "autocomplete_edge_ngram" ); context.tokenFilter( "autocomplete_edge_ngram" ) .type( "edge_ngram" ) @@ -36,14 +36,14 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) { // Same as "autocomplete-indexing", but without the edge-ngram filter 
context.analyzer( "autocomplete_query" ).custom() .tokenizer( "standard" ) - .tokenFilters( "asciifolding", "lowercase", "snowball_english" ); + .tokenFilters( "lowercase", "snowball_english", "asciifolding" ); // Normalizers context.normalizer( "english" ).custom() - .tokenFilters( "asciifolding", "lowercase" ); + .tokenFilters( "lowercase", "asciifolding" ); context.normalizer( "name" ).custom() - .tokenFilters( "asciifolding", "lowercase" ); + .tokenFilters( "lowercase", "asciifolding" ); } } diff --git a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/LuceneSimpleMappingAnalysisConfigurer.java b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/LuceneSimpleMappingAnalysisConfigurer.java index d8e5d0b28c7..1be59bc3e26 100644 --- a/documentation/src/test/java/org/hibernate/search/documentation/testsupport/LuceneSimpleMappingAnalysisConfigurer.java +++ b/documentation/src/test/java/org/hibernate/search/documentation/testsupport/LuceneSimpleMappingAnalysisConfigurer.java @@ -20,22 +20,22 @@ class LuceneSimpleMappingAnalysisConfigurer implements LuceneAnalysisConfigurer public void configure(LuceneAnalysisConfigurationContext context) { context.analyzer( "english" ).custom() .tokenizer( StandardTokenizerFactory.class ) - .tokenFilter( ASCIIFoldingFilterFactory.class ) .tokenFilter( LowerCaseFilterFactory.class ) .tokenFilter( SnowballPorterFilterFactory.class ) - .param( "language", "English" ); + .param( "language", "English" ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); context.analyzer( "name" ).custom() .tokenizer( StandardTokenizerFactory.class ) - .tokenFilter( ASCIIFoldingFilterFactory.class ) - .tokenFilter( LowerCaseFilterFactory.class ); + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); context.analyzer( "autocomplete_indexing" ).custom() .tokenizer( StandardTokenizerFactory.class ) - .tokenFilter( ASCIIFoldingFilterFactory.class ) .tokenFilter( LowerCaseFilterFactory.class ) .tokenFilter( SnowballPorterFilterFactory.class ) .param( "language", "English" ) + .tokenFilter( ASCIIFoldingFilterFactory.class ) .tokenFilter( EdgeNGramFilterFactory.class ) .param( "minGramSize", "3" ) .param( "maxGramSize", "7" ); @@ -43,19 +43,19 @@ public void configure(LuceneAnalysisConfigurationContext context) { // Same as "autocomplete-indexing", but without the edge-ngram filter context.analyzer( "autocomplete_query" ).custom() .tokenizer( StandardTokenizerFactory.class ) - .tokenFilter( ASCIIFoldingFilterFactory.class ) .tokenFilter( LowerCaseFilterFactory.class ) .tokenFilter( SnowballPorterFilterFactory.class ) - .param( "language", "English" ); + .param( "language", "English" ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); // Normalizers context.normalizer( "english" ).custom() - .tokenFilter( ASCIIFoldingFilterFactory.class ) - .tokenFilter( LowerCaseFilterFactory.class ); + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); context.normalizer( "name" ).custom() - .tokenFilter( ASCIIFoldingFilterFactory.class ) - .tokenFilter( LowerCaseFilterFactory.class ); + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); } } diff --git a/documentation/src/test/resources/analysis/elasticsearch-simple.properties b/documentation/src/test/resources/analysis/elasticsearch-simple.properties new file mode 100644 index 00000000000..f6c31255970 --- /dev/null +++ 
b/documentation/src/test/resources/analysis/elasticsearch-simple.properties @@ -0,0 +1,2 @@ +# <1> +hibernate.search.backends.myBackend.analysis.configurer org.hibernate.search.documentation.analysis.MyElasticsearchAnalysisConfigurer \ No newline at end of file diff --git a/documentation/src/test/resources/analysis/lucene-simple.properties b/documentation/src/test/resources/analysis/lucene-simple.properties new file mode 100644 index 00000000000..32d729e1d6a --- /dev/null +++ b/documentation/src/test/resources/analysis/lucene-simple.properties @@ -0,0 +1,2 @@ +# <1> +hibernate.search.backends.myBackend.analysis.configurer org.hibernate.search.documentation.analysis.MyLuceneAnalysisConfigurer \ No newline at end of file diff --git a/integrationtest/jdk/java-modules/src/main/java/org/hibernate/search/integrationtest/java/module/config/MyElasticsearchAnalysisConfigurer.java b/integrationtest/jdk/java-modules/src/main/java/org/hibernate/search/integrationtest/java/module/config/MyElasticsearchAnalysisConfigurer.java index fdce747f2a3..7c4ee8e14dc 100644 --- a/integrationtest/jdk/java-modules/src/main/java/org/hibernate/search/integrationtest/java/module/config/MyElasticsearchAnalysisConfigurer.java +++ b/integrationtest/jdk/java-modules/src/main/java/org/hibernate/search/integrationtest/java/module/config/MyElasticsearchAnalysisConfigurer.java @@ -17,7 +17,7 @@ public class MyElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisC public void configure(ElasticsearchAnalysisConfigurationContext context) { context.analyzer( MY_ANALYZER ).custom() .withTokenizer( "standard" ) - .withTokenFilters( "asciifolding", "lowercase", "mySnowballFilter" ); + .withTokenFilters( "lowercase", "mySnowballFilter", "asciifolding" ); context.tokenFilter( "mySnowballFilter" ) .type( "snowball" ) diff --git a/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/ElasticsearchLibraryAnalysisConfigurer.java b/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/ElasticsearchLibraryAnalysisConfigurer.java index 44b9250ff86..d8f46328263 100644 --- a/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/ElasticsearchLibraryAnalysisConfigurer.java +++ b/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/ElasticsearchLibraryAnalysisConfigurer.java @@ -17,7 +17,7 @@ public class ElasticsearchLibraryAnalysisConfigurer implements ElasticsearchAnal @Override public void configure(ElasticsearchAnalysisConfigurationContext context) { context.normalizer( LibraryAnalyzers.NORMALIZER_SORT ).custom() - .tokenFilters( "asciifolding", "lowercase" ); + .tokenFilters( "lowercase", "asciifolding" ); context.normalizer( LibraryAnalyzers.NORMALIZER_ISBN ).custom() .charFilters( "removeHyphens" ); context.charFilter( "removeHyphens" ).type( "pattern_replace" ) diff --git a/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/LuceneLibraryAnalysisConfigurer.java b/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/LuceneLibraryAnalysisConfigurer.java index 7bf70a39eae..f697b6c7912 100644 --- a/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/LuceneLibraryAnalysisConfigurer.java +++ 
b/integrationtest/showcase/library/src/main/java/org/hibernate/search/integrationtest/showcase/library/analysis/LuceneLibraryAnalysisConfigurer.java @@ -22,8 +22,8 @@ public class LuceneLibraryAnalysisConfigurer implements LuceneAnalysisConfigurer public void configure(LuceneAnalysisConfigurationContext context) { context.analyzer( LibraryAnalyzers.ANALYZER_DEFAULT ).instance( new StandardAnalyzer() ); context.normalizer( LibraryAnalyzers.NORMALIZER_SORT ).custom() - .tokenFilter( ASCIIFoldingFilterFactory.class ) - .tokenFilter( LowerCaseFilterFactory.class ); + .tokenFilter( LowerCaseFilterFactory.class ) + .tokenFilter( ASCIIFoldingFilterFactory.class ); context.normalizer( LibraryAnalyzers.NORMALIZER_ISBN ).custom() .charFilter( PatternReplaceCharFilterFactory.class ) .param( "pattern", "-+" ) diff --git a/util/internal/integrationtest/common/src/main/java/org/hibernate/search/util/impl/integrationtest/common/rule/MappingSetupHelper.java b/util/internal/integrationtest/common/src/main/java/org/hibernate/search/util/impl/integrationtest/common/rule/MappingSetupHelper.java index 40a3898ea23..0b172797485 100644 --- a/util/internal/integrationtest/common/src/main/java/org/hibernate/search/util/impl/integrationtest/common/rule/MappingSetupHelper.java +++ b/util/internal/integrationtest/common/src/main/java/org/hibernate/search/util/impl/integrationtest/common/rule/MappingSetupHelper.java @@ -83,6 +83,15 @@ public final C withPropertyRadical(String keyRadical, Object value) { public abstract C withProperty(String keyRadical, Object value); + public final C withProperties(Map properties) { + properties.forEach( this::withProperty ); + return thisAsC(); + } + + public final C withProperties(String propertyFilePath) { + return withProperties( configurationProvider.getPropertiesFromFile( propertyFilePath ) ); + } + public final C withBackendProperty(String backendName, String keyRadical, Object value) { return withPropertyRadical( "backends." + backendName + "." + keyRadical, value ); }