Skip to content

Commit

Permalink
HSEARCH-2584 Test built-in analyzers with backends
Browse files Browse the repository at this point in the history
  • Loading branch information
fax4ever authored and yrodiere committed Oct 23, 2020
1 parent 7a4daf3 commit fe2a657
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 14 deletions.
Expand Up @@ -10,7 +10,6 @@

import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;

import org.hibernate.search.engine.backend.analysis.AnalyzerNames;
import org.hibernate.search.engine.backend.common.DocumentReference;
Expand Down Expand Up @@ -65,17 +64,29 @@ public void setup() {
/**
 * Checks the behavior of the field bound to {@link AnalyzerNames#DEFAULT}:
 * tokenization on space and hyphen, case-insensitive matching, and no stopword removal.
 */
@Test
public void analyzer_default() {
	SimpleFieldModel<String> field = index.binding().defaultAnalyzer;
	// Index the shared document set exactly once, like every other test in this class.
	// An additional inline copy of the very same documents used to be indexed here as well.
	initData( field );

	// Tokenize on space, hyphen
	assertMatchQuery( field, "words" )
			.hasDocRefHitsAnyOrder( index.typeName(), "3", "4", "5" );

	// Case-insensitive
	assertMatchQuery( field, "WORDS" )
			.hasDocRefHitsAnyOrder( index.typeName(), "3", "4", "5" );
	assertMatchQuery( field, "Words" )
			.hasDocRefHitsAnyOrder( index.typeName(), "3", "4", "5" );

	// No stopword removal
	assertMatchQuery( field, "a" )
			.hasDocRefHitsAnyOrder( index.typeName(), "7" );
	assertMatchQuery( field, "the" )
			.hasDocRefHitsAnyOrder( index.typeName(), "7" );
}

@Test
public void analyzer_standard() {
SimpleFieldModel<String> field = index.binding().standardAnalyzer;
initData( field );

// Tokenize on space, hyphen
assertMatchQuery( field, "words" )
Expand All @@ -94,6 +105,88 @@ public void analyzer_default() {
.hasDocRefHitsAnyOrder( index.typeName(), "7" );
}

/**
 * Checks the behavior of the field bound to {@link AnalyzerNames#SIMPLE}.
 */
@Test
public void analyzer_simple() {
	SimpleFieldModel<String> simpleField = index.binding().simpleAnalyzer;
	initData( simpleField );

	// Tokens are split on space, hyphen and dot (basically, any character which is not a letter),
	// and matching is case-insensitive: every casing of the term hits the same documents.
	for ( String term : Arrays.asList( "words", "WORDS", "Words" ) ) {
		assertMatchQuery( simpleField, term )
				.hasDocRefHitsAnyOrder( index.typeName(), "2", "3", "4", "5" );
	}

	// Stopwords are not removed: they can still be matched.
	for ( String stopword : Arrays.asList( "a", "the" ) ) {
		assertMatchQuery( simpleField, stopword )
				.hasDocRefHitsAnyOrder( index.typeName(), "7" );
	}
}

/**
 * Checks the behavior of the field bound to {@link AnalyzerNames#WHITESPACE}.
 */
@Test
public void analyzer_whitespace() {
	SimpleFieldModel<String> whitespaceField = index.binding().whitespaceAnalyzer;
	initData( whitespaceField );

	// Only spaces separate tokens, and case is preserved:
	// the lowercase term hits only the lowercase, space-separated document...
	SearchResultAssert<DocumentReference> lowercaseResult = assertMatchQuery( whitespaceField, "words" );
	lowercaseResult.hasDocRefHitsAnyOrder( index.typeName(), "4" );

	// ... the uppercase term hits only the uppercase document...
	SearchResultAssert<DocumentReference> uppercaseResult = assertMatchQuery( whitespaceField, "WORDS" );
	uppercaseResult.hasDocRefHitsAnyOrder( index.typeName(), "5" );

	// ... and a casing that was never indexed hits nothing.
	SearchResultAssert<DocumentReference> mixedCaseResult = assertMatchQuery( whitespaceField, "Words" );
	mixedCaseResult.hasNoHits();

	// Stopwords are not removed: they can still be matched.
	for ( String stopword : Arrays.asList( "a", "the" ) ) {
		assertMatchQuery( whitespaceField, stopword )
				.hasDocRefHitsAnyOrder( index.typeName(), "7" );
	}
}

/**
 * Checks the behavior of the field bound to {@link AnalyzerNames#STOP}.
 */
@Test
public void analyzer_stop() {
	SimpleFieldModel<String> stopField = index.binding().stopAnalyzer;
	initData( stopField );

	// Tokens are split on space, hyphen and dot (basically, any character which is not a letter),
	// and matching is case-insensitive: every casing of the term hits the same documents.
	for ( String term : Arrays.asList( "words", "WORDS", "Words" ) ) {
		assertMatchQuery( stopField, term )
				.hasDocRefHitsAnyOrder( index.typeName(), "2", "3", "4", "5" );
	}

	// Stopwords are removed: querying them yields nothing.
	for ( String stopword : Arrays.asList( "a", "the" ) ) {
		assertMatchQuery( stopField, stopword )
				.hasNoHits();
	}
}

/**
 * Checks the behavior of the field bound to {@link AnalyzerNames#KEYWORD}.
 */
@Test
public void analyzer_keyword() {
	SimpleFieldModel<String> keywordField = index.binding().keywordAnalyzer;
	initData( keywordField );

	// A fragment of the indexed text never matches, whatever its casing
	List<String> partialTexts = Arrays.asList( "words", "WORDS", "Words", "a", "the" );
	for ( String partialText : partialTexts ) {
		assertMatchQuery( keywordField, partialText ).hasNoHits();
	}

	// Only the complete, exact text of a document matches
	SearchResultAssert<DocumentReference> wholeTextResult = assertMatchQuery( keywordField, "two wôrds" );
	wholeTextResult.hasDocRefHitsAnyOrder( index.typeName(), "6" );
}

private SearchResultAssert<DocumentReference> assertMatchQuery(SimpleFieldModel<String> fieldModel, String valueToMatch) {
StubMappingScope scope = index.createScope();

Expand All @@ -104,20 +197,50 @@ private SearchResultAssert<DocumentReference> assertMatchQuery(SimpleFieldModel<
return assertThatQuery( query );
}

private void initData(SimpleFieldModel<String> fieldModel, Consumer<SingleFieldDocumentBuilder<String>> valueContributor) {
private void initData(SimpleFieldModel<String> fieldModel) {
index.bulkIndexer()
.add( fieldModel.reference, valueContributor )
.add( fieldModel.reference, AnalysisBuiltinIT::buildDocuments )
.join();
}

/**
 * Declares the document set shared by the tests: one empty document,
 * followed by documents "1" to "7", each holding a single text value.
 *
 * @param builder the builder on which the documents are declared
 */
private static void buildDocuments(SingleFieldDocumentBuilder<String> builder) {
	builder.emptyDocument( "empty" );

	// { document id, field value } pairs, declared in indexing order
	String[][] idsAndTexts = {
			{ "1", "twowords" },
			{ "2", "two.words" },
			{ "3", "two-words" },
			{ "4", "two words" },
			{ "5", "TWO WORDS" },
			{ "6", "two wôrds" },
			{ "7", "a stopword the stopword" }
	};
	for ( String[] idAndText : idsAndTexts ) {
		builder.document( idAndText[0], idAndText[1] );
	}
}

/**
 * Index binding declaring one string field per built-in analyzer name.
 */
private static class IndexBinding {
	final SimpleFieldModel<String> defaultAnalyzer;
	final SimpleFieldModel<String> standardAnalyzer;
	final SimpleFieldModel<String> simpleAnalyzer;
	final SimpleFieldModel<String> whitespaceAnalyzer;
	final SimpleFieldModel<String> stopAnalyzer;
	final SimpleFieldModel<String> keywordAnalyzer;

	IndexBinding(IndexSchemaElement root) {
		// Fields are mapped in the same order as they are declared above.
		this.defaultAnalyzer = analyzedField( root, AnalyzerNames.DEFAULT, "default" );
		this.standardAnalyzer = analyzedField( root, AnalyzerNames.STANDARD, "standard" );
		this.simpleAnalyzer = analyzedField( root, AnalyzerNames.SIMPLE, "simple" );
		this.whitespaceAnalyzer = analyzedField( root, AnalyzerNames.WHITESPACE, "whitespace" );
		this.stopAnalyzer = analyzedField( root, AnalyzerNames.STOP, "stop" );
		this.keywordAnalyzer = analyzedField( root, AnalyzerNames.KEYWORD, "keyword" );
	}

	/**
	 * Maps a string field on {@code root} whose type is overridden to use the given analyzer.
	 *
	 * @param root the schema element the field is added to
	 * @param analyzerName the name of the analyzer assigned to the field
	 * @param fieldName the name passed to the mapper for the new field
	 * @return the model of the mapped field
	 */
	private static SimpleFieldModel<String> analyzedField(IndexSchemaElement root, String analyzerName,
			String fieldName) {
		return SimpleFieldModel.mapperWithOverride( KeywordStringFieldTypeDescriptor.INSTANCE,
				f -> f.asString().analyzer( analyzerName ) )
				.map( root, fieldName );
	}
}

}
Expand Up @@ -11,6 +11,7 @@
import java.util.function.Consumer;
import java.util.function.Function;

import org.hibernate.search.engine.backend.analysis.AnalyzerNames;
import org.hibernate.search.engine.backend.document.model.dsl.IndexSchemaElement;
import org.hibernate.search.engine.backend.types.dsl.StandardIndexFieldTypeOptionsStep;
import org.hibernate.search.engine.backend.types.dsl.StringIndexFieldTypeOptionsStep;
Expand All @@ -33,6 +34,8 @@
* <p>
* Backend testing modules are expected to add the definitions
* listed in {@link AnalysisDefinitions}.
* <p>
* Built-in {@link AnalyzerNames} should never be used as names of {@link AnalysisDefinitions}.
*/
public class AnalysisCustomIT {

Expand Down

0 comments on commit fe2a657

Please sign in to comment.