From dfb242ab5874a4fe3d31d3a41965aaa1f04eda22 Mon Sep 17 00:00:00 2001 From: javanna Date: Sat, 9 May 2015 14:17:41 +0200 Subject: [PATCH] Highlighting: nuke XPostingsHighlighter Our own fork of the lucene PostingsHighlighter is not easy to maintain and doesn't give us any added value at this point. In particular, it was introduced to support the require_field_match option and discrete per value highlighting, used in case one wants to highlight the whole content of a field, but get back one snippet per value. These two features won't make it into lucene as they slow things down and shouldn't have been supported from day one on our end probably. One other customization we had was support for a wider range of queries via custom rewrite etc. (yet another way to slow things down), which got added to lucene and works much much better than what we used to do (instead of an OR rewrite, terms are pulled out of the automata for multi term queries). Removing our fork means the following in terms of features: - dropped support for require_field_match: the postings highlighter will only highlight fields that were queried - the output is different compared to other highlighters in case `number_of_fragments` is set to 0: one single snippet is returned in case a field has multiple values, rather than one highlighted snippet per value Closes #10625 --- docs/reference/migration/migrate_2_0.asciidoc | 10 + .../CustomPostingsHighlighter.java | 172 +- .../search/postingshighlight/Snippet.java | 2 +- .../XPostingsHighlighter.java | 772 -------- .../search/highlight/HighlightPhase.java | 1 - .../search/highlight/HighlightUtils.java | 3 - .../search/highlight/PostingsHighlighter.java | 173 +- .../CustomPostingsHighlighterTests.java | 346 +--- .../XPostingsHighlighterTests.java | 1691 ----------------- .../highlight/HighlighterSearchTests.java | 232 +-- 10 files changed, 172 insertions(+), 3230 deletions(-) delete mode 100644 
src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java delete mode 100644 src/test/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighterTests.java diff --git a/docs/reference/migration/migrate_2_0.asciidoc b/docs/reference/migration/migrate_2_0.asciidoc index 2bdd5fe90d392..9b8976bb33364 100644 --- a/docs/reference/migration/migrate_2_0.asciidoc +++ b/docs/reference/migration/migrate_2_0.asciidoc @@ -508,3 +508,13 @@ Settings settings = ImmutableSettings.settingsBuilder() .put("cluster.name", "myClusterName").build(); Client client = TransportClient.builder().settings(settings).build(); -------------------------------------------------- + +=== Highlighting + +The Postings highlighter doesn't support the `require_field_match` option +anymore; it will only highlight fields that were queried. + +The output returned from the Postings highlighter has changed compared to the +other highlighters when highlighting a field with multiple values in +combination with `number_of_fragments` set to `0`. One single highlighted snippet +will get returned rather than one snippet per value. 
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java index 936fe490a5d38..468d49cd2122d 100644 --- a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java +++ b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java @@ -18,11 +18,9 @@ package org.apache.lucene.search.postingshighlight; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.util.BytesRef; +import org.apache.lucene.search.Query; import org.elasticsearch.common.Strings; import org.elasticsearch.search.highlight.HighlightUtils; @@ -32,105 +30,77 @@ import java.util.Map; /** - * Subclass of the {@link XPostingsHighlighter} that works for a single field in a single document. - * It receives the field values as input and it performs discrete highlighting on each single value - * calling the highlightDoc method multiple times. - * It allows to pass in the query terms to avoid calling extract terms multiple times. - * - * The use that we make of the postings highlighter is not optimal. It would be much better to - * highlight multiple docs in a single call, as we actually lose its sequential IO. But that would require: - * 1) to make our fork more complex and harder to maintain to perform discrete highlighting (needed to return - * a different snippet per value when number_of_fragments=0 and the field has multiple values) - * 2) refactoring of the elasticsearch highlight api which currently works per hit + * Subclass of the {@link PostingsHighlighter} that works for a single field in a single document. + * Uses a custom {@link PassageFormatter}. 
Accepts field content as a constructor argument, given that loading + * is custom and can be done reading from _source field. Supports using different {@link BreakIterator} to break + * the text into fragments. Considers every distinct field value as a discrete passage for highlighting (unless + * the whole content needs to be highlighted). Supports both returning empty snippets and non highlighted snippets + * when no highlighting can be performed. * + * The use that we make of the postings highlighter is not optimal. It would be much better to highlight + * multiple docs in a single call, as we actually lose its sequential IO. That would require to + * refactor the elasticsearch highlight api which currently works per hit. */ -public final class CustomPostingsHighlighter extends XPostingsHighlighter { +public final class CustomPostingsHighlighter extends PostingsHighlighter { private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; private static final Passage[] EMPTY_PASSAGE = new Passage[0]; + private final Analyzer analyzer; private final CustomPassageFormatter passageFormatter; - private final int noMatchSize; - private final int totalContentLength; - private final String[] fieldValues; - private final int[] fieldValuesOffsets; - private int currentValueIndex = 0; - - private BreakIterator breakIterator; - - public CustomPostingsHighlighter(CustomPassageFormatter passageFormatter, List fieldValues, boolean mergeValues, int maxLength, int noMatchSize) { - super(maxLength); - this.passageFormatter = passageFormatter; - this.noMatchSize = noMatchSize; - - if (mergeValues) { - String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator(""))); - String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength)); - this.fieldValues = new String[]{fieldValue}; - this.fieldValuesOffsets = new int[]{0}; - this.totalContentLength = fieldValue.length(); - } else { - this.fieldValues = new 
String[fieldValues.size()]; - this.fieldValuesOffsets = new int[fieldValues.size()]; - int contentLength = 0; - int offset = 0; - int previousLength = -1; - for (int i = 0; i < fieldValues.size(); i++) { - String rawValue = fieldValues.get(i).toString(); - String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength)); - this.fieldValues[i] = fieldValue; - contentLength += fieldValue.length(); - offset += previousLength + 1; - this.fieldValuesOffsets[i] = offset; - previousLength = fieldValue.length(); - } - this.totalContentLength = contentLength; - } - } - - /* - Our own api to highlight a single document field, passing in the query terms, and get back our own Snippet object + private final BreakIterator breakIterator; + private final boolean returnNonHighlightedSnippets; + private final List fieldValues; + + /** + * Creates a new instance of {@link CustomPostingsHighlighter} + * + * @param analyzer the analyzer used for the field at index time, used for multi term queries internally + * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects + * @param fieldValues the original field values as constructor argument, loaded from the _source field or the relevant stored field. 
+ * @param maxLength the maximum length of the field content to be read for highlighting + * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when + * no highlighting can be performed */ - public Snippet[] highlightDoc(String field, BytesRef[] terms, IndexReader reader, int docId, int maxPassages) throws IOException { - IndexReaderContext readerContext = reader.getContext(); - List leaves = readerContext.leaves(); - - String[] contents = new String[]{loadCurrentFieldValue()}; - Map snippetsMap = highlightField(field, contents, getBreakIterator(field), terms, new int[]{docId}, leaves, maxPassages); - - //increment the current value index so that next time we'll highlight the next value if available - currentValueIndex++; - - Object snippetObject = snippetsMap.get(docId); - if (snippetObject != null && snippetObject instanceof Snippet[]) { - return (Snippet[]) snippetObject; - } - return EMPTY_SNIPPET; + public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, List fieldValues, int maxLength, boolean returnNonHighlightedSnippets) { + this(analyzer, passageFormatter, null, fieldValues, maxLength, returnNonHighlightedSnippets); } - /* - Method provided through our own fork: allows to do proper scoring when doing per value discrete highlighting. - Used to provide the total length of the field (all values) for proper scoring. + /** + * Creates a new instance of {@link CustomPostingsHighlighter} + * + * @param analyzer the analyzer used for the field at index time, used for multi term queries internally + * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects + * @param breakIterator an instance of {@link BreakIterator} selected depending on the highlighting options + * @param fieldValues the original field values as constructor argument, loaded from the _source field or the relevant stored field. 
+ * @param maxLength the maximum length of the field content to be read for highlighting + * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when + * no highlighting can be performed */ - @Override - protected int getContentLength(String field, int docId) { - return totalContentLength; + public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, BreakIterator breakIterator, List fieldValues, int maxLength, boolean returnNonHighlightedSnippets) { + super(maxLength); + this.analyzer = analyzer; + this.passageFormatter = passageFormatter; + this.breakIterator = breakIterator; + this.returnNonHighlightedSnippets = returnNonHighlightedSnippets; + this.fieldValues = fieldValues; } - /* - Method provided through our own fork: allows to perform proper per value discrete highlighting. - Used to provide the offset for the current value. + /** + * Highlights terms extracted from the provided query within the content of the provided field name */ - @Override - protected int getOffsetForCurrentValue(String field, int docId) { - if (currentValueIndex < fieldValuesOffsets.length) { - return fieldValuesOffsets[currentValueIndex]; + public Snippet[] highlightField(String field, Query query, IndexSearcher searcher, int docId, int maxPassages) throws IOException { + Map fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, searcher, new int[]{docId}, new int[]{maxPassages}); + Object[] snippetObjects = fieldsAsObjects.get(field); + if (snippetObjects != null) { + //one single document at a time + assert snippetObjects.length == 1; + Object snippetObject = snippetObjects[0]; + if (snippetObject != null && snippetObject instanceof Snippet[]) { + return (Snippet[]) snippetObject; + } } - throw new IllegalArgumentException("No more values offsets to return"); - } - - public void setBreakIterator(BreakIterator breakIterator) { - this.breakIterator = breakIterator; + 
return EMPTY_SNIPPET; } @Override @@ -158,29 +128,25 @@ protected char getMultiValuedSeparator(String field) { */ @Override protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - if (noMatchSize > 0) { + if (returnNonHighlightedSnippets) { //we want to return the first sentence of the first snippet only return super.getEmptyHighlight(fieldName, bi, 1); } return EMPTY_PASSAGE; } - /* - Not needed since we call our own loadCurrentFieldValue explicitly, but we override it anyway for consistency. - */ @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - return new String[][]{new String[]{loadCurrentFieldValue()}}; + protected Analyzer getIndexAnalyzer(String field) { + return analyzer; } - /* - Our own method that returns the field values, which relies on the content that was provided when creating the highlighter. - Supports per value discrete highlighting calling the highlightDoc method multiple times, one per value. 
- */ - protected String loadCurrentFieldValue() { - if (currentValueIndex < fieldValues.length) { - return fieldValues[currentValueIndex]; - } - throw new IllegalArgumentException("No more values to return"); + @Override + protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { + //we only highlight one field, one document at a time + assert fields.length == 1; + assert docids.length == 1; + String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator(fields[0]))); + String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength)); + return new String[][]{new String[]{fieldValue}}; } } diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java index bf6802000fac5..a756de6511536 100644 --- a/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java +++ b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java @@ -22,7 +22,7 @@ * Represents a scored highlighted snippet. * It's our own arbitrary object that we get back from the postings highlighter when highlighting a document. * Every snippet contains its formatted text and its score. - * The score is needed since we highlight every single value separately and we might want to return snippets sorted by score. + * The score is needed in case we want to sort snippets by score, they get sorted by position in the text by default. 
*/ public class Snippet { diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java deleted file mode 100644 index 6b725c48f4fe6..0000000000000 --- a/src/main/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighter.java +++ /dev/null @@ -1,772 +0,0 @@ -/* - * Licensed to Elasticsearch under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. Elasticsearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.apache.lucene.search.postingshighlight; - -import org.apache.lucene.index.*; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.InPlaceMergeSorter; -import org.apache.lucene.util.UnicodeUtil; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.text.BreakIterator; -import java.util.*; - -/* -FORKED from Lucene 4.5 to be able to: -1) support discrete highlighting for multiple values, so that we can return a different snippet per value when highlighting the whole text -2) call the highlightField method directly from subclasses and provide the terms by ourselves -3) Applied LUCENE-4906 to allow PassageFormatter to return arbitrary objects (LUCENE 4.6) - -All our changes start with //BEGIN EDIT - */ -public class XPostingsHighlighter { - - //BEGIN EDIT added method to override offset for current value (default 0) - //we need this to perform discrete highlighting per field - protected int getOffsetForCurrentValue(String field, int docId) { - return 0; - } - //END EDIT - - //BEGIN EDIT - //we need this to fix scoring when highlighting every single value separately, since the score depends on the total length of the field (all values rather than only the current one) - protected int getContentLength(String field, int docId) { - return -1; - } - //END EDIT - - - // TODO: maybe allow re-analysis for tiny fields? currently we require offsets, - // but if the analyzer is really fast and the field is tiny, this might really be - // unnecessary. 
- - /** for rewriting: we don't want slow processing from MTQs */ - private static final IndexSearcher EMPTY_INDEXSEARCHER; - static { - try { - IndexReader emptyReader = new MultiReader(); - EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader); - EMPTY_INDEXSEARCHER.setQueryCache(null); - } catch (IOException bogus) { - throw new RuntimeException(bogus); - } - } - - /** Default maximum content size to process. Typically snippets - * closer to the beginning of the document better summarize its content */ - public static final int DEFAULT_MAX_LENGTH = 10000; - - private final int maxLength; - - /** Set the first time {@link #getFormatter} is called, - * and then reused. */ - private PassageFormatter defaultFormatter; - - /** Set the first time {@link #getScorer} is called, - * and then reused. */ - private PassageScorer defaultScorer; - - /** - * Creates a new highlighter with default parameters. - */ - public XPostingsHighlighter() { - this(DEFAULT_MAX_LENGTH); - } - - /** - * Creates a new highlighter, specifying maximum content length. - * @param maxLength maximum content size to process. - * @throws IllegalArgumentException if maxLength is negative or Integer.MAX_VALUE - */ - public XPostingsHighlighter(int maxLength) { - if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { - // two reasons: no overflow problems in BreakIterator.preceding(offset+1), - // our sentinel in the offsets queue uses this value to terminate. - throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); - } - this.maxLength = maxLength; - } - - /** Returns the {@link java.text.BreakIterator} to use for - * dividing text into passages. This returns - * {@link java.text.BreakIterator#getSentenceInstance(java.util.Locale)} by default; - * subclasses can override to customize. 
*/ - protected BreakIterator getBreakIterator(String field) { - return BreakIterator.getSentenceInstance(Locale.ROOT); - } - - /** Returns the {@link PassageFormatter} to use for - * formatting passages into highlighted snippets. This - * returns a new {@code PassageFormatter} by default; - * subclasses can override to customize. */ - protected PassageFormatter getFormatter(String field) { - if (defaultFormatter == null) { - defaultFormatter = new DefaultPassageFormatter(); - } - return defaultFormatter; - } - - /** Returns the {@link PassageScorer} to use for - * ranking passages. This - * returns a new {@code PassageScorer} by default; - * subclasses can override to customize. */ - protected PassageScorer getScorer(String field) { - if (defaultScorer == null) { - defaultScorer = new PassageScorer(); - } - return defaultScorer; - } - - /** - * Highlights the top passages from a single field. - * - * @param field field name to highlight. - * Must have a stored string value and also be indexed with offsets. - * @param query query to highlight. - * @param searcher searcher that was previously used to execute the query. - * @param topDocs TopDocs containing the summary result documents to highlight. - * @return Array of formatted snippets corresponding to the documents in topDocs. - * If no highlights were found for a document, the - * first sentence for the field will be returned. - * @throws java.io.IOException if an I/O error occurred during processing - * @throws IllegalArgumentException if field was indexed without - * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} - */ - public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException { - return highlight(field, query, searcher, topDocs, 1); - } - - /** - * Highlights the top-N passages from a single field. - * - * @param field field name to highlight. 
- * Must have a stored string value and also be indexed with offsets. - * @param query query to highlight. - * @param searcher searcher that was previously used to execute the query. - * @param topDocs TopDocs containing the summary result documents to highlight. - * @param maxPassages The maximum number of top-N ranked passages used to - * form the highlighted snippets. - * @return Array of formatted snippets corresponding to the documents in topDocs. - * If no highlights were found for a document, the - * first {@code maxPassages} sentences from the - * field will be returned. - * @throws IOException if an I/O error occurred during processing - * @throws IllegalArgumentException if field was indexed without - * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} - */ - public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException { - Map res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages }); - return res.get(field); - } - - /** - * Highlights the top passages from multiple fields. - *

- * Conceptually, this behaves as a more efficient form of: - *

-     * Map m = new HashMap();
-     * for (String field : fields) {
-     *   m.put(field, highlight(field, query, searcher, topDocs));
-     * }
-     * return m;
-     * 
- * - * @param fields field names to highlight. - * Must have a stored string value and also be indexed with offsets. - * @param query query to highlight. - * @param searcher searcher that was previously used to execute the query. - * @param topDocs TopDocs containing the summary result documents to highlight. - * @return Map keyed on field name, containing the array of formatted snippets - * corresponding to the documents in topDocs. - * If no highlights were found for a document, the - * first sentence from the field will be returned. - * @throws IOException if an I/O error occurred during processing - * @throws IllegalArgumentException if field was indexed without - * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} - */ - public Map highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException { - int maxPassages[] = new int[fields.length]; - Arrays.fill(maxPassages, 1); - return highlightFields(fields, query, searcher, topDocs, maxPassages); - } - - /** - * Highlights the top-N passages from multiple fields. - *

- * Conceptually, this behaves as a more efficient form of: - *

-     * Map m = new HashMap();
-     * for (String field : fields) {
-     *   m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
-     * }
-     * return m;
-     * 
- * - * @param fields field names to highlight. - * Must have a stored string value and also be indexed with offsets. - * @param query query to highlight. - * @param searcher searcher that was previously used to execute the query. - * @param topDocs TopDocs containing the summary result documents to highlight. - * @param maxPassages The maximum number of top-N ranked passages per-field used to - * form the highlighted snippets. - * @return Map keyed on field name, containing the array of formatted snippets - * corresponding to the documents in topDocs. - * If no highlights were found for a document, the - * first {@code maxPassages} sentences from the - * field will be returned. - * @throws IOException if an I/O error occurred during processing - * @throws IllegalArgumentException if field was indexed without - * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} - */ - public Map highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException { - final ScoreDoc scoreDocs[] = topDocs.scoreDocs; - int docids[] = new int[scoreDocs.length]; - for (int i = 0; i < docids.length; i++) { - docids[i] = scoreDocs[i].doc; - } - - return highlightFields(fields, query, searcher, docids, maxPassages); - } - - /** - * Highlights the top-N passages from multiple fields, - * for the provided int[] docids. - * - * @param fieldsIn field names to highlight. - * Must have a stored string value and also be indexed with offsets. - * @param query query to highlight. - * @param searcher searcher that was previously used to execute the query. - * @param docidsIn containing the document IDs to highlight. - * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to - * form the highlighted snippets. - * @return Map keyed on field name, containing the array of formatted snippets - * corresponding to the documents in topDocs. 
- * If no highlights were found for a document, the - * first {@code maxPassages} from the field will - * be returned. - * @throws IOException if an I/O error occurred during processing - * @throws IllegalArgumentException if field was indexed without - * {@link org.apache.lucene.index.FieldInfo.IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} - */ - public Map highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { - Map snippets = new HashMap<>(); - for(Map.Entry ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) { - Object[] snippetObjects = ent.getValue(); - String[] snippetStrings = new String[snippetObjects.length]; - snippets.put(ent.getKey(), snippetStrings); - for(int i=0;i highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { - if (fieldsIn.length < 1) { - throw new IllegalArgumentException("fieldsIn must not be empty"); - } - if (fieldsIn.length != maxPassagesIn.length) { - throw new IllegalArgumentException("invalid number of maxPassagesIn"); - } - SortedSet queryTerms = new TreeSet<>(); - EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms); - - IndexReaderContext readerContext = searcher.getIndexReader().getContext(); - List leaves = readerContext.leaves(); - - // Make our own copies because we sort in-place: - int[] docids = new int[docidsIn.length]; - System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length); - final String fields[] = new String[fieldsIn.length]; - System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length); - final int maxPassages[] = new int[maxPassagesIn.length]; - System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length); - - // sort for sequential io - Arrays.sort(docids); - new InPlaceMergeSorter() { - - @Override - protected void swap(int i, int j) { - String tmp = fields[i]; - 
fields[i] = fields[j]; - fields[j] = tmp; - int tmp2 = maxPassages[i]; - maxPassages[i] = maxPassages[j]; - maxPassages[j] = tmp2; - } - - @Override - protected int compare(int i, int j) { - return fields[i].compareTo(fields[j]); - } - - }.sort(0, fields.length); - - // pull stored data: - String[][] contents = loadFieldValues(searcher, fields, docids, maxLength); - - Map highlights = new HashMap<>(); - for (int i = 0; i < fields.length; i++) { - String field = fields[i]; - int numPassages = maxPassages[i]; - - Term floor = new Term(field, ""); - Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); - SortedSet fieldTerms = queryTerms.subSet(floor, ceiling); - // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords) - - // Strip off the redundant field: - BytesRef terms[] = new BytesRef[fieldTerms.size()]; - int termUpto = 0; - for(Term term : fieldTerms) { - terms[termUpto++] = term.bytes(); - } - Map fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages); - - Object[] result = new Object[docids.length]; - for (int j = 0; j < docidsIn.length; j++) { - result[j] = fieldHighlights.get(docidsIn[j]); - } - highlights.put(field, result); - } - return highlights; - } - - /** Loads the String values for each field X docID to be - * highlighted. By default this loads from stored - * fields, but a subclass can change the source. This - * method should allocate the String[fields.length][docids.length] - * and fill all values. The returned Strings must be - * identical to what was indexed. 
*/ - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - String contents[][] = new String[fields.length][docids.length]; - char valueSeparators[] = new char[fields.length]; - for (int i = 0; i < fields.length; i++) { - valueSeparators[i] = getMultiValuedSeparator(fields[i]); - } - LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); - for (int i = 0; i < docids.length; i++) { - searcher.doc(docids[i], visitor); - for (int j = 0; j < fields.length; j++) { - contents[j][i] = visitor.getValue(j); - } - visitor.reset(); - } - return contents; - } - - /** - * Returns the logical separator between values for multi-valued fields. - * The default value is a space character, which means passages can span across values, - * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)} - * if each value holds a discrete passage for highlighting. - */ - protected char getMultiValuedSeparator(String field) { - return ' '; - } - - //BEGIN EDIT: made protected so that we can call from our subclass and pass in the terms by ourselves - protected Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { - //private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { - //END EDIT - - Map highlights = new HashMap<>(); - - // reuse in the real sense... 
for docs in same segment we just advance our old enum - PostingsEnum postings[] = null; - TermsEnum termsEnum = null; - int lastLeaf = -1; - - PassageFormatter fieldFormatter = getFormatter(field); - if (fieldFormatter == null) { - throw new NullPointerException("PassageFormatter cannot be null"); - } - - for (int i = 0; i < docids.length; i++) { - String content = contents[i]; - if (content.length() == 0) { - continue; // nothing to do - } - bi.setText(content); - int doc = docids[i]; - int leaf = ReaderUtil.subIndex(doc, leaves); - LeafReaderContext subContext = leaves.get(leaf); - LeafReader r = subContext.reader(); - Terms t = r.terms(field); - if (t == null) { - continue; // nothing to do - } - if (!t.hasOffsets()) { - // no offsets available - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); - } - if (leaf != lastLeaf) { - termsEnum = t.iterator(); - postings = new PostingsEnum[terms.length]; - } - Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); - if (passages.length == 0) { - passages = getEmptyHighlight(field, bi, maxPassages); - } - if (passages.length > 0) { - // otherwise a null snippet (eg if field is missing - // entirely from the doc) - highlights.put(doc, fieldFormatter.format(passages, content)); - } - lastLeaf = leaf; - } - - return highlights; - } - - // algorithm: treat sentence snippets as miniature documents - // we can intersect these with the postings lists via BreakIterator.preceding(offset),s - // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) - private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc, - TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException { - - //BEGIN EDIT added call to method that returns the offset for the current value (discrete highlighting) - int valueOffset = 
getOffsetForCurrentValue(field, doc); - //END EDIT - - PassageScorer scorer = getScorer(field); - if (scorer == null) { - throw new NullPointerException("PassageScorer cannot be null"); - } - - - //BEGIN EDIT discrete highlighting - // the scoring needs to be based on the length of the whole field (all values rather than only the current one) - int totalContentLength = getContentLength(field, doc); - if (totalContentLength == -1) { - totalContentLength = contentLength; - } - //END EDIT - - - PriorityQueue pq = new PriorityQueue<>(); - float weights[] = new float[terms.length]; - // initialize postings - for (int i = 0; i < terms.length; i++) { - PostingsEnum de = postings[i]; - int pDoc; - if (de == EMPTY) { - continue; - } else if (de == null) { - postings[i] = EMPTY; // initially - if (!termsEnum.seekExact(terms[i])) { - continue; // term not found - } - de = postings[i] = termsEnum.postings(null, null, PostingsEnum.OFFSETS); - assert de != null; - pDoc = de.advance(doc); - } else { - pDoc = de.docID(); - if (pDoc < doc) { - pDoc = de.advance(doc); - } - } - - if (doc == pDoc) { - //BEGIN EDIT we take into account the length of the whole field (all values) to properly score the snippets - weights[i] = scorer.weight(totalContentLength, de.freq()); - //weights[i] = scorer.weight(contentLength, de.freq()); - //END EDIT - de.nextPosition(); - pq.add(new OffsetsEnum(de, i)); - } - } - - pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination - - PriorityQueue passageQueue = new PriorityQueue<>(n, new Comparator() { - @Override - public int compare(Passage left, Passage right) { - if (left.score < right.score) { - return -1; - } else if (left.score > right.score) { - return 1; - } else { - return left.startOffset - right.startOffset; - } - } - }); - Passage current = new Passage(); - - OffsetsEnum off; - while ((off = pq.poll()) != null) { - final PostingsEnum dp = off.dp; - - int start = dp.startOffset(); - assert start >= 0; - int end = 
dp.endOffset(); - // LUCENE-5166: this hit would span the content limit... however more valid - // hits may exist (they are sorted by start). so we pretend like we never - // saw this term, it won't cause a passage to be added to passageQueue or anything. - assert EMPTY.startOffset() == Integer.MAX_VALUE; - if (start < contentLength && end > contentLength) { - continue; - } - - - //BEGIN EDIT support for discrete highlighting (added block code) - //switch to the first match in the current value if there is one - boolean seenEnough = false; - while (start < valueOffset) { - if (off.pos == dp.freq()) { - seenEnough = true; - break; - } else { - off.pos++; - dp.nextPosition(); - start = dp.startOffset(); - end = dp.endOffset(); - } - } - - //continue with next term if we've already seen the current one all the times it appears - //that means that the current value doesn't hold matches for the current term - if (seenEnough) { - continue; - } - - //we now subtract the offset of the current value to both start and end - start -= valueOffset; - end -= valueOffset; - //END EDIT - - - if (start >= current.endOffset) { - if (current.startOffset >= 0) { - // finalize current - //BEGIN EDIT we take into account the value offset when scoring the snippet based on its position - current.score *= scorer.norm(current.startOffset + valueOffset); - //current.score *= scorer.norm(current.startOffset); - //END EDIT - // new sentence: first add 'current' to queue - if (passageQueue.size() == n && current.score < passageQueue.peek().score) { - current.reset(); // can't compete, just reset it - } else { - passageQueue.offer(current); - if (passageQueue.size() > n) { - current = passageQueue.poll(); - current.reset(); - } else { - current = new Passage(); - } - } - } - // if we exceed limit, we are done - if (start >= contentLength) { - Passage passages[] = new Passage[passageQueue.size()]; - passageQueue.toArray(passages); - for (Passage p : passages) { - p.sort(); - } - // sort in 
ascending order - Arrays.sort(passages, new Comparator() { - @Override - public int compare(Passage left, Passage right) { - return left.startOffset - right.startOffset; - } - }); - return passages; - } - // advance breakiterator - assert BreakIterator.DONE < 0; - current.startOffset = Math.max(bi.preceding(start+1), 0); - current.endOffset = Math.min(bi.next(), contentLength); - } - int tf = 0; - while (true) { - tf++; - current.addMatch(start, end, terms[off.id]); - if (off.pos == dp.freq()) { - break; // removed from pq - } else { - off.pos++; - dp.nextPosition(); - //BEGIN EDIT support for discrete highlighting - start = dp.startOffset() - valueOffset; - end = dp.endOffset() - valueOffset; - //start = dp.startOffset(); - //end = dp.endOffset(); - //END EDIT - } - if (start >= current.endOffset || end > contentLength) { - pq.offer(off); - break; - } - } - current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); - } - - // Dead code but compiler disagrees: - assert false; - return null; - } - - /** Called to summarize a document when no hits were - * found. By default this just returns the first - * {@code maxPassages} sentences; subclasses can override - * to customize. 
*/ - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - // BreakIterator should be un-next'd: - List passages = new ArrayList<>(); - int pos = bi.current(); - assert pos == 0; - while (passages.size() < maxPassages) { - int next = bi.next(); - if (next == BreakIterator.DONE) { - break; - } - Passage passage = new Passage(); - passage.score = Float.NaN; - passage.startOffset = pos; - passage.endOffset = next; - passages.add(passage); - pos = next; - } - - return passages.toArray(new Passage[passages.size()]); - } - - private static class OffsetsEnum implements Comparable { - PostingsEnum dp; - int pos; - int id; - - OffsetsEnum(PostingsEnum dp, int id) throws IOException { - this.dp = dp; - this.id = id; - this.pos = 1; - } - - @Override - public int compareTo(OffsetsEnum other) { - try { - int off = dp.startOffset(); - int otherOff = other.dp.startOffset(); - if (off == otherOff) { - return id - other.id; - } else { - return Long.signum(((long)off) - otherOff); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - } - - private static final PostingsEnum EMPTY = new PostingsEnum() { - - @Override - public int nextPosition() throws IOException { return 0; } - - @Override - public int startOffset() throws IOException { return Integer.MAX_VALUE; } - - @Override - public int endOffset() throws IOException { return Integer.MAX_VALUE; } - - @Override - public BytesRef getPayload() throws IOException { return null; } - - @Override - public int freq() throws IOException { return 0; } - - @Override - public int docID() { return NO_MORE_DOCS; } - - @Override - public int nextDoc() throws IOException { return NO_MORE_DOCS; } - - @Override - public int advance(int target) throws IOException { return NO_MORE_DOCS; } - - @Override - public long cost() { return 0; } - }; - - private static class LimitedStoredFieldVisitor extends StoredFieldVisitor { - private final String fields[]; - private final char 
valueSeparators[]; - private final int maxLength; - private final StringBuilder builders[]; - private int currentField = -1; - - public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) { - assert fields.length == valueSeparators.length; - this.fields = fields; - this.valueSeparators = valueSeparators; - this.maxLength = maxLength; - builders = new StringBuilder[fields.length]; - for (int i = 0; i < builders.length; i++) { - builders[i] = new StringBuilder(); - } - } - - @Override - public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException { - String value = new String(bytes, StandardCharsets.UTF_8); - assert currentField >= 0; - StringBuilder builder = builders[currentField]; - if (builder.length() > 0 && builder.length() < maxLength) { - builder.append(valueSeparators[currentField]); - } - if (builder.length() + value.length() > maxLength) { - builder.append(value, 0, maxLength - builder.length()); - } else { - builder.append(value); - } - } - - @Override - public Status needsField(FieldInfo fieldInfo) throws IOException { - currentField = Arrays.binarySearch(fields, fieldInfo.name); - if (currentField < 0) { - return Status.NO; - } else if (builders[currentField].length() > maxLength) { - return fields.length == 1 ? 
Status.STOP : Status.NO; - } - return Status.YES; - } - - String getValue(int i) { - return builders[i].toString(); - } - - void reset() { - currentField = -1; - for (int i = 0; i < fields.length; i++) { - builders[i].setLength(0); - } - } - } -} diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java index cd3c12591f706..d7d303138d86b 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightPhase.java @@ -22,7 +22,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.lucene.index.IndexOptions; -import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.regex.Regex; diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightUtils.java b/src/main/java/org/elasticsearch/search/highlight/HighlightUtils.java index 0012ad20d52a7..351c1dead14bd 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightUtils.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightUtils.java @@ -20,8 +20,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; - -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; @@ -29,7 +27,6 @@ import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.fetch.FetchSubPhase; import org.elasticsearch.search.internal.SearchContext; -import org.elasticsearch.search.lookup.SearchLookup; import org.elasticsearch.search.lookup.SourceLookup; import java.io.IOException; diff --git 
a/src/main/java/org/elasticsearch/search/highlight/PostingsHighlighter.java b/src/main/java/org/elasticsearch/search/highlight/PostingsHighlighter.java index f30a0545d9522..9524ffb20921c 100644 --- a/src/main/java/org/elasticsearch/search/highlight/PostingsHighlighter.java +++ b/src/main/java/org/elasticsearch/search/highlight/PostingsHighlighter.java @@ -18,31 +18,17 @@ */ package org.elasticsearch.search.highlight; -import com.google.common.collect.Lists; import com.google.common.collect.Maps; - +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.ConstantScoreQuery; -import org.apache.lucene.search.FilteredQuery; -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoringRewrite; -import org.apache.lucene.search.TopTermsRewrite; -import org.apache.lucene.search.Weight; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.postingshighlight.CustomPassageFormatter; import org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter; import org.apache.lucene.search.postingshighlight.Snippet; import org.apache.lucene.search.postingshighlight.WholeBreakIterator; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; -import org.apache.lucene.util.UnicodeUtil; import org.elasticsearch.common.Strings; -import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.text.StringText; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.fetch.FetchPhaseExecutionException; @@ -51,13 +37,7 @@ import java.io.IOException; import java.text.BreakIterator; -import java.util.ArrayList; -import 
java.util.Comparator; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.SortedSet; -import java.util.TreeSet; +import java.util.*; public class PostingsHighlighter implements Highlighter { @@ -81,15 +61,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) { FetchSubPhase.HitContext hitContext = highlighterContext.hitContext; if (!hitContext.cache().containsKey(CACHE_KEY)) { - //get the non rewritten query and rewrite it - Query query; - try { - query = rewrite(highlighterContext, hitContext.topLevelReader()); - SortedSet queryTerms = extractTerms(context.searcher().createNormalizedWeight(query, false)); - hitContext.cache().put(CACHE_KEY, new HighlighterEntry(queryTerms)); - } catch (IOException e) { - throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); - } + hitContext.cache().put(CACHE_KEY, new HighlighterEntry()); } HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY); @@ -98,37 +70,28 @@ public HighlightField highlight(HighlighterContext highlighterContext) { if (mapperHighlighterEntry == null) { Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder); - BytesRef[] filteredQueryTerms = filterTerms(highlighterEntry.queryTerms, fieldMapper.names().indexName(), field.fieldOptions().requireFieldMatch()); - mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter, filteredQueryTerms); + mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter); } - //we merge back multiple values into a single value using the paragraph separator, unless we have to highlight every single value separately (number_of_fragments=0). 
- boolean mergeValues = field.fieldOptions().numberOfFragments() != 0; List snippets = new ArrayList<>(); int numberOfFragments; - try { - //we manually load the field values (from source if needed) - List textsToHighlight = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext); - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(mapperHighlighterEntry.passageFormatter, textsToHighlight, mergeValues, Integer.MAX_VALUE-1, field.fieldOptions().noMatchSize()); - - if (field.fieldOptions().numberOfFragments() == 0) { - highlighter.setBreakIterator(new WholeBreakIterator()); - numberOfFragments = 1; //1 per value since we highlight per value + Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers().indexAnalyzer(); + List fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext); + CustomPostingsHighlighter highlighter; + if (field.fieldOptions().numberOfFragments() == 0) { + highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, new WholeBreakIterator(), fieldValues, Integer.MAX_VALUE-1, field.fieldOptions().noMatchSize() > 0); + numberOfFragments = 1; //we are highlighting the whole content, will get back a single fragment for it } else { + highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, fieldValues, Integer.MAX_VALUE-1, field.fieldOptions().noMatchSize() > 0); numberOfFragments = field.fieldOptions().numberOfFragments(); } - //we highlight every value separately calling the highlight method multiple times, only if we need to have back a snippet per value (whole value) - int values = mergeValues ? 
1 : textsToHighlight.size(); - for (int i = 0; i < values; i++) { - Snippet[] fieldSnippets = highlighter.highlightDoc(fieldMapper.names().indexName(), mapperHighlighterEntry.filteredQueryTerms, hitContext.reader(), hitContext.docId(), numberOfFragments); - if (fieldSnippets != null) { - for (Snippet fieldSnippet : fieldSnippets) { - if (Strings.hasText(fieldSnippet.getText())) { - snippets.add(fieldSnippet); - } - } + IndexSearcher searcher = new IndexSearcher(hitContext.reader()); + Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.names().indexName(), highlighterContext.query.originalQuery(), searcher, hitContext.docId(), numberOfFragments); + for (Snippet fieldSnippet : fieldSnippets) { + if (Strings.hasText(fieldSnippet.getText())) { + snippets.add(fieldSnippet); } } @@ -160,97 +123,10 @@ public int compare(Snippet o1, Snippet o2) { return null; } - private static Query rewrite(HighlighterContext highlighterContext, IndexReader reader) throws IOException { - - Query original = highlighterContext.query.originalQuery(); - - //we walk the query tree and when we encounter multi term queries we need to make sure the rewrite method - //supports multi term extraction. If not we temporarily override it (and restore it after the rewrite). 
- List> modifiedMultiTermQueries = Lists.newArrayList(); - overrideMultiTermRewriteMethod(original, modifiedMultiTermQueries); - - //rewrite is expensive: if the query was already rewritten we try not to rewrite it again - if (highlighterContext.query.queryRewritten() && modifiedMultiTermQueries.size() == 0) { - //return the already rewritten query - return highlighterContext.query.query(); - } - - Query query = original; - for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query; - rewrittenQuery = query.rewrite(reader)) { - query = rewrittenQuery; - } - - //set back the original rewrite method after the rewrite is done - for (Tuple modifiedMultiTermQuery : modifiedMultiTermQueries) { - modifiedMultiTermQuery.v1().setRewriteMethod(modifiedMultiTermQuery.v2()); - } - - return query; - } - - private static void overrideMultiTermRewriteMethod(Query query, List> modifiedMultiTermQueries) { - - if (query instanceof MultiTermQuery) { - MultiTermQuery originalMultiTermQuery = (MultiTermQuery) query; - if (!allowsForTermExtraction(originalMultiTermQuery.getRewriteMethod())) { - MultiTermQuery.RewriteMethod originalRewriteMethod = originalMultiTermQuery.getRewriteMethod(); - originalMultiTermQuery.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50)); - //we need to rewrite anyway if it is a multi term query which was rewritten with the wrong rewrite method - modifiedMultiTermQueries.add(Tuple.tuple(originalMultiTermQuery, originalRewriteMethod)); - } - } - - if (query instanceof BooleanQuery) { - BooleanQuery booleanQuery = (BooleanQuery) query; - for (BooleanClause booleanClause : booleanQuery) { - overrideMultiTermRewriteMethod(booleanClause.getQuery(), modifiedMultiTermQueries); - } - } - - if (query instanceof FilteredQuery) { - overrideMultiTermRewriteMethod(((FilteredQuery) query).getQuery(), modifiedMultiTermQueries); - } - - if (query instanceof ConstantScoreQuery) { - overrideMultiTermRewriteMethod(((ConstantScoreQuery) 
query).getQuery(), modifiedMultiTermQueries); - } - } - - private static boolean allowsForTermExtraction(MultiTermQuery.RewriteMethod rewriteMethod) { - return rewriteMethod instanceof TopTermsRewrite || rewriteMethod instanceof ScoringRewrite; - } - - private static SortedSet extractTerms(Weight weight) { - SortedSet queryTerms = new TreeSet<>(); - weight.extractTerms(queryTerms); - return queryTerms; - } - - private static BytesRef[] filterTerms(SortedSet queryTerms, String field, boolean requireFieldMatch) { - SortedSet fieldTerms; - if (requireFieldMatch) { - Term floor = new Term(field, ""); - Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); - fieldTerms = queryTerms.subSet(floor, ceiling); - } else { - fieldTerms = queryTerms; - } - - BytesRef terms[] = new BytesRef[fieldTerms.size()]; - int termUpto = 0; - for(Term term : fieldTerms) { - terms[termUpto++] = term.bytes(); - } - - return terms; - } - private static List filterSnippets(List snippets, int numberOfFragments) { //We need to filter the snippets as due to no_match_size we could have - //either highlighted snippets together non highlighted ones - //We don't want to mix those up + //either highlighted snippets or non highlighted ones and we don't want to mix those up List filteredSnippets = new ArrayList<>(snippets.size()); for (Snippet snippet : snippets) { if (snippet.isHighlighted()) { @@ -263,8 +139,8 @@ private static List filterSnippets(List snippets, int numberOf if (filteredSnippets.size() == 0) { if (snippets.size() > 0) { Snippet snippet = snippets.get(0); - //if we did discrete per value highlighting using whole break iterator (as number_of_fragments was 0) - //we need to obtain the first sentence of the first value + //if we tried highlighting the whole content using whole break iterator (as number_of_fragments was 0) + //we need to return the first sentence of the content rather than the whole content if (numberOfFragments == 0) { BreakIterator bi = 
BreakIterator.getSentenceInstance(Locale.ROOT); String text = snippet.getText(); @@ -283,21 +159,14 @@ private static List filterSnippets(List snippets, int numberOf } private static class HighlighterEntry { - final SortedSet queryTerms; Map, MapperHighlighterEntry> mappers = Maps.newHashMap(); - - private HighlighterEntry(SortedSet queryTerms) { - this.queryTerms = queryTerms; - } } private static class MapperHighlighterEntry { final CustomPassageFormatter passageFormatter; - final BytesRef[] filteredQueryTerms; - private MapperHighlighterEntry(CustomPassageFormatter passageFormatter, BytesRef[] filteredQueryTerms) { + private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) { this.passageFormatter = passageFormatter; - this.filteredQueryTerms = filteredQueryTerms; } } } diff --git a/src/test/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighterTests.java b/src/test/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighterTests.java index 43165fa4b1c79..e07a9ce9f6535 100644 --- a/src/test/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighterTests.java +++ b/src/test/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighterTests.java @@ -27,221 +27,18 @@ import org.apache.lucene.search.*; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnicodeUtil; -import org.elasticsearch.search.highlight.HighlightUtils; import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; -import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import static org.hamcrest.CoreMatchers.equalTo; -import static org.hamcrest.CoreMatchers.notNullValue; public class CustomPostingsHighlighterTests extends ElasticsearchTestCase { @Test - public void testDiscreteHighlightingPerValue() throws Exception { - Directory dir = 
newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - final String firstValue = "This is a test. Just a test highlighting from postings highlighter."; - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - final String secondValue = "This is the second value to perform highlighting on."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - final String thirdValue = "This is the third value to test highlighting with postings."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - List fieldValues = new ArrayList<>(); - fieldValues.add(firstValue); - fieldValues.add(secondValue); - fieldValues.add(thirdValue); - - - IndexSearcher searcher = newSearcher(ir); - - Query query = new TermQuery(new Term("body", "highlighting")); - BytesRef[] queryTerms = filterTerms(extractTerms(searcher, query), "body", true); - - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - - //highlighting per value, considering whole values (simulating number_of_fragments=0) - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, false, Integer.MAX_VALUE - 1, 0); - highlighter.setBreakIterator(new WholeBreakIterator()); - - Snippet[] snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); 
- assertThat(snippets[0].getText(), equalTo("This is a test. Just a test highlighting from postings highlighter.")); - - snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is the second value to perform highlighting on.")); - - snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is the third value to test highlighting with postings.")); - - - //let's try without whole break iterator as well, to prove that highlighting works the same when working per value (not optimized though) - highlighter = new CustomPostingsHighlighter(new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, false, Integer.MAX_VALUE - 1, 0); - - snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("Just a test highlighting from postings highlighter.")); - - snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is the second value to perform highlighting on.")); - - snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is the third value to test highlighting with postings.")); - - ir.close(); - dir.close(); - } - - /* - Tests that scoring works properly even when using discrete per value highlighting - */ - @Test - public void testDiscreteHighlightingScoring() throws Exception { - - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new 
FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - - //good position but only one match - final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter."; - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - //two matches, not the best snippet due to its length though - final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - //two matches and short, will be scored highest - final String thirdValue = "This is highlighting the third short highlighting value."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - //one match, same as first but at the end, will be scored lower due to its position - final String fourthValue = "Just a test4 highlighting from postings highlighter."; - Field body4 = new Field("body", "", offsetsType); - doc.add(body4); - body4.setStringValue(fourthValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - - String firstHlValue = "Just a test1 highlighting from postings highlighter."; - String secondHlValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower."; - String thirdHlValue = "This is highlighting the third short highlighting value."; - String fourthHlValue = "Just a test4 highlighting from postings highlighter."; - - - IndexSearcher searcher = newSearcher(ir); - Query query = new TermQuery(new Term("body", "highlighting")); - BytesRef[] queryTerms = filterTerms(extractTerms(searcher, query), "body", true); - - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - 
int docId = topDocs.scoreDocs[0].doc; - - List fieldValues = new ArrayList<>(); - fieldValues.add(firstValue); - fieldValues.add(secondValue); - fieldValues.add(thirdValue); - fieldValues.add(fourthValue); - - boolean mergeValues = true; - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, mergeValues, Integer.MAX_VALUE-1, 0); - Snippet[] snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); - - assertThat(snippets.length, equalTo(4)); - - assertThat(snippets[0].getText(), equalTo(firstHlValue)); - assertThat(snippets[1].getText(), equalTo(secondHlValue)); - assertThat(snippets[2].getText(), equalTo(thirdHlValue)); - assertThat(snippets[3].getText(), equalTo(fourthHlValue)); - - - //Let's highlight each separate value and check how the snippets are scored - mergeValues = false; - highlighter = new CustomPostingsHighlighter(new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, mergeValues, Integer.MAX_VALUE-1, 0); - List snippets2 = new ArrayList<>(); - for (int i = 0; i < fieldValues.size(); i++) { - snippets2.addAll(Arrays.asList(highlighter.highlightDoc("body", queryTerms, ir, docId, 5))); - } - - assertThat(snippets2.size(), equalTo(4)); - assertThat(snippets2.get(0).getText(), equalTo(firstHlValue)); - assertThat(snippets2.get(1).getText(), equalTo(secondHlValue)); - assertThat(snippets2.get(2).getText(), equalTo(thirdHlValue)); - assertThat(snippets2.get(3).getText(), equalTo(fourthHlValue)); - - Comparator comparator = new Comparator() { - @Override - public int compare(Snippet o1, Snippet o2) { - return (int)Math.signum(o1.getScore() - o2.getScore()); - } - }; - - //sorting both groups of snippets - Arrays.sort(snippets, comparator); - Collections.sort(snippets2, comparator); - - //checking that the snippets are in the same order, regardless of whether we used per value discrete highlighting or not - //we can't compare the scores 
directly since they are slightly different due to the multiValued separator added when merging values together - //That causes slightly different lengths and start offsets, thus a slightly different score. - //Anyways, that's not an issue. What's important is that the score is computed the same way, so that the produced order is always the same. - for (int i = 0; i < snippets.length; i++) { - assertThat(snippets[i].getText(), equalTo(snippets2.get(i).getText())); - } - - ir.close(); - dir.close(); - } - - /* - Tests that we produce the same snippets and scores when manually merging values in our own custom highlighter rather than using the built-in code - */ - @Test - public void testMergeValuesScoring() throws Exception { + public void testCustomPostingsHighlighter() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); @@ -281,16 +78,13 @@ public void testMergeValuesScoring() throws Exception { IndexReader ir = iw.getReader(); iw.close(); - String firstHlValue = "Just a test1 highlighting from postings highlighter."; String secondHlValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower."; String thirdHlValue = "This is highlighting the third short highlighting value."; String fourthHlValue = "Just a test4 highlighting from postings highlighter."; - IndexSearcher searcher = newSearcher(ir); Query query = new TermQuery(new Term("body", "highlighting")); - BytesRef[] queryTerms = filterTerms(extractTerms(searcher, query), "body", true); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1)); @@ -303,9 +97,8 @@ public void testMergeValuesScoring() throws Exception { fieldValues.add(thirdValue); fieldValues.add(fourthValue); - boolean mergeValues = true; - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, 
mergeValues, Integer.MAX_VALUE-1, 0); - Snippet[] snippets = highlighter.highlightDoc("body", queryTerms, ir, docId, 5); + CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("", "", new DefaultEncoder()), fieldValues, Integer.MAX_VALUE-1, false); + Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5); assertThat(snippets.length, equalTo(4)); @@ -314,96 +107,6 @@ public void testMergeValuesScoring() throws Exception { assertThat(snippets[2].getText(), equalTo(thirdHlValue)); assertThat(snippets[3].getText(), equalTo(fourthHlValue)); - - //testing now our fork / normal postings highlighter, which merges multiple values together using the paragraph separator - XPostingsHighlighter highlighter2 = new XPostingsHighlighter(Integer.MAX_VALUE - 1) { - @Override - protected char getMultiValuedSeparator(String field) { - return HighlightUtils.PARAGRAPH_SEPARATOR; - } - - @Override - protected PassageFormatter getFormatter(String field) { - return new CustomPassageFormatter("", "", new DefaultEncoder()); - } - }; - - Map highlightMap = highlighter2.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, new int[]{docId}, new int[]{5}); - Object[] objects = highlightMap.get("body"); - assertThat(objects, notNullValue()); - assertThat(objects.length, equalTo(1)); - Snippet[] normalSnippets = (Snippet[])objects[0]; - - assertThat(normalSnippets.length, equalTo(4)); - - assertThat(normalSnippets[0].getText(), equalTo(firstHlValue)); - assertThat(normalSnippets[1].getText(), equalTo(secondHlValue)); - assertThat(normalSnippets[2].getText(), equalTo(thirdHlValue)); - assertThat(normalSnippets[3].getText(), equalTo(fourthHlValue)); - - - for (int i = 0; i < normalSnippets.length; i++) { - Snippet normalSnippet = snippets[0]; - Snippet customSnippet = normalSnippets[0]; - assertThat(customSnippet.getText(), equalTo(normalSnippet.getText())); - assertThat(customSnippet.getScore(), 
equalTo(normalSnippet.getScore())); - } - - ir.close(); - dir.close(); - } - - @Test - public void testRequireFieldMatch() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Field none = new Field("none", "", offsetsType); - Document doc = new Document(); - doc.add(body); - doc.add(none); - - String firstValue = "This is a test. Just a test highlighting from postings. Feel free to ignore."; - body.setStringValue(firstValue); - none.setStringValue(firstValue); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - Query query = new TermQuery(new Term("none", "highlighting")); - IndexSearcher searcher = newSearcher(ir); - SortedSet queryTerms = extractTerms(searcher, query); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - - List values = new ArrayList<>(); - values.add(firstValue); - - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(passageFormatter, values, true, Integer.MAX_VALUE - 1, 0); - - //no snippets with simulated require field match (we filter the terms ourselves) - boolean requireFieldMatch = true; - BytesRef[] filteredQueryTerms = filterTerms(queryTerms, "body", requireFieldMatch); - Snippet[] snippets = highlighter.highlightDoc("body", filteredQueryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(0)); - - - highlighter = new CustomPostingsHighlighter(passageFormatter, values, true, Integer.MAX_VALUE - 1, 0); - 
//one snippet without require field match, just passing in the query terms with no filtering on our side - requireFieldMatch = false; - filteredQueryTerms = filterTerms(queryTerms, "body", requireFieldMatch); - snippets = highlighter.highlightDoc("body", filteredQueryTerms, ir, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("Just a test highlighting from postings.")); - ir.close(); dir.close(); } @@ -434,7 +137,6 @@ public void testNoMatchSize() throws Exception { Query query = new TermQuery(new Term("none", "highlighting")); IndexSearcher searcher = newSearcher(ir); - SortedSet queryTerms = extractTerms(searcher, query); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1)); int docId = topDocs.scoreDocs[0].doc; @@ -442,48 +144,18 @@ public void testNoMatchSize() throws Exception { List values = new ArrayList<>(); values.add(firstValue); - BytesRef[] filteredQueryTerms = filterTerms(queryTerms, "body", true); CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(passageFormatter, values, true, Integer.MAX_VALUE - 1, 0); - Snippet[] snippets = highlighter.highlightDoc("body", filteredQueryTerms, ir, docId, 5); + CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, passageFormatter, values, Integer.MAX_VALUE - 1, false); + Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5); assertThat(snippets.length, equalTo(0)); - highlighter = new CustomPostingsHighlighter(passageFormatter, values, true, Integer.MAX_VALUE - 1, scaledRandomIntBetween(1, 10)); - snippets = highlighter.highlightDoc("body", filteredQueryTerms, ir, docId, 5); + highlighter = new CustomPostingsHighlighter(null, passageFormatter, values, Integer.MAX_VALUE - 1, true); + snippets = highlighter.highlightField("body", query, searcher, 
docId, 5); assertThat(snippets.length, equalTo(1)); assertThat(snippets[0].getText(), equalTo("This is a test.")); ir.close(); dir.close(); } - - private static SortedSet extractTerms(IndexSearcher searcher, Query query) throws IOException { - return extractTerms(searcher.createNormalizedWeight(query, false)); - } - - private static SortedSet extractTerms(Weight weight) { - SortedSet queryTerms = new TreeSet<>(); - weight.extractTerms(queryTerms); - return queryTerms; - } - - private static BytesRef[] filterTerms(SortedSet queryTerms, String field, boolean requireFieldMatch) { - SortedSet fieldTerms; - if (requireFieldMatch) { - Term floor = new Term(field, ""); - Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); - fieldTerms = queryTerms.subSet(floor, ceiling); - } else { - fieldTerms = queryTerms; - } - - BytesRef terms[] = new BytesRef[fieldTerms.size()]; - int termUpto = 0; - for(Term term : fieldTerms) { - terms[termUpto++] = term.bytes(); - } - - return terms; - } } diff --git a/src/test/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighterTests.java b/src/test/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighterTests.java deleted file mode 100644 index 7bd7715596262..0000000000000 --- a/src/test/java/org/apache/lucene/search/postingshighlight/XPostingsHighlighterTests.java +++ /dev/null @@ -1,1691 +0,0 @@ -/* - * Licensed to Elasticsearch under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. Elasticsearch licenses this - * file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package org.apache.lucene.search.postingshighlight; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.document.*; -import org.apache.lucene.index.*; -import org.apache.lucene.search.*; -import org.apache.lucene.search.highlight.DefaultEncoder; -import org.apache.lucene.store.Directory; -import org.elasticsearch.test.ElasticsearchTestCase; -import org.junit.Test; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.text.BreakIterator; -import java.util.Arrays; -import java.util.Iterator; -import java.util.Map; - -import static org.hamcrest.CoreMatchers.*; - -public class XPostingsHighlighterTests extends ElasticsearchTestCase { - - /* - Tests changes needed to make possible to perform discrete highlighting. 
- We want to highlight every field value separately in case of multiple values, at least when needing to return the whole field content - This is needed to be able to get back a single snippet per value when number_of_fragments=0 - */ - @Test - public void testDiscreteHighlightingPerValue() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - final String firstValue = "This is a test. Just a test highlighting from postings highlighter."; - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - final String secondValue = "This is the second value to perform highlighting on."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - final String thirdValue = "This is the third value to test highlighting with postings."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - - @Override - protected char getMultiValuedSeparator(String field) { - //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting - return 8233; - } - }; - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - String snippets[] = 
highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - - String firstHlValue = "This is a test. Just a test highlighting from postings highlighter."; - String secondHlValue = "This is the second value to perform highlighting on."; - String thirdHlValue = "This is the third value to test highlighting with postings."; - - //default behaviour: using the WholeBreakIterator, despite the multi valued paragraph separator we get back a single snippet for multiple values - assertThat(snippets[0], equalTo(firstHlValue + (char)8233 + secondHlValue + (char)8233 + thirdHlValue)); - - - - highlighter = new XPostingsHighlighter() { - Iterator valuesIterator = Arrays.asList(firstValue, secondValue, thirdValue).iterator(); - Iterator offsetsIterator = Arrays.asList(0, firstValue.length() + 1, firstValue.length() + secondValue.length() + 2).iterator(); - - @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - return new String[][]{new String[]{valuesIterator.next()}}; - } - - @Override - protected int getOffsetForCurrentValue(String field, int docId) { - return offsetsIterator.next(); - } - - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - }; - - //first call using the WholeBreakIterator, we get now only the first value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(firstHlValue)); - - //second call using the WholeBreakIterator, we get now only the second value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(secondHlValue)); - - //third call using the WholeBreakIterator, we get now only the third value properly highlighted as we 
wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(thirdHlValue)); - - ir.close(); - dir.close(); - } - - @Test - public void testDiscreteHighlightingPerValue_secondValueWithoutMatches() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - final String firstValue = "This is a test. Just a test highlighting from postings highlighter."; - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - final String secondValue = "This is the second value without matches."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - final String thirdValue = "This is the third value to test highlighting with postings."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - - @Override - protected char getMultiValuedSeparator(String field) { - //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting - return 8233; - } - - @Override - protected Passage[] 
getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - String firstHlValue = "This is a test. Just a test highlighting from postings highlighter."; - String thirdHlValue = "This is the third value to test highlighting with postings."; - //default behaviour: using the WholeBreakIterator, despite the multi valued paragraph separator we get back a single snippet for multiple values - //but only the first and the third value are returned since there are no matches in the second one. - assertThat(snippets[0], equalTo(firstHlValue + (char)8233 + secondValue + (char)8233 + thirdHlValue)); - - - highlighter = new XPostingsHighlighter() { - Iterator valuesIterator = Arrays.asList(firstValue, secondValue, thirdValue).iterator(); - Iterator offsetsIterator = Arrays.asList(0, firstValue.length() + 1, firstValue.length() + secondValue.length() + 2).iterator(); - - @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - return new String[][]{new String[]{valuesIterator.next()}}; - } - - @Override - protected int getOffsetForCurrentValue(String field, int docId) { - return offsetsIterator.next(); - } - - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - - @Override - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - - //first call using the WholeBreakIterator, we get now only the first value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(firstHlValue)); - - //second call using the WholeBreakIterator, we get now nothing back because there's nothing to 
highlight in the second value - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], nullValue()); - - //third call using the WholeBreakIterator, we get now only the third value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(thirdHlValue)); - - ir.close(); - dir.close(); - } - - @Test - public void testDiscreteHighlightingPerValue_MultipleMatches() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - final String firstValue = "This is a highlighting test. Just a test highlighting from postings highlighter."; - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - final String secondValue = "This is the second highlighting value to test highlighting with postings."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - String firstHlValue = "This is a highlighting test. 
Just a test highlighting from postings highlighter."; - String secondHlValue = "This is the second highlighting value to test highlighting with postings."; - - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - Iterator valuesIterator = Arrays.asList(firstValue, secondValue).iterator(); - Iterator offsetsIterator = Arrays.asList(0, firstValue.length() + 1).iterator(); - - @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - return new String[][]{new String[]{valuesIterator.next()}}; - } - - @Override - protected int getOffsetForCurrentValue(String field, int docId) { - return offsetsIterator.next(); - } - - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - - @Override - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - - //first call using the WholeBreakIterator, we get now only the first value properly highlighted as we wish - String[] snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(firstHlValue)); - - //second call using the WholeBreakIterator, we get now only the second value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(secondHlValue)); - - ir.close(); - dir.close(); - } - - @Test - public void testDiscreteHighlightingPerValue_MultipleQueryTerms() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - 
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - final String firstValue = "This is the first sentence. This is the second sentence."; - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - final String secondValue = "This is the third sentence. This is the fourth sentence."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - final String thirdValue = "This is the fifth sentence"; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - BooleanQuery query = new BooleanQuery(); - query.add(new BooleanClause(new TermQuery(new Term("body", "third")), BooleanClause.Occur.SHOULD)); - query.add(new BooleanClause(new TermQuery(new Term("body", "seventh")), BooleanClause.Occur.SHOULD)); - query.add(new BooleanClause(new TermQuery(new Term("body", "fifth")), BooleanClause.Occur.SHOULD)); - query.setMinimumNumberShouldMatch(1); - - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - String secondHlValue = "This is the third sentence. 
This is the fourth sentence."; - String thirdHlValue = "This is the fifth sentence"; - - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - Iterator valuesIterator = Arrays.asList(firstValue, secondValue, thirdValue).iterator(); - Iterator offsetsIterator = Arrays.asList(0, firstValue.length() + 1, secondValue.length() + 1).iterator(); - - @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - return new String[][]{new String[]{valuesIterator.next()}}; - } - - @Override - protected int getOffsetForCurrentValue(String field, int docId) { - return offsetsIterator.next(); - } - - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - - @Override - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - - //first call using the WholeBreakIterator, we get now null as the first value doesn't hold any match - String[] snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], nullValue()); - - //second call using the WholeBreakIterator, we get now only the second value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(secondHlValue)); - - //second call using the WholeBreakIterator, we get now only the third value properly highlighted as we wish - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0], equalTo(thirdHlValue)); - - ir.close(); - dir.close(); - } - - /* - The following are tests that we added to make sure that certain behaviours are possible using the postings highlighter - They don't require our forked version, but only custom versions of methods 
that can be overridden and are already exposed to subclasses - */ - - /* - Tests that it's possible to obtain different fragments per document instead of a big string of concatenated fragments. - We use our own PassageFormatter for that and override the getFormatter method. - */ - @Test - public void testCustomPassageFormatterMultipleFragments() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - body.setStringValue("This test is another test. Not a good sentence. Test test test test."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - IndexSearcher searcher = newSearcher(ir); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 5); - assertThat(snippets.length, equalTo(1)); - //default behaviour that we want to change - assertThat(snippets[0], equalTo("This test is another test. ... 
Test test test test.")); - - - final CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - highlighter = new XPostingsHighlighter() { - @Override - protected PassageFormatter getFormatter(String field) { - return passageFormatter; - } - }; - - final ScoreDoc scoreDocs[] = topDocs.scoreDocs; - int docids[] = new int[scoreDocs.length]; - int maxPassages[] = new int[scoreDocs.length]; - for (int i = 0; i < docids.length; i++) { - docids[i] = scoreDocs[i].doc; - maxPassages[i] = 5; - } - Map highlights = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docids, maxPassages); - assertThat(highlights, notNullValue()); - assertThat(highlights.size(), equalTo(1)); - Object[] objectSnippets = highlights.get("body"); - assertThat(objectSnippets, notNullValue()); - assertThat(objectSnippets.length, equalTo(1)); - assertThat(objectSnippets[0], instanceOf(Snippet[].class)); - - Snippet[] snippetsSnippet = (Snippet[]) objectSnippets[0]; - assertThat(snippetsSnippet.length, equalTo(2)); - //multiple fragments as we wish - assertThat(snippetsSnippet[0].getText(), equalTo("This test is another test.")); - assertThat(snippetsSnippet[1].getText(), equalTo("Test test test test.")); - - ir.close(); - dir.close(); - } - - /* - Tests that it's possible to return no fragments when there's nothing to highlight - We do that by overriding the getEmptyHighlight method - */ - @Test - public void testHighlightWithNoMatches() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Field none = new Field("none", "", offsetsType); - Document 
doc = new Document(); - doc.add(body); - doc.add(none); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - none.setStringValue(body.stringValue()); - iw.addDocument(doc); - body.setStringValue("Highlighting the first term. Hope it works."); - none.setStringValue(body.stringValue()); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("none", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(2)); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1); - //Two null snippets if there are no matches (thanks to our own custom passage formatter) - assertThat(snippets.length, equalTo(2)); - //default behaviour: returns the first sentence with num passages = 1 - assertThat(snippets[0], equalTo("This is a test. ")); - assertThat(snippets[1], equalTo("Highlighting the first term. 
")); - - highlighter = new XPostingsHighlighter() { - @Override - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - snippets = highlighter.highlight("body", query, searcher, topDocs); - //Two null snippets if there are no matches, as we wish - assertThat(snippets.length, equalTo(2)); - assertThat(snippets[0], nullValue()); - assertThat(snippets[1], nullValue()); - - ir.close(); - dir.close(); - } - - /* - Tests that it's possible to avoid having fragments that span across different values - We do that by overriding the getMultiValuedSeparator and using a proper separator between values - */ - @Test - public void testCustomMultiValuedSeparator() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. 
Just a test highlighting from postings"); - - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue("highlighter."); - iw.addDocument(doc); - - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - //default behaviour: getting a fragment that spans across different values - assertThat(snippets[0], equalTo("Just a test highlighting from postings highlighter.")); - - - highlighter = new XPostingsHighlighter() { - @Override - protected char getMultiValuedSeparator(String field) { - //U+2029 PARAGRAPH SEPARATOR (PS): each value holds a discrete passage for highlighting - return 8233; - } - }; - snippets = highlighter.highlight("body", query, searcher, topDocs); - assertThat(snippets.length, equalTo(1)); - //getting a fragment that doesn't span across different values since we used the paragraph separator between the different values - assertThat(snippets[0], equalTo("Just a test highlighting from postings" + (char)8233)); - - ir.close(); - dir.close(); - } - - - - - /* - The following are all the existing postings highlighter tests, to make sure we don't have regression in our own fork - */ - - @Test - public void testBasics() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new 
Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - iw.addDocument(doc); - body.setStringValue("Highlighting the first term. Hope it works."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(2, snippets.length); - assertEquals("Just a test highlighting from postings. ", snippets[0]); - assertEquals("Highlighting the first term. ", snippets[1]); - - ir.close(); - dir.close(); - } - - public void testFormatWithMatchExceedingContentLength2() throws Exception { - - String bodyText = "123 TEST 01234 TEST"; - - String[] snippets = formatWithMatchExceedingContentLength(bodyText); - - assertEquals(1, snippets.length); - assertEquals("123 TEST 01234 TE", snippets[0]); - } - - public void testFormatWithMatchExceedingContentLength3() throws Exception { - - String bodyText = "123 5678 01234 TEST TEST"; - - String[] snippets = formatWithMatchExceedingContentLength(bodyText); - - assertEquals(1, snippets.length); - assertEquals("123 5678 01234 TE", snippets[0]); - } - - public void testFormatWithMatchExceedingContentLength() throws Exception { - - String bodyText = "123 5678 01234 TEST"; - - String[] snippets = formatWithMatchExceedingContentLength(bodyText); - - assertEquals(1, snippets.length); - // LUCENE-5166: no snippet - assertEquals("123 5678 01234 TE", snippets[0]); - } - - private String[] formatWithMatchExceedingContentLength(String bodyText) throws IOException { - - int maxLength = 17; - - final Analyzer analyzer = new 
MockAnalyzer(random()); - - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - final FieldType fieldType = new FieldType(TextField.TYPE_STORED); - fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - final Field body = new Field("body", bodyText, fieldType); - - Document doc = new Document(); - doc.add(body); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - Query query = new TermQuery(new Term("body", "test")); - - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - - XPostingsHighlighter highlighter = new XPostingsHighlighter(maxLength); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - - - ir.close(); - dir.close(); - return snippets; - } - - // simple test highlighting last word. 
- public void testHighlightLastWord() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test"); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(1, snippets.length); - assertEquals("This is a test", snippets[0]); - - ir.close(); - dir.close(); - } - - // simple test with one sentence documents. - @Test - public void testOneSentence() throws Exception { - Directory dir = newDirectory(); - // use simpleanalyzer for more natural tokenization (else "test." 
is a token) - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test."); - iw.addDocument(doc); - body.setStringValue("Test a one sentence document."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(2, snippets.length); - assertEquals("This is a test.", snippets[0]); - assertEquals("Test a one sentence document.", snippets[1]); - - ir.close(); - dir.close(); - } - - // simple test with multiple values that make a result longer than maxLength. - @Test - public void testMaxLengthWithMultivalue() throws Exception { - Directory dir = newDirectory(); - // use simpleanalyzer for more natural tokenization (else "test." 
is a token) - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - for(int i = 0; i < 3 ; i++) { - Field body = new Field("body", "", offsetsType); - body.setStringValue("This is a multivalued field"); - doc.add(body); - } - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(40); - Query query = new TermQuery(new Term("body", "field")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(1, snippets.length); - assertTrue("Snippet should have maximum 40 characters plus the pre and post tags", - snippets[0].length() == (40 + "".length())); - - ir.close(); - dir.close(); - } - - @Test - public void testMultipleFields() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Field title = new Field("title", "", offsetsType); - Document doc = new Document(); - doc.add(body); - doc.add(title); - - body.setStringValue("This is a test. Just a test highlighting from postings. 
Feel free to ignore."); - title.setStringValue("I am hoping for the best."); - iw.addDocument(doc); - body.setStringValue("Highlighting the first term. Hope it works."); - title.setStringValue("But best may not be good enough."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - Map snippets = highlighter.highlightFields(new String [] { "body", "title" }, query, searcher, topDocs); - assertEquals(2, snippets.size()); - assertEquals("Just a test highlighting from postings. ", snippets.get("body")[0]); - assertEquals("Highlighting the first term. ", snippets.get("body")[1]); - assertEquals("I am hoping for the best.", snippets.get("title")[0]); - assertEquals("But best may not be good enough.", snippets.get("title")[1]); - ir.close(); - dir.close(); - } - - @Test - public void testMultipleTerms() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - iw.addDocument(doc); - body.setStringValue("Highlighting the first term. 
Hope it works."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term("body", "just")), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term("body", "first")), BooleanClause.Occur.SHOULD); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(2, snippets.length); - assertEquals("Just a test highlighting from postings. ", snippets[0]); - assertEquals("Highlighting the first term. ", snippets[1]); - - ir.close(); - dir.close(); - } - - @Test - public void testMultiplePassages() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - iw.addDocument(doc); - body.setStringValue("This test is another test. Not a good sentence. 
Test test test test."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(2, snippets.length); - assertEquals("This is a test. Just a test highlighting from postings. ", snippets[0]); - assertEquals("This test is another test. ... Test test test test.", snippets[1]); - - ir.close(); - dir.close(); - } - - @Test - public void testUserFailedToIndexOffsets() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType positionsType = new FieldType(TextField.TYPE_STORED); - positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - Field body = new Field("body", "", positionsType); - Field title = new StringField("title", "", Field.Store.YES); - Document doc = new Document(); - doc.add(body); - doc.add(title); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - title.setStringValue("test"); - iw.addDocument(doc); - body.setStringValue("This test is another test. Not a good sentence. 
Test test test test."); - title.setStringValue("test"); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - try { - highlighter.highlight("body", query, searcher, topDocs, 2); - fail("did not hit expected exception"); - } catch (IllegalArgumentException iae) { - // expected - } - - try { - highlighter.highlight("title", new TermQuery(new Term("title", "test")), searcher, topDocs, 2); - fail("did not hit expected exception"); - } catch (IllegalArgumentException iae) { - // expected - } - ir.close(); - dir.close(); - } - - @Test - public void testBuddhism() throws Exception { - String text = "This eight-volume set brings together seminal papers in Buddhist studies from a vast " + - "range of academic disciplines published over the last forty years. With a new introduction " + - "by the editor, this collection is a unique and unrivalled research resource for both " + - "student and scholar. 
Coverage includes: - Buddhist origins; early history of Buddhism in " + - "South and Southeast Asia - early Buddhist Schools and Doctrinal History; Theravada Doctrine " + - "- the Origins and nature of Mahayana Buddhism; some Mahayana religious topics - Abhidharma " + - "and Madhyamaka - Yogacara, the Epistemological tradition, and Tathagatagarbha - Tantric " + - "Buddhism (Including China and Japan); Buddhism in Nepal and Tibet - Buddhism in South and " + - "Southeast Asia, and - Buddhism in China, East Asia, and Japan."; - Directory dir = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); - - FieldType positionsType = new FieldType(TextField.TYPE_STORED); - positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", text, positionsType); - Document document = new Document(); - document.add(body); - iw.addDocument(document); - IndexReader ir = iw.getReader(); - iw.close(); - IndexSearcher searcher = newSearcher(ir); - PhraseQuery query = new PhraseQuery(); - query.add(new Term("body", "buddhist")); - query.add(new Term("body", "origins")); - TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertTrue(snippets[0].contains("Buddhist origins")); - ir.close(); - dir.close(); - } - - @Test - public void testCuriousGeorge() throws Exception { - String text = "It’s the formula for success for preschoolers—Curious George and fire trucks! " + - "Curious George and the Firefighters is a story based on H. A. and Margret Rey’s " + - "popular primate and painted in the original watercolor and charcoal style. 
" + - "Firefighters are a famously brave lot, but can they withstand a visit from one curious monkey?"; - Directory dir = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); - FieldType positionsType = new FieldType(TextField.TYPE_STORED); - positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", text, positionsType); - Document document = new Document(); - document.add(body); - iw.addDocument(document); - IndexReader ir = iw.getReader(); - iw.close(); - IndexSearcher searcher = newSearcher(ir); - PhraseQuery query = new PhraseQuery(); - query.add(new Term("body", "curious")); - query.add(new Term("body", "george")); - TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertFalse(snippets[0].contains("CuriousCurious")); - ir.close(); - dir.close(); - } - - @Test - public void testCambridgeMA() throws Exception { - BufferedReader r = new BufferedReader(new InputStreamReader( - this.getClass().getResourceAsStream("CambridgeMA.utf8"), "UTF-8")); - String text = r.readLine(); - r.close(); - Directory dir = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); - FieldType positionsType = new FieldType(TextField.TYPE_STORED); - positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", text, positionsType); - Document document = new Document(); - document.add(body); - iw.addDocument(document); - IndexReader ir = iw.getReader(); - iw.close(); - IndexSearcher searcher = newSearcher(ir); - BooleanQuery query = new 
BooleanQuery(); - query.add(new TermQuery(new Term("body", "porter")), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term("body", "square")), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term("body", "massachusetts")), BooleanClause.Occur.SHOULD); - TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits); - XPostingsHighlighter highlighter = new XPostingsHighlighter(Integer.MAX_VALUE-1); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertTrue(snippets[0].contains("Square")); - assertTrue(snippets[0].contains("Porter")); - ir.close(); - dir.close(); - } - - @Test - public void testPassageRanking() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertEquals("This is a test. ... 
Feel free to test test test test test test test.", snippets[0]); - - ir.close(); - dir.close(); - } - - @Test - public void testBooleanMustNot() throws Exception { - Directory dir = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); - FieldType positionsType = new FieldType(TextField.TYPE_STORED); - positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "This sentence has both terms. This sentence has only terms.", positionsType); - Document document = new Document(); - document.add(body); - iw.addDocument(document); - IndexReader ir = iw.getReader(); - iw.close(); - IndexSearcher searcher = newSearcher(ir); - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("body", "terms")), BooleanClause.Occur.SHOULD); - BooleanQuery query2 = new BooleanQuery(); - query.add(query2, BooleanClause.Occur.SHOULD); - query2.add(new TermQuery(new Term("body", "both")), BooleanClause.Occur.MUST_NOT); - TopDocs topDocs = searcher.search(query, 10); - assertEquals(1, topDocs.totalHits); - XPostingsHighlighter highlighter = new XPostingsHighlighter(Integer.MAX_VALUE-1); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertFalse(snippets[0].contains("both")); - ir.close(); - dir.close(); - } - - @Test - public void testHighlightAllText() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc 
= new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(10000) { - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - }; - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertEquals("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test.", snippets[0]); - - ir.close(); - dir.close(); - } - - @Test - public void testSpecificDocIDs() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - iw.addDocument(doc); - body.setStringValue("Highlighting the first term. 
Hope it works."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(2, topDocs.totalHits); - ScoreDoc[] hits = topDocs.scoreDocs; - int[] docIDs = new int[2]; - docIDs[0] = hits[0].doc; - docIDs[1] = hits[1].doc; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 1 }).get("body"); - assertEquals(2, snippets.length); - assertEquals("Just a test highlighting from postings. ", snippets[0]); - assertEquals("Highlighting the first term. ", snippets[1]); - - ir.close(); - dir.close(); - } - - @Test - public void testCustomFieldValueSource() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - Document doc = new Document(); - - FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - final String text = "This is a test. Just highlighting from postings. This is also a much sillier test. 
Feel free to test test test test test test test."; - Field body = new Field("body", text, offsetsType); - doc.add(body); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - XPostingsHighlighter highlighter = new XPostingsHighlighter(10000) { - @Override - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - assertThat(fields.length, equalTo(1)); - assertThat(docids.length, equalTo(1)); - String[][] contents = new String[1][1]; - contents[0][0] = text; - return contents; - } - - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - }; - - Query query = new TermQuery(new Term("body", "test")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); - assertEquals(1, snippets.length); - assertEquals("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test.", snippets[0]); - - ir.close(); - dir.close(); - } - - /** Make sure highlighter returns first N sentences if - * there were no hits. */ - @Test - public void testEmptyHighlights() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - Field body = new Field("body", "test this is. another sentence this test has. 
far away is that planet.", offsetsType); - doc.add(body); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("body", "highlighting")); - int[] docIDs = new int[] {0}; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); - assertEquals(1, snippets.length); - assertEquals("test this is. another sentence this test has. ", snippets[0]); - - ir.close(); - dir.close(); - } - - /** Make sure highlighter we can customize how emtpy - * highlight is returned. */ - @Test - public void testCustomEmptyHighlights() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - Field body = new Field("body", "test this is. another sentence this test has. 
far away is that planet.", offsetsType); - doc.add(body); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - @Override - public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - return new Passage[0]; - } - }; - Query query = new TermQuery(new Term("body", "highlighting")); - int[] docIDs = new int[] {0}; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); - assertEquals(1, snippets.length); - assertNull(snippets[0]); - - ir.close(); - dir.close(); - } - - /** Make sure highlighter returns whole text when there - * are no hits and BreakIterator is null. */ - @Test - public void testEmptyHighlightsWhole() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - Field body = new Field("body", "test this is. another sentence this test has. 
far away is that planet.", offsetsType); - doc.add(body); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(10000) { - @Override - protected BreakIterator getBreakIterator(String field) { - return new WholeBreakIterator(); - } - }; - Query query = new TermQuery(new Term("body", "highlighting")); - int[] docIDs = new int[] {0}; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); - assertEquals(1, snippets.length); - assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]); - - ir.close(); - dir.close(); - } - - /** Make sure highlighter is OK with entirely missing - * field. */ - @Test - public void testFieldIsMissing() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - Field body = new Field("body", "test this is. another sentence this test has. 
far away is that planet.", offsetsType); - doc.add(body); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - Query query = new TermQuery(new Term("bogus", "highlighting")); - int[] docIDs = new int[] {0}; - String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, new int[] { 2 }).get("bogus"); - assertEquals(1, snippets.length); - assertNull(snippets[0]); - - ir.close(); - dir.close(); - } - - @Test - public void testFieldIsJustSpace() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - - Document doc = new Document(); - doc.add(new Field("body", " ", offsetsType)); - doc.add(new Field("id", "id", offsetsType)); - iw.addDocument(doc); - - doc = new Document(); - doc.add(new Field("body", "something", offsetsType)); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; - - Query query = new TermQuery(new Term("body", "highlighting")); - int[] docIDs = new int[1]; - docIDs[0] = docID; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); - assertEquals(1, snippets.length); - assertEquals(" ", snippets[0]); - - ir.close(); - dir.close(); - } - - @Test - public void testFieldIsEmptyString() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = 
newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - - Document doc = new Document(); - doc.add(new Field("body", "", offsetsType)); - doc.add(new Field("id", "id", offsetsType)); - iw.addDocument(doc); - - doc = new Document(); - doc.add(new Field("body", "something", offsetsType)); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter(); - int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; - - Query query = new TermQuery(new Term("body", "highlighting")); - int[] docIDs = new int[1]; - docIDs[0] = docID; - String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); - assertEquals(1, snippets.length); - assertNull(snippets[0]); - - ir.close(); - dir.close(); - } - - @Test - public void testMultipleDocs() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - - int numDocs = scaledRandomIntBetween(100, 1000); - for(int i=0;i snippets = highlighter.highlightFields(new String[] { "title", "body" }, query, searcher, new int[] { 0 }, new int[] { 1, 2 }); - String titleHighlight = snippets.get("title")[0]; - String bodyHighlight = snippets.get("body")[0]; - assertEquals("This is a test. ", titleHighlight); - assertEquals("This is a test. 
Just a test highlighting from postings. ", bodyHighlight); - ir.close(); - dir.close(); - } - - public void testEncode() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - PostingsHighlighter highlighter = new PostingsHighlighter() { - @Override - protected PassageFormatter getFormatter(String field) { - return new DefaultPassageFormatter("", "", "... ", true); - } - }; - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(1, snippets.length); - assertEquals("Just a test highlighting from <i>postings</i>. ", snippets[0]); - - ir.close(); - dir.close(); - } - - /** customizing the gap separator to force a sentence break */ - public void testGapSeparator() throws Exception { - Directory dir = newDirectory(); - // use simpleanalyzer for more natural tokenization (else "test." 
is a token) - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Document doc = new Document(); - - Field body1 = new Field("body", "", offsetsType); - body1.setStringValue("This is a multivalued field"); - doc.add(body1); - - Field body2 = new Field("body", "", offsetsType); - body2.setStringValue("This is something different"); - doc.add(body2); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - PostingsHighlighter highlighter = new PostingsHighlighter() { - @Override - protected char getMultiValuedSeparator(String field) { - assert field.equals("body"); - return '\u2029'; - } - }; - Query query = new TermQuery(new Term("body", "field")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String snippets[] = highlighter.highlight("body", query, searcher, topDocs); - assertEquals(1, snippets.length); - assertEquals("This is a multivalued field\u2029", snippets[0]); - - ir.close(); - dir.close(); - } - - // LUCENE-4906 - public void testObjectFormatter() throws Exception { - Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - - body.setStringValue("This is a test. Just a test highlighting from postings. 
Feel free to ignore."); - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - XPostingsHighlighter highlighter = new XPostingsHighlighter() { - @Override - protected PassageFormatter getFormatter(String field) { - return new PassageFormatter() { - PassageFormatter defaultFormatter = new DefaultPassageFormatter(); - - @Override - public String[] format(Passage passages[], String content) { - // Just turns the String snippet into a length 2 - // array of String - return new String[] {"blah blah", defaultFormatter.format(passages, content).toString()}; - } - }; - } - }; - - Query query = new TermQuery(new Term("body", "highlighting")); - TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - int[] docIDs = new int[1]; - docIDs[0] = topDocs.scoreDocs[0].doc; - Map snippets = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docIDs, new int[] {1}); - Object[] bodySnippets = snippets.get("body"); - assertEquals(1, bodySnippets.length); - assertTrue(Arrays.equals(new String[] {"blah blah", "Just a test highlighting from postings. 
"}, (String[]) bodySnippets[0])); - - ir.close(); - dir.close(); - } -} diff --git a/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java index 7758c3fca13d3..76fc678cd75c5 100644 --- a/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java @@ -21,7 +21,6 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; import com.google.common.base.Joiner; import com.google.common.collect.Iterables; - import org.apache.lucene.util.LuceneTestCase.Slow; import org.elasticsearch.Version; import org.elasticsearch.action.index.IndexRequestBuilder; @@ -31,15 +30,9 @@ import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.index.query.BoostableQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.IdsQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.*; import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.Type; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; @@ -56,40 +49,12 @@ import static org.elasticsearch.client.Requests.searchRequest; import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; -import static org.elasticsearch.index.query.QueryBuilders.missingQuery; -import 
static org.elasticsearch.index.query.QueryBuilders.typeQuery; -import static org.elasticsearch.index.query.QueryBuilders.boolQuery; -import static org.elasticsearch.index.query.QueryBuilders.boostingQuery; -import static org.elasticsearch.index.query.QueryBuilders.commonTermsQuery; -import static org.elasticsearch.index.query.QueryBuilders.constantScoreQuery; -import static org.elasticsearch.index.query.QueryBuilders.filteredQuery; -import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery; -import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery; -import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery; -import static org.elasticsearch.index.query.QueryBuilders.matchQuery; -import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery; -import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; -import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery; -import static org.elasticsearch.index.query.QueryBuilders.rangeQuery; -import static org.elasticsearch.index.query.QueryBuilders.regexpQuery; -import static org.elasticsearch.index.query.QueryBuilders.termQuery; -import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery; +import static org.elasticsearch.index.query.QueryBuilders.*; import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight; import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNotHighlighted; 
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*; import static org.elasticsearch.test.hamcrest.RegexMatcher.matches; -import static org.hamcrest.Matchers.anyOf; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.hasKey; -import static org.hamcrest.Matchers.not; -import static org.hamcrest.Matchers.nullValue; -import static org.hamcrest.Matchers.startsWith; +import static org.hamcrest.Matchers.*; @Slow public class HighlighterSearchTests extends ElasticsearchIntegrationTest { @@ -452,8 +417,7 @@ public void testSourceLookupHighlightingUsingPostingsHighlighter() throws Except .addHighlightedField("title", -1, 0).get(); for (int i = 0; i < indexRequestBuilders.length; i++) { - assertHighlight(search, i, "title", 0, equalTo("This is a test on the highlighting bug present in elasticsearch. Hopefully it works.")); - assertHighlight(search, i, "title", 1, 2, equalTo("This is the second bug to perform highlighting on.")); + assertHighlight(search, i, "title", 0, equalTo("This is a test on the highlighting bug present in elasticsearch. Hopefully it works." 
+ HighlightUtils.PARAGRAPH_SEPARATOR + "This is the second bug to perform highlighting on.")); } search = client().prepareSearch() @@ -496,10 +460,10 @@ public void testHighlightIssue1994() throws Exception { .addHighlightedField("titleTV", -1, 2) .get(); - assertHighlight(search, 0, "title", 0, equalTo("This is a test on the highlighting bug present in elasticsearch")); + assertHighlight(search, 0, "title", 0, equalTo("This is a test on the highlighting bug present in elasticsearch")); assertHighlight(search, 0, "title", 1, 2, equalTo("The bug is bugging us")); - assertHighlight(search, 0, "titleTV", 0, equalTo("This is a test on the highlighting bug present in elasticsearch")); - assertHighlight(search, 0, "titleTV", 1, 2, equalTo("The bug is bugging us")); + assertHighlight(search, 0, "titleTV", 0, equalTo("This is a test on the highlighting bug present in elasticsearch")); + assertHighlight(search, 0, "titleTV", 1, 2, equalTo("The bug is bugging us")); search = client().prepareSearch() .setQuery(matchQuery("titleTV", "highlight")) @@ -551,8 +515,9 @@ public void testHighlightingOnWildcardFields() throws Exception { logger.info("--> highlighting and searching on field*"); SearchSourceBuilder source = searchSource() - .query(termQuery("field-plain", "test")) - .highlight(highlight().field("field*").preTags("").postTags("")); + //postings hl doesn't support require_field_match, its field needs to be queried directly + .query(termQuery("field-postings", "test")) + .highlight(highlight().field("field*").preTags("").postTags("").requireFieldMatch(false)); SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -587,14 +552,14 @@ public void testForceSourceWithSourceDisabledBackcompat() throws Exception { assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); assertFailures(client().prepareSearch("test") - .setQuery(termQuery("field1", "quick")) - 
.addHighlightedField(new Field("field1").preTags("").postTags("").highlighterType("plain").forceSource(true)), + .setQuery(termQuery("field1", "quick")) + .addHighlightedField(new Field("field1").preTags("").postTags("").highlighterType("plain").forceSource(true)), RestStatus.BAD_REQUEST, containsString("source is forced for fields [field1] but type [type1] has disabled _source")); assertFailures(client().prepareSearch("test") - .setQuery(termQuery("field1", "quick")) - .addHighlightedField(new Field("field1").preTags("").postTags("").highlighterType("fvh").forceSource(true)), + .setQuery(termQuery("field1", "quick")) + .addHighlightedField(new Field("field1").preTags("").postTags("").highlighterType("fvh").forceSource(true)), RestStatus.BAD_REQUEST, containsString("source is forced for fields [field1] but type [type1] has disabled _source")); @@ -1451,11 +1416,11 @@ public void testPlainHighlightDifferentFragmenter() throws Exception { assertHighlight(response, 0, "tags", 1, 2, equalTo("here is another one that is very long tag and has the tag token near the end")); assertFailures(client().prepareSearch("test") - .setQuery(QueryBuilders.matchQuery("tags", "long tag").type(MatchQueryBuilder.Type.PHRASE)) - .addHighlightedField(new HighlightBuilder.Field("tags") - .fragmentSize(-1).numOfFragments(2).fragmenter("invalid")), - RestStatus.BAD_REQUEST, - containsString("unknown fragmenter option [invalid] for the field [tags]")); + .setQuery(QueryBuilders.matchQuery("tags", "long tag").type(MatchQueryBuilder.Type.PHRASE)) + .addHighlightedField(new HighlightBuilder.Field("tags") + .fragmentSize(-1).numOfFragments(2).fragmenter("invalid")), + RestStatus.BAD_REQUEST, + containsString("unknown fragmenter option [invalid] for the field [tags]")); } @Test @@ -1891,8 +1856,9 @@ public void testHighlightNoMatchSizeNumberOfFragments() throws IOException { field.highlighterType("postings"); response = 
client().prepareSearch("test").setQuery(queryBuilder).addHighlightedField(field).get(); - assertHighlight(response, 0, "text", 0, 2, equalTo("This is the third sentence. This is the fourth sentence.")); - assertHighlight(response, 0, "text", 1, 2, equalTo("This is the fifth sentence")); + assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence. This is the second sentence." + + HighlightUtils.PARAGRAPH_SEPARATOR + "This is the third sentence. This is the fourth sentence." + + HighlightUtils.PARAGRAPH_SEPARATOR + "This is the fifth sentence")); } @Test @@ -1914,7 +1880,7 @@ public void testPostingsHighlighter() throws Exception { logger.info("--> searching on _all, highlighting on field1"); source = searchSource() - .query(termQuery("_all", "test")) + .query(termQuery("field1", "test")) .highlight(highlight().field("field1").preTags("").postTags("")); searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -1923,7 +1889,7 @@ public void testPostingsHighlighter() throws Exception { logger.info("--> searching on _all, highlighting on field2"); source = searchSource() - .query(termQuery("_all", "quick")) + .query(termQuery("field2", "quick")) .highlight(highlight().field("field2").order("score").preTags("").postTags("")); searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -1932,7 +1898,7 @@ public void testPostingsHighlighter() throws Exception { logger.info("--> searching on _all, highlighting on field2"); source = searchSource() - .query(matchPhraseQuery("_all", "quick brown")) + .query(matchPhraseQuery("field2", "quick brown")) .highlight(highlight().field("field2").preTags("").postTags("")); searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -1943,7 +1909,7 @@ public void testPostingsHighlighter() throws Exception { //lets fall back to the standard highlighter then, what people would do to highlight query matches logger.info("--> searching 
on _all, highlighting on field2, falling back to the plain highlighter"); source = searchSource() - .query(matchPhraseQuery("_all", "quick brown")) + .query(matchPhraseQuery("field2", "quick brown")) .highlight(highlight().field("field2").preTags("").postTags("").highlighterType("highlighter")); searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -1962,10 +1928,8 @@ public void testPostingsHighlighterMultipleFields() throws Exception { SearchResponse response = client().prepareSearch("test") .setQuery(QueryBuilders.matchQuery("field1", "fox")) .addHighlightedField(new HighlightBuilder.Field("field1").preTags("<1>").postTags("").requireFieldMatch(true)) - .addHighlightedField(new HighlightBuilder.Field("field2").preTags("<2>").postTags("").requireFieldMatch(false)) .get(); assertHighlight(response, 0, "field1", 0, 1, equalTo("The quick brown <1>fox.")); - assertHighlight(response, 0, "field2", 0, 1, equalTo("The slow brown <2>fox.")); } @Test @@ -1982,8 +1946,7 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { SearchSourceBuilder source = searchSource() .query(termQuery("field1", "fox")) .highlight(highlight() - .field(new HighlightBuilder.Field("field1").numOfFragments(5).preTags("").postTags("")) - .field(new HighlightBuilder.Field("field2").numOfFragments(2).preTags("").postTags(""))); + .field(new HighlightBuilder.Field("field1").numOfFragments(5).preTags("").postTags(""))); SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); @@ -1991,9 +1954,6 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - assertHighlight(searchResponse, 0, "field2", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - 
assertHighlight(searchResponse, 0, "field2", 1, 2, equalTo("The lazy red fox jumps over the quick dog.")); - client().prepareIndex("test", "type1", "2") .setSource("field1", new String[]{"The quick brown fox jumps over the lazy dog. Second sentence not finished", "The lazy red fox jumps over the quick dog.", "The quick brown dog jumps over the lazy fox."}).get(); refresh(); @@ -2010,76 +1970,15 @@ public void testPostingsHighlighterNumberOfFragments() throws Exception { if ("1".equals(searchHit.id())) { assertHighlight(searchHit, "field1", 0, 1, equalTo("The quick brown fox jumps over the lazy dog. The lazy red fox jumps over the quick dog. The quick brown dog jumps over the lazy fox.")); } else if ("2".equals(searchHit.id())) { - assertHighlight(searchHit, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog. Second sentence not finished")); - assertHighlight(searchHit, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchHit, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); + assertHighlight(searchHit, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog. Second sentence not finished" + + HighlightUtils.PARAGRAPH_SEPARATOR + "The lazy red fox jumps over the quick dog." + + HighlightUtils.PARAGRAPH_SEPARATOR + "The quick brown dog jumps over the lazy fox.")); } else { fail("Only hits with id 1 and 2 are returned"); } } } - @Test - public void testPostingsHighlighterRequireFieldMatch() throws Exception { - assertAcked(prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping())); - ensureGreen(); - - client().prepareIndex("test", "type1") - .setSource("field1", "The quick brown fox jumps over the lazy dog. The lazy red fox jumps over the quick dog. The quick brown dog jumps over the lazy fox.", - "field2", "The quick brown fox jumps over the lazy dog. The lazy red fox jumps over the quick dog. 
The quick brown dog jumps over the lazy fox.").get(); - refresh(); - - logger.info("--> highlighting and searching on field1"); - SearchSourceBuilder source = searchSource() - .query(termQuery("field1", "fox")) - .highlight(highlight() - .field(new HighlightBuilder.Field("field1").requireFieldMatch(true).preTags("").postTags("")) - .field(new HighlightBuilder.Field("field2").requireFieldMatch(true).preTags("").postTags(""))); - - SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - - //field2 is not returned highlighted because of the require field match option set to true - assertNotHighlighted(searchResponse, 0, "field2"); - assertHighlight(searchResponse, 0, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - - logger.info("--> highlighting and searching on field1 and field2 - require field match set to false"); - source = searchSource() - .query(termQuery("field1", "fox")) - .highlight(highlight() - .field(new HighlightBuilder.Field("field1").requireFieldMatch(false).preTags("").postTags("")) - .field(new HighlightBuilder.Field("field2").requireFieldMatch(false).preTags("").postTags(""))); - - searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - - assertHighlight(searchResponse, 0, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - - //field2 is now returned highlighted thanks to require_field_match set to false - assertHighlight(searchResponse, 0, "field2", 0, equalTo("The quick brown fox jumps over the lazy 
dog.")); - assertHighlight(searchResponse, 0, "field2", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field2", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - logger.info("--> highlighting and searching on field1 and field2 via multi_match query"); - final MultiMatchQueryBuilder mmquery = multiMatchQuery("fox", "field1", "field2").type(RandomPicks.randomFrom(getRandom(), MultiMatchQueryBuilder.Type.values())); - source = searchSource() - .query(mmquery) - .highlight(highlight().highlightQuery(randomBoolean() ? mmquery : null) - .field(new HighlightBuilder.Field("field1").requireFieldMatch(true).preTags("").postTags("")) - .field(new HighlightBuilder.Field("field2").requireFieldMatch(true).preTags("").postTags(""))); - searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - assertHitCount(searchResponse, 1l); - - assertHighlight(searchResponse, 0, "field1", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - assertHighlight(searchResponse, 0, "field1", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - //field2 is now returned highlighted thanks to the multi_match query on both fields - assertHighlight(searchResponse, 0, "field2", 0, equalTo("The quick brown fox jumps over the lazy dog.")); - assertHighlight(searchResponse, 0, "field2", 1, equalTo("The lazy red fox jumps over the quick dog.")); - assertHighlight(searchResponse, 0, "field2", 2, 3, equalTo("The quick brown dog jumps over the lazy fox.")); - } - @Test public void testMultiMatchQueryHighlight() throws IOException { String[] highlighterTypes = new String[] {"fvh", "plain", "postings"}; @@ -2098,14 +1997,22 @@ public void testMultiMatchQueryHighlight() throws IOException { refresh(); final int iters = scaledRandomIntBetween(20, 30); for (int i = 0; i < iters; i++) { - 
MultiMatchQueryBuilder.Type matchQueryType = rarely() ? null : RandomPicks.randomFrom(getRandom(), MultiMatchQueryBuilder.Type.values()); + String highlighterType = rarely() ? null : RandomPicks.randomFrom(getRandom(), highlighterTypes); + MultiMatchQueryBuilder.Type[] supportedQueryTypes; + if ("postings".equals(highlighterType)) { + //phrase_prefix is not supported by postings highlighter, as it rewrites against an empty reader, the prefix will never match any term + supportedQueryTypes = new MultiMatchQueryBuilder.Type[]{MultiMatchQueryBuilder.Type.BEST_FIELDS, MultiMatchQueryBuilder.Type.CROSS_FIELDS, MultiMatchQueryBuilder.Type.MOST_FIELDS, MultiMatchQueryBuilder.Type.PHRASE}; + } else { + supportedQueryTypes = MultiMatchQueryBuilder.Type.values(); + } + MultiMatchQueryBuilder.Type matchQueryType = rarely() ? null : RandomPicks.randomFrom(getRandom(), supportedQueryTypes); final MultiMatchQueryBuilder multiMatchQueryBuilder = multiMatchQuery("the quick brown fox", "field1", "field2").type(matchQueryType); - String type = rarely() ? null : RandomPicks.randomFrom(getRandom(),highlighterTypes); + SearchSourceBuilder source = searchSource() .query(multiMatchQueryBuilder) - .highlight(highlight().highlightQuery(randomBoolean() ? multiMatchQueryBuilder : null).highlighterType(type) + .highlight(highlight().highlightQuery(randomBoolean() ? 
multiMatchQueryBuilder : null).highlighterType(highlighterType) .field(new Field("field1").requireFieldMatch(true).preTags("").postTags(""))); - logger.info("Running multi-match type: [" + matchQueryType + "] highlight with type: [" + type + "]"); + logger.info("Running multi-match type: [" + matchQueryType + "] highlight with type: [" + highlighterType + "]"); SearchResponse searchResponse = client().search(searchRequest("test").source(source)).actionGet(); assertHitCount(searchResponse, 1l); assertHighlight(searchResponse, 0, "field1", 0, anyOf(equalTo("The quick brown fox jumps over"), @@ -2140,16 +2047,6 @@ public void testPostingsHighlighterOrderByScore() throws Exception { assertThat(field1.fragments()[2].string(), equalTo("This is the second value's first sentence.")); assertThat(field1.fragments()[3].string(), equalTo("This sentence contains one match, not that short.")); assertThat(field1.fragments()[4].string(), equalTo("One sentence match here and scored lower since the text is quite long, not that appealing.")); - - //lets use now number_of_fragments = 0, so that we highlight per value without breaking them into snippets, but we sort the values by score - source = searchSource() - .query(termQuery("field1", "sentence")) - .highlight(highlight().field("field1", -1, 0).order("score")); - - searchResponse = client().search(searchRequest("test").source(source)).actionGet(); - assertHighlight(searchResponse, 0, "field1", 0, equalTo("This is the second value's first sentence. This one contains no matches. This sentence contains three sentence occurrences (sentence).")); - assertHighlight(searchResponse, 0, "field1", 1, equalTo("This sentence contains one match, not that short. This sentence contains two sentence matches. This one contains no matches.")); - assertHighlight(searchResponse, 0, "field1", 2, 3, equalTo("One sentence match here and scored lower since the text is quite long, not that appealing. 
This one contains no matches.")); } @Test @@ -2261,25 +2158,24 @@ public void testPostingsHighlighterShouldFailIfNoOffsets() throws Exception { assertNoFailures(search); assertFailures(client().prepareSearch() - .setQuery(matchQuery("title", "this is a test")) - .addHighlightedField("title") - .setHighlighterType("postings-highlighter"), + .setQuery(matchQuery("title", "this is a test")) + .addHighlightedField("title") + .setHighlighterType("postings-highlighter"), RestStatus.BAD_REQUEST, containsString("the field [title] should be indexed with positions and offsets in the postings list to be used with postings highlighter")); - assertFailures(client().prepareSearch() - .setQuery(matchQuery("title", "this is a test")) - .addHighlightedField("title") - .setHighlighterType("postings"), + .setQuery(matchQuery("title", "this is a test")) + .addHighlightedField("title") + .setHighlighterType("postings"), RestStatus.BAD_REQUEST, containsString("the field [title] should be indexed with positions and offsets in the postings list to be used with postings highlighter")); assertFailures(client().prepareSearch() - .setQuery(matchQuery("title", "this is a test")) - .addHighlightedField("tit*") - .setHighlighterType("postings"), + .setQuery(matchQuery("title", "this is a test")) + .addHighlightedField("tit*") + .setHighlighterType("postings"), RestStatus.BAD_REQUEST, containsString("the field [title] should be indexed with positions and offsets in the postings list to be used with postings highlighter")); } @@ -2317,9 +2213,8 @@ public void testPostingsHighlighterCommonTermsQuery() throws IOException { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); } - public XContentBuilder type1PostingsffsetsMapping() throws IOException { + private static XContentBuilder type1PostingsffsetsMapping() throws IOException { return XContentFactory.jsonBuilder().startObject().startObject("type1") - .startObject("_all").field("store", 
"yes").field("index_options", "offsets").endObject() .startObject("properties") .startObject("field1").field("type", "string").field("index_options", "offsets").endObject() .startObject("field2").field("type", "string").field("index_options", "offsets").endObject() @@ -2327,9 +2222,6 @@ public XContentBuilder type1PostingsffsetsMapping() throws IOException { .endObject().endObject(); } - private static final String[] REWRITE_METHODS = new String[]{"constant_score_auto", "scoring_boolean", "constant_score_boolean", - "constant_score_filter", "top_terms_boost_50", "top_terms_50"}; - @Test public void testPostingsHighlighterPrefixQuery() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping())); @@ -2339,7 +2231,7 @@ public void testPostingsHighlighterPrefixQuery() throws Exception { refresh(); logger.info("--> highlighting and searching on field2"); - SearchSourceBuilder source = searchSource().query(prefixQuery("field2", "qui").rewrite(randomFrom(REWRITE_METHODS))) + SearchSourceBuilder source = searchSource().query(prefixQuery("field2", "qui")) .highlight(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); @@ -2369,7 +2261,7 @@ public void testPostingsHighlighterRegexpQuery() throws Exception { client().prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! 
Second sentence.").get(); refresh(); logger.info("--> highlighting and searching on field2"); - SearchSourceBuilder source = searchSource().query(regexpQuery("field2", "qu[a-l]+k").rewrite(randomFrom(REWRITE_METHODS))) + SearchSourceBuilder source = searchSource().query(regexpQuery("field2", "qu[a-l]+k")) .highlight(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); @@ -2384,13 +2276,13 @@ public void testPostingsHighlighterWildcardQuery() throws Exception { client().prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! Second sentence.").get(); refresh(); logger.info("--> highlighting and searching on field2"); - SearchSourceBuilder source = searchSource().query(wildcardQuery("field2", "qui*").rewrite(randomFrom(REWRITE_METHODS))) + SearchSourceBuilder source = searchSource().query(wildcardQuery("field2", "qui*")) .highlight(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); - source = searchSource().query(wildcardQuery("field2", "qu*k").rewrite(randomFrom(REWRITE_METHODS))) + source = searchSource().query(wildcardQuery("field2", "qu*k")) .highlight(highlight().field("field2")); searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHitCount(searchResponse, 1l); @@ -2421,7 +2313,7 @@ public void testPostingsHighlighterQueryString() throws Exception { client().prepareIndex("test", "type1").setSource("field1", "this is a test", "field2", "The quick brown fox jumps over the lazy dog! 
Second sentence.").get(); refresh(); logger.info("--> highlighting and searching on field2"); - SearchSourceBuilder source = searchSource().query(queryStringQuery("qui*").defaultField("field2").rewrite(randomFrom(REWRITE_METHODS))) + SearchSourceBuilder source = searchSource().query(queryStringQuery("qui*").defaultField("field2")) .highlight(highlight().field("field2")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog!")); @@ -2437,7 +2329,7 @@ public void testPostingsHighlighterRegexpQueryWithinConstantScoreQuery() throws refresh(); logger.info("--> highlighting and searching on field1"); - SearchSourceBuilder source = searchSource().query(constantScoreQuery(regexpQuery("field1", "pho[a-z]+").rewrite(randomFrom(REWRITE_METHODS)))) + SearchSourceBuilder source = searchSource().query(constantScoreQuery(regexpQuery("field1", "pho[a-z]+"))) .highlight(highlight().field("field1")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The photography word will get highlighted")); @@ -2456,7 +2348,7 @@ public void testPostingsHighlighterMultiTermQueryMultipleLevels() throws Excepti SearchSourceBuilder source = searchSource().query(boolQuery() .should(constantScoreQuery(QueryBuilders.missingQuery("field1"))) .should(matchQuery("field1", "test")) - .should(filteredQuery(queryStringQuery("field1:photo*").rewrite(randomFrom(REWRITE_METHODS)), null))) + .should(filteredQuery(queryStringQuery("field1:photo*"), null))) .highlight(highlight().field("field1")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The photography word will get highlighted")); @@ -2472,7 +2364,7 @@ public void 
testPostingsHighlighterPrefixQueryWithinBooleanQuery() throws Except refresh(); logger.info("--> highlighting and searching on field1"); - SearchSourceBuilder source = searchSource().query(boolQuery().must(prefixQuery("field1", "photo").rewrite(randomFrom(REWRITE_METHODS))).should(matchQuery("field1", "test").minimumShouldMatch("0"))) + SearchSourceBuilder source = searchSource().query(boolQuery().must(prefixQuery("field1", "photo")).should(matchQuery("field1", "test").minimumShouldMatch("0"))) .highlight(highlight().field("field1")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The photography word will get highlighted")); @@ -2488,7 +2380,7 @@ public void testPostingsHighlighterQueryStringWithinFilteredQuery() throws Excep refresh(); logger.info("--> highlighting and searching on field1"); - SearchSourceBuilder source = searchSource().query(filteredQuery(queryStringQuery("field1:photo*").rewrite(randomFrom(REWRITE_METHODS)), missingQuery("field_null"))) + SearchSourceBuilder source = searchSource().query(filteredQuery(queryStringQuery("field1:photo*"), missingQuery("field_null"))) .highlight(highlight().field("field1")); SearchResponse searchResponse = client().prepareSearch("test").setSource(source.buildAsBytes()).get(); assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The photography word will get highlighted"));