Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HSEARCH-3103 Implement sort on multi-valued text fields
- Loading branch information
Showing
8 changed files
with
359 additions
and
46 deletions.
There are no files selected for viewing
304 changes: 304 additions & 0 deletions
304
...te/search/backend/lucene/lowlevel/docvalues/impl/TextMultiValuesToSingleValuesSource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,304 @@ | ||
/* | ||
* Hibernate Search, full-text search for your domain model | ||
* | ||
* License: GNU Lesser General Public License (LGPL), version 2.1 or later | ||
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>. | ||
*/ | ||
package org.hibernate.search.backend.lucene.lowlevel.docvalues.impl; | ||
|
||
import java.io.IOException; | ||
import java.util.Objects; | ||
|
||
import org.hibernate.search.backend.lucene.lowlevel.join.impl.JoinFirstChildIdIterator; | ||
import org.hibernate.search.backend.lucene.lowlevel.join.impl.NestedDocsProvider; | ||
|
||
import org.apache.lucene.index.DocValues; | ||
import org.apache.lucene.index.LeafReaderContext; | ||
import org.apache.lucene.index.SortedDocValues; | ||
import org.apache.lucene.index.SortedSetDocValues; | ||
import org.apache.lucene.search.DocIdSetIterator; | ||
import org.apache.lucene.util.BitSet; | ||
import org.apache.lucene.util.BytesRef; | ||
|
||
/** | ||
* A source of {@link org.apache.lucene.index.SortedDocValues} (text doc values) with multiple values per document, | ||
* where multiple values are "aggregated" into a single value | ||
* according to a given {@link MultiValueMode}. | ||
*/ | ||
public abstract class TextMultiValuesToSingleValuesSource { | ||
|
||
/** | ||
* Creates a {@link TextMultiValuesToSingleValuesSource} that wraps a text field | ||
* | ||
* @param field the field | ||
* @param mode the mode | ||
* @param nested the nested provider | ||
* @return DoubleMultiValuesSource | ||
*/ | ||
public static TextMultiValuesToSingleValuesSource fromField(String field, MultiValueMode mode, NestedDocsProvider nested) { | ||
return new FieldMultiValuesToSingleValuesSource( field, mode, nested ); | ||
} | ||
|
||
protected final MultiValueMode mode; | ||
protected final NestedDocsProvider nestedDocsProvider; | ||
|
||
public TextMultiValuesToSingleValuesSource(MultiValueMode mode, NestedDocsProvider nestedDocsProvider) { | ||
this.mode = mode; | ||
this.nestedDocsProvider = nestedDocsProvider; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if ( this == o ) { | ||
return true; | ||
} | ||
if ( o == null || getClass() != o.getClass() ) { | ||
return false; | ||
} | ||
TextMultiValuesToSingleValuesSource that = (TextMultiValuesToSingleValuesSource) o; | ||
return Objects.equals( mode, that.mode ) | ||
&& Objects.equals( nestedDocsProvider, that.nestedDocsProvider ); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash( mode, nestedDocsProvider ); | ||
} | ||
|
||
public SortedDocValues getValues(LeafReaderContext ctx) throws IOException { | ||
SortedSetDocValues values = getSortedSetDocValues( ctx ); | ||
|
||
if ( nestedDocsProvider == null ) { | ||
return select( values ); | ||
} | ||
|
||
final BitSet rootDocs = nestedDocsProvider.parentDocs( ctx ); | ||
final DocIdSetIterator innerDocs = nestedDocsProvider.childDocs( ctx ); | ||
return select( values, rootDocs, innerDocs, Integer.MAX_VALUE ); | ||
} | ||
|
||
protected abstract SortedSetDocValues getSortedSetDocValues(LeafReaderContext ctx) throws IOException; | ||
|
||
protected SortedDocValues select(SortedSetDocValues values) { | ||
final SortedDocValues singleton = DocValues.unwrapSingleton( values ); | ||
if ( singleton != null ) { | ||
return singleton; | ||
} | ||
else { | ||
return new SortedSetDocValuesToSortedDocValuesWrapper( values ) { | ||
int docID = -1; | ||
int lastEmittedOrd = -1; | ||
|
||
@Override | ||
public int ordValue() { | ||
return lastEmittedOrd; | ||
} | ||
|
||
@Override | ||
public int docID() { | ||
return docID; | ||
} | ||
|
||
@Override | ||
public boolean advanceExact(int doc) throws IOException { | ||
if ( values.advanceExact( doc ) ) { | ||
lastEmittedOrd = (int) pick( values ); | ||
docID = doc; | ||
return true; | ||
} | ||
return false; | ||
} | ||
}; | ||
} | ||
} | ||
|
||
protected SortedDocValues select(final SortedSetDocValues values, final BitSet parentDocs, | ||
final DocIdSetIterator childDocs, int maxChildren) { | ||
if ( parentDocs == null || childDocs == null ) { | ||
return DocValues.emptySorted(); | ||
} | ||
|
||
JoinFirstChildIdIterator joinIterator = new JoinFirstChildIdIterator( parentDocs, childDocs, values ); | ||
|
||
return new SortedSetDocValuesToSortedDocValuesWrapper( values ) { | ||
int docID = -1; | ||
int lastSeenParentDoc = -1; | ||
int lastEmittedOrd = -1; | ||
|
||
@Override | ||
public int ordValue() { | ||
return lastEmittedOrd; | ||
} | ||
|
||
@Override | ||
public int docID() { | ||
return docID; | ||
} | ||
|
||
@Override | ||
public boolean advanceExact(int parentDoc) throws IOException { | ||
assert parentDoc >= lastSeenParentDoc : "can only evaluate current and upcoming parent docs"; | ||
if ( parentDoc == lastSeenParentDoc ) { | ||
return true; | ||
} | ||
|
||
int nextChildWithValue = joinIterator.advance( parentDoc ); | ||
if ( nextChildWithValue == JoinFirstChildIdIterator.NO_CHILD_WITH_VALUE ) { | ||
// No child of this parent has a value | ||
return false; | ||
} | ||
|
||
docID = lastSeenParentDoc = parentDoc; | ||
lastEmittedOrd = (int) pick( values, childDocs, nextChildWithValue, parentDoc, maxChildren ); | ||
return true; | ||
} | ||
}; | ||
} | ||
|
||
protected long pick(SortedSetDocValues values) throws IOException { | ||
long result; | ||
|
||
switch ( mode ) { | ||
case MIN: { | ||
result = Long.MAX_VALUE; | ||
for ( long ord; ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS; ) { | ||
result = Math.min( result, ord ); | ||
} | ||
break; | ||
} | ||
case MAX: { | ||
result = Long.MIN_VALUE; | ||
for ( long ord; ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS; ) { | ||
result = Math.max( result, ord ); | ||
} | ||
break; | ||
} | ||
default: | ||
throw new IllegalArgumentException( "Unsupported sort mode: " + mode ); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
protected long pick(SortedSetDocValues values, DocIdSetIterator docItr, int startDoc, int endDoc, | ||
int maxChildren) throws IOException { | ||
long returnValue; | ||
|
||
switch ( mode ) { | ||
case MIN: { | ||
returnValue = Long.MAX_VALUE; | ||
int count = 0; | ||
for ( int doc = startDoc; doc < endDoc; doc = docItr.nextDoc() ) { | ||
if ( values.advanceExact( doc ) ) { | ||
if ( ++count > maxChildren ) { | ||
break; | ||
} | ||
for ( long ord; ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS; ) { | ||
returnValue = Math.min( returnValue, ord ); | ||
} | ||
} | ||
} | ||
break; | ||
} | ||
case MAX: { | ||
returnValue = Long.MIN_VALUE; | ||
int count = 0; | ||
for ( int doc = startDoc; doc < endDoc; doc = docItr.nextDoc() ) { | ||
if ( values.advanceExact( doc ) ) { | ||
if ( ++count > maxChildren ) { | ||
break; | ||
} | ||
for ( long ord; ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS; ) { | ||
returnValue = Math.max( returnValue, ord ); | ||
} | ||
} | ||
} | ||
break; | ||
} | ||
default: | ||
throw new IllegalArgumentException( "Unsupported sort mode: " + mode ); | ||
} | ||
|
||
return returnValue; | ||
} | ||
|
||
private static class FieldMultiValuesToSingleValuesSource extends TextMultiValuesToSingleValuesSource { | ||
|
||
private final String field; | ||
|
||
public FieldMultiValuesToSingleValuesSource(String field, MultiValueMode mode, NestedDocsProvider nestedDocsProvider) { | ||
super( mode, nestedDocsProvider ); | ||
this.field = field; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "text(" + field + "," + mode + "," + nestedDocsProvider + ")"; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if ( this == o ) { | ||
return true; | ||
} | ||
if ( !super.equals( o ) ) { | ||
return false; | ||
} | ||
FieldMultiValuesToSingleValuesSource that = (FieldMultiValuesToSingleValuesSource) o; | ||
return Objects.equals( field, that.field ); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash( super.hashCode(), field ); | ||
} | ||
|
||
@Override | ||
protected SortedSetDocValues getSortedSetDocValues(LeafReaderContext ctx) throws IOException { | ||
return DocValues.getSortedSet( ctx.reader(), field ); | ||
} | ||
} | ||
|
||
private abstract static class SortedSetDocValuesToSortedDocValuesWrapper extends SortedDocValues { | ||
|
||
private final SortedSetDocValues delegate; | ||
|
||
SortedSetDocValuesToSortedDocValuesWrapper(SortedSetDocValues delegate) { | ||
this.delegate = delegate; | ||
if ( delegate.getValueCount() > Integer.MAX_VALUE ) { | ||
// We may want to remove this limitation? | ||
// It would require defining our own FieldComparator mimicking TermOrdValComparator, which is pretty complex... | ||
// Note that single-valued text docvalues are limited to that many different terms anyway, | ||
// so this is no worse than the "legacy" sorts on single-valued text fields. | ||
throw new IllegalStateException( "Cannot sort when more than " + Integer.MAX_VALUE + " terms are indexed" ); | ||
} | ||
} | ||
|
||
@Override | ||
public int getValueCount() { | ||
return (int) delegate.getValueCount(); | ||
} | ||
|
||
@Override | ||
public BytesRef lookupOrd(int ord) throws IOException { | ||
return delegate.lookupOrd( ord ); | ||
} | ||
|
||
@Override | ||
public int nextDoc() { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
@Override | ||
public int advance(int target) { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
@Override | ||
public long cost() { | ||
throw new UnsupportedOperationException(); | ||
} | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.