Skip to content

Commit

Permalink
HSEARCH-3103 Implement sort on multi-valued text fields
Browse files Browse the repository at this point in the history
  • Loading branch information
yrodiere committed Mar 19, 2020
1 parent 130f8dc commit 0e90a45
Show file tree
Hide file tree
Showing 8 changed files with 359 additions and 46 deletions.
@@ -0,0 +1,304 @@
/*
* Hibernate Search, full-text search for your domain model
*
* License: GNU Lesser General Public License (LGPL), version 2.1 or later
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
*/
package org.hibernate.search.backend.lucene.lowlevel.docvalues.impl;

import java.io.IOException;
import java.util.Objects;

import org.hibernate.search.backend.lucene.lowlevel.join.impl.JoinFirstChildIdIterator;
import org.hibernate.search.backend.lucene.lowlevel.join.impl.NestedDocsProvider;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;

/**
* A source of {@link org.apache.lucene.index.SortedDocValues} (text doc values) with multiple values per document,
* where multiple values are "aggregated" into a single value
* according to a given {@link MultiValueMode}.
*/
public abstract class TextMultiValuesToSingleValuesSource {

/**
 * Builds a {@link TextMultiValuesToSingleValuesSource} backed by the sorted-set doc values of a text field.
 *
 * @param field the name of the field whose doc values are read
 * @param mode the mode used to reduce the multiple values of a document to a single one
 * @param nested the provider of parent/child documents, or {@code null},
 * in which case the values of each document are aggregated directly
 * @return a new {@link TextMultiValuesToSingleValuesSource} for the given field
 */
public static TextMultiValuesToSingleValuesSource fromField(String field, MultiValueMode mode, NestedDocsProvider nested) {
	TextMultiValuesToSingleValuesSource source = new FieldMultiValuesToSingleValuesSource( field, mode, nested );
	return source;
}

/** The mode used to pick one value among the multiple values of a document. */
protected final MultiValueMode mode;
/** Provides parent and child documents; when {@code null}, values are aggregated per document without any join. */
protected final NestedDocsProvider nestedDocsProvider;

/**
 * @param mode the mode used to reduce multiple values to a single one
 * @param nestedDocsProvider the provider of parent/child documents, or {@code null} to aggregate per document
 */
public TextMultiValuesToSingleValuesSource(MultiValueMode mode, NestedDocsProvider nestedDocsProvider) {
	this.nestedDocsProvider = nestedDocsProvider;
	this.mode = mode;
}

/**
 * Two sources are equal when they are of the exact same class
 * and share the same mode and the same nested docs provider.
 */
@Override
public boolean equals(Object o) {
	if ( o == this ) {
		return true;
	}
	if ( o == null ) {
		return false;
	}
	if ( o.getClass() != getClass() ) {
		return false;
	}
	TextMultiValuesToSingleValuesSource other = (TextMultiValuesToSingleValuesSource) o;
	if ( !Objects.equals( mode, other.mode ) ) {
		return false;
	}
	return Objects.equals( nestedDocsProvider, other.nestedDocsProvider );
}

/**
 * Hash code derived from the mode and the nested docs provider, consistent with {@link #equals(Object)}.
 */
@Override
public int hashCode() {
	// Same 31-based fold, in the same order, as Objects.hash( mode, nestedDocsProvider ).
	int result = 1;
	result = 31 * result + Objects.hashCode( mode );
	result = 31 * result + Objects.hashCode( nestedDocsProvider );
	return result;
}

/**
 * Returns single-valued doc values for the given segment,
 * where the multiple values of each document have been reduced to one according to {@link #mode}.
 * <p>
 * Without a nested docs provider, the values of each document are aggregated directly;
 * with one, the values of the child documents are aggregated for each parent document.
 *
 * @param ctx the leaf reader context of the segment
 * @return the aggregated doc values
 * @throws IOException if retrieving the underlying doc values or the parent/child documents fails
 */
public SortedDocValues getValues(LeafReaderContext ctx) throws IOException {
	final SortedSetDocValues multiValued = getSortedSetDocValues( ctx );

	if ( nestedDocsProvider != null ) {
		final BitSet parents = nestedDocsProvider.parentDocs( ctx );
		final DocIdSetIterator children = nestedDocsProvider.childDocs( ctx );
		// No limit on the number of children taken into account.
		return select( multiValued, parents, children, Integer.MAX_VALUE );
	}

	return select( multiValued );
}

/**
 * Retrieves the multi-valued (sorted-set) doc values to aggregate for one index segment.
 *
 * @param ctx the leaf reader context of the segment
 * @return the doc values, which {@link #getValues(LeafReaderContext)} passes to the {@code select} methods
 * @throws IOException declared so that implementations can propagate I/O failures from the index reader
 */
protected abstract SortedSetDocValues getSortedSetDocValues(LeafReaderContext ctx) throws IOException;

/**
 * Turns multi-valued doc values into single-valued ones, without any parent/child join.
 * <p>
 * If the given doc values merely wrap single-valued doc values, the wrapped instance is returned as-is.
 * Otherwise the returned doc values expose, for each document,
 * the ordinal selected by {@link #pick(SortedSetDocValues)}.
 * The returned instance only supports {@code advanceExact}-style access.
 *
 * @param values the multi-valued doc values
 * @return single-valued doc values
 */
protected SortedDocValues select(SortedSetDocValues values) {
	final SortedDocValues singleton = DocValues.unwrapSingleton( values );
	if ( singleton != null ) {
		// Single-valued under the hood: nothing to aggregate.
		return singleton;
	}

	return new SortedSetDocValuesToSortedDocValuesWrapper( values ) {
		int docID = -1;
		int lastEmittedOrd = -1;

		@Override
		public int ordValue() {
			return lastEmittedOrd;
		}

		@Override
		public int docID() {
			return docID;
		}

		@Override
		public boolean advanceExact(int doc) throws IOException {
			// Lucene's contract: after advanceExact(target), docID() returns target,
			// whether or not the target has a value. So record it unconditionally.
			docID = doc;
			if ( !values.advanceExact( doc ) ) {
				return false;
			}
			// The wrapper's constructor guarantees that ordinals fit in an int.
			lastEmittedOrd = (int) pick( values );
			return true;
		}
	};
}

/**
 * Turns multi-valued doc values of child documents into single-valued doc values of parent documents.
 * <p>
 * For each parent document, the exposed ordinal is the one selected by
 * {@link #pick(SortedSetDocValues, DocIdSetIterator, int, int, int)} among the values of its children.
 * The returned instance only supports {@code advanceExact}-style access,
 * and parent documents must be visited in non-decreasing order.
 *
 * @param values the multi-valued doc values of the child documents
 * @param parentDocs the parent documents; if {@code null}, empty doc values are returned
 * @param childDocs the child documents; if {@code null}, empty doc values are returned
 * @param maxChildren the maximum number of children with a value to take into account for each parent
 * @return single-valued doc values, to be advanced on parent documents
 */
protected SortedDocValues select(final SortedSetDocValues values, final BitSet parentDocs,
		final DocIdSetIterator childDocs, int maxChildren) {
	if ( parentDocs == null || childDocs == null ) {
		return DocValues.emptySorted();
	}

	JoinFirstChildIdIterator joinIterator = new JoinFirstChildIdIterator( parentDocs, childDocs, values );

	return new SortedSetDocValuesToSortedDocValuesWrapper( values ) {
		int docID = -1;
		int lastSeenParentDoc = -1;
		int lastEmittedOrd = -1;

		@Override
		public int ordValue() {
			return lastEmittedOrd;
		}

		@Override
		public int docID() {
			return docID;
		}

		@Override
		public boolean advanceExact(int parentDoc) throws IOException {
			assert parentDoc >= lastSeenParentDoc : "can only evaluate current and upcoming parent docs";
			// Lucene's contract: after advanceExact(target), docID() returns target,
			// whether or not the target has a value. So record it unconditionally.
			docID = parentDoc;
			if ( parentDoc == lastSeenParentDoc ) {
				// Already evaluated successfully: lastEmittedOrd is still valid.
				return true;
			}

			int nextChildWithValue = joinIterator.advance( parentDoc );
			if ( nextChildWithValue == JoinFirstChildIdIterator.NO_CHILD_WITH_VALUE ) {
				// No child of this parent has a value
				return false;
			}

			// Only remember parents that do have a value, so that the short-circuit above stays correct.
			lastSeenParentDoc = parentDoc;
			// The wrapper's constructor guarantees that ordinals fit in an int.
			lastEmittedOrd = (int) pick( values, childDocs, nextChildWithValue, parentDoc, maxChildren );
			return true;
		}
	};
}

/**
 * Selects one ordinal among those of the document the given doc values are currently positioned on.
 * <p>
 * All remaining ordinals of the current document are consumed.
 *
 * @param values the doc values, already positioned on a document
 * @return the smallest ordinal for {@code MIN} ({@link Long#MAX_VALUE} if there is none),
 * the largest ordinal for {@code MAX} ({@link Long#MIN_VALUE} if there is none)
 * @throws IOException if reading the ordinals fails
 * @throws IllegalArgumentException if the mode is neither {@code MIN} nor {@code MAX}
 */
protected long pick(SortedSetDocValues values) throws IOException {
	switch ( mode ) {
		case MIN: {
			long smallest = Long.MAX_VALUE;
			long ord = values.nextOrd();
			while ( ord != SortedSetDocValues.NO_MORE_ORDS ) {
				if ( ord < smallest ) {
					smallest = ord;
				}
				ord = values.nextOrd();
			}
			return smallest;
		}
		case MAX: {
			long largest = Long.MIN_VALUE;
			long ord = values.nextOrd();
			while ( ord != SortedSetDocValues.NO_MORE_ORDS ) {
				if ( ord > largest ) {
					largest = ord;
				}
				ord = values.nextOrd();
			}
			return largest;
		}
		default:
			throw new IllegalArgumentException( "Unsupported sort mode: " + mode );
	}
}

/**
 * Selects one ordinal among the values of a range of child documents.
 * <p>
 * Iteration starts at {@code startDoc}, continues through {@code docItr.nextDoc()},
 * and stops at the first document that is not lower than {@code endDoc},
 * or as soon as more than {@code maxChildren} documents with a value have been encountered.
 *
 * @param values the multi-valued doc values of the child documents
 * @param docItr the iterator over child documents, used to move past {@code startDoc}
 * @param startDoc the first child document to consider
 * @param endDoc the exclusive upper bound of the child documents to consider
 * @param maxChildren the maximum number of child documents with a value to take into account
 * @return the smallest ordinal for {@code MIN} ({@link Long#MAX_VALUE} if there is none),
 * the largest ordinal for {@code MAX} ({@link Long#MIN_VALUE} if there is none)
 * @throws IOException if reading the doc values or advancing the iterator fails
 * @throws IllegalArgumentException if the mode is neither {@code MIN} nor {@code MAX}
 */
protected long pick(SortedSetDocValues values, DocIdSetIterator docItr, int startDoc, int endDoc,
		int maxChildren) throws IOException {
	switch ( mode ) {
		case MIN: {
			long smallest = Long.MAX_VALUE;
			int childrenWithValue = 0;
			for ( int child = startDoc; child < endDoc; child = docItr.nextDoc() ) {
				if ( !values.advanceExact( child ) ) {
					continue;
				}
				childrenWithValue++;
				if ( childrenWithValue > maxChildren ) {
					break;
				}
				long ord;
				while ( ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS ) {
					smallest = Math.min( smallest, ord );
				}
			}
			return smallest;
		}
		case MAX: {
			long largest = Long.MIN_VALUE;
			int childrenWithValue = 0;
			for ( int child = startDoc; child < endDoc; child = docItr.nextDoc() ) {
				if ( !values.advanceExact( child ) ) {
					continue;
				}
				childrenWithValue++;
				if ( childrenWithValue > maxChildren ) {
					break;
				}
				long ord;
				while ( ( ord = values.nextOrd() ) != SortedSetDocValues.NO_MORE_ORDS ) {
					largest = Math.max( largest, ord );
				}
			}
			return largest;
		}
		default:
			throw new IllegalArgumentException( "Unsupported sort mode: " + mode );
	}
}

/**
 * A {@link TextMultiValuesToSingleValuesSource} reading its values
 * from the sorted-set doc values of a given field.
 */
private static class FieldMultiValuesToSingleValuesSource extends TextMultiValuesToSingleValuesSource {

	/** The name of the field whose doc values are read. */
	private final String field;

	public FieldMultiValuesToSingleValuesSource(String field, MultiValueMode mode, NestedDocsProvider nestedDocsProvider) {
		super( mode, nestedDocsProvider );
		this.field = field;
	}

	@Override
	public String toString() {
		StringBuilder description = new StringBuilder( "text(" );
		description.append( field ).append( "," );
		description.append( mode ).append( "," );
		description.append( nestedDocsProvider ).append( ")" );
		return description.toString();
	}

	@Override
	public boolean equals(Object o) {
		// super.equals() covers identity, null and the exact-class check,
		// so the cast is safe whenever it returns true.
		return super.equals( o )
				&& Objects.equals( field, ( (FieldMultiValuesToSingleValuesSource) o ).field );
	}

	@Override
	public int hashCode() {
		// Same 31-based fold as Objects.hash( super.hashCode(), field ).
		int result = 31 + super.hashCode();
		return 31 * result + Objects.hashCode( field );
	}

	@Override
	protected SortedSetDocValues getSortedSetDocValues(LeafReaderContext ctx) throws IOException {
		return DocValues.getSortedSet( ctx.reader(), field );
	}
}

/**
 * Base class for {@link SortedDocValues} views over {@link SortedSetDocValues}.
 * <p>
 * Term lookups and the value count are delegated to the wrapped doc values.
 * Subclasses provide {@code docID()}, {@code ordValue()} and {@code advanceExact(int)};
 * sequential iteration ({@code nextDoc()}, {@code advance(int)}) and {@code cost()} are not supported.
 */
private abstract static class SortedSetDocValuesToSortedDocValuesWrapper extends SortedDocValues {

	private final SortedSetDocValues delegate;

	/**
	 * @param delegate the multi-valued doc values to wrap
	 * @throws IllegalStateException if the delegate holds more distinct values than an {@code int} can index
	 */
	SortedSetDocValuesToSortedDocValuesWrapper(SortedSetDocValues delegate) {
		final long valueCount = delegate.getValueCount();
		if ( valueCount > Integer.MAX_VALUE ) {
			// We may want to remove this limitation?
			// It would require defining our own FieldComparator mimicking TermOrdValComparator, which is pretty complex...
			// Note that single-valued text docvalues are limited to that many different terms anyway,
			// so this is no worse than the "legacy" sorts on single-valued text fields.
			throw new IllegalStateException( "Cannot sort when more than " + Integer.MAX_VALUE + " terms are indexed" );
		}
		this.delegate = delegate;
	}

	@Override
	public int getValueCount() {
		// Safe narrowing: the constructor rejected delegates whose count exceeded Integer.MAX_VALUE.
		long count = delegate.getValueCount();
		return (int) count;
	}

	@Override
	public BytesRef lookupOrd(int ord) throws IOException {
		BytesRef term = delegate.lookupOrd( ord );
		return term;
	}

	@Override
	public int nextDoc() {
		throw new UnsupportedOperationException();
	}

	@Override
	public int advance(int target) {
		throw new UnsupportedOperationException();
	}

	@Override
	public long cost() {
		throw new UnsupportedOperationException();
	}

}

}
Expand Up @@ -16,7 +16,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
Expand Down Expand Up @@ -65,7 +65,7 @@ public void encode(LuceneDocumentBuilder documentBuilder, String absoluteFieldPa
if ( sortable || aggregable ) {
BytesRef normalized = normalize( absoluteFieldPath, value );
if ( sortable ) {
documentBuilder.addField( new SortedDocValuesField( absoluteFieldPath, normalized ) );
documentBuilder.addField( new SortedSetDocValuesField( absoluteFieldPath, normalized ) );
}
if ( aggregable ) {
documentBuilder.addField( new SortedSetDocValuesFacetField( absoluteFieldPath, normalized.utf8ToString() ) );
Expand Down
Expand Up @@ -8,8 +8,9 @@

import java.io.IOException;

import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.DocValuesJoin;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.MultiValueMode;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.ReplaceMissingSortedDocValues;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.TextMultiValuesToSingleValuesSource;
import org.hibernate.search.backend.lucene.types.sort.impl.SortMissingValue;

import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -20,20 +21,24 @@
public class LuceneTextFieldComparatorSource extends LuceneFieldComparatorSource {

private final Object missingValue;
private final MultiValueMode multiValueMode;

public LuceneTextFieldComparatorSource(String nestedDocumentPath, Object missingValue) {
public LuceneTextFieldComparatorSource(String nestedDocumentPath, Object missingValue, MultiValueMode multiValueMode) {
super( nestedDocumentPath );
this.missingValue = missingValue;
this.multiValueMode = multiValueMode;
}

@Override
public FieldComparator<?> newComparator(String fieldname, int numHits, int sortPos, boolean reversed) {
final boolean sortMissingLast = missingLast() ^ reversed;
TextMultiValuesToSingleValuesSource source =
TextMultiValuesToSingleValuesSource.fromField( fieldname, multiValueMode, nestedDocsProvider );

return new FieldComparator.TermOrdValComparator( numHits, fieldname, sortMissingLast ) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) throws IOException {
SortedDocValues sortedDocValues = DocValuesJoin.getJoinedAsSingleValuedSorted( context, field, nestedDocsProvider );
SortedDocValues sortedDocValues = source.getValues( context );

if ( missingValue == null || missingFirst() || missingLast() ) {
return sortedDocValues;
Expand Down
Expand Up @@ -9,6 +9,7 @@
import java.lang.invoke.MethodHandles;

import org.hibernate.search.backend.lucene.logging.impl.Log;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.MultiValueMode;
import org.hibernate.search.backend.lucene.scope.model.impl.LuceneCompatibilityChecker;
import org.hibernate.search.backend.lucene.search.impl.LuceneSearchContext;
import org.hibernate.search.backend.lucene.search.sort.impl.AbstractLuceneSearchSortBuilder;
Expand Down Expand Up @@ -121,6 +122,30 @@ protected final EventContext getEventContext() {
return EventContexts.fromIndexFieldAbsolutePath( absoluteFieldPath );
}

/**
 * Translates the sort mode of this builder into the corresponding {@link MultiValueMode}.
 *
 * @return the matching {@link MultiValueMode}, or {@link MultiValueMode#MIN} when no mode was set
 */
protected final MultiValueMode getMultiValueMode() {
	if ( mode == null ) {
		return MultiValueMode.MIN;
	}
	switch ( mode ) {
		case MAX:
			return MultiValueMode.MAX;
		case AVG:
			return MultiValueMode.AVG;
		case SUM:
			return MultiValueMode.SUM;
		case MEDIAN:
			return MultiValueMode.MEDIAN;
		case MIN:
		default:
			// MIN is also the fallback for any mode not listed above.
			return MultiValueMode.MIN;
	}
}

private DslConverter<?, ? extends F> getDslToIndexConverter(ValueConvert convert) {
switch ( convert ) {
case NO:
Expand Down

0 comments on commit 0e90a45

Please sign in to comment.