Skip to content

Commit

Permalink
HSEARCH-3786 Simplify the collection of child documents in the Lucene backend
Browse files Browse the repository at this point in the history

We still have the same problem as before (explained in HSEARCH-3786):
children are collected for all matching documents, not just topdocs,
which is ridiculously inefficient.

However, we will be fixing this problem in the next commits for
HSEARCH-3797 by differentiating collectors that must be applied to
all matching documents and collectors that must be applied to topdocs
only.
  • Loading branch information
yrodiere committed Jan 6, 2020
1 parent b109953 commit 638091d
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 178 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -531,9 +531,6 @@ SearchException conflictingIdentifierTypesForPredicate(ToDocumentIdentifierValue
value = "Invalid routing key: '%1$s'. Valid keys are: %2$s.")
SearchException invalidRoutingKeyForExplicitShardingStrategy(String invalidKey, Collection<String> validKeys);

@Message(id = ID_OFFSET_2 + 92, value = "Error fetching nested documents. Native Lucene query: '%1$s'.")
SearchException errorFetchingNestedDocuments(Query luceneQuery, @Cause Exception e);

@Message(id = ID_OFFSET_2 + 93, value = "Multiple conflicting nested document paths to build a projection for field '%1$s'. '%2$s' vs. '%3$s'.")
SearchException conflictingNestedDocumentPathsForProjection(String absoluteFieldPath, String nestedDocumentPath1, String nestedDocumentPath2, @Param EventContext context);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,69 +12,86 @@
import java.util.Map;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.common.impl.MetadataFields;
import org.hibernate.search.backend.lucene.lowlevel.join.impl.NestedDocsProvider;
import org.hibernate.search.util.common.AssertionFailure;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Weight;

public class ChildrenCollector implements Collector {
public class ChildrenCollector extends SimpleCollector {

private final Map<String, Set<Integer>> children = new HashMap<>();
private final NestedDocsProvider nestedDocsProvider;
private final Weight childrenWeight;

@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
return new FieldLeafCollector( context );
}
private int currentLeafDocBase;
private int currentLeafLastSeenParentDoc;
private DocIdSetIterator currentLeafChildDocs;

@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
private final Map<Integer, Set<Integer>> children = new HashMap<>();

public ChildrenCollector(IndexSearcher indexSearcher, NestedDocsProvider nestedDocsProvider) throws IOException {
this.childrenWeight = nestedDocsProvider.childDocsWeight( indexSearcher );
this.nestedDocsProvider = nestedDocsProvider;
}

public Map<String, Set<Integer>> getChildren() {
return children;
@Override
public String toString() {
final StringBuilder sb = new StringBuilder( "ChildrenCollector{" );
sb.append( "children=" ).append( children );
sb.append( '}' );
return sb.toString();
}

private class FieldLeafCollector implements LeafCollector {
@Override
public void collect(int parentDoc) throws IOException {
if ( currentLeafChildDocs == null ) {
return; // No children in this leaf
}

private final LeafReader reader;
private final BinaryDocValues docValues;
if ( parentDoc < currentLeafLastSeenParentDoc ) {
throw new AssertionFailure( "Collector.collect called in unexpected order" );
}

public FieldLeafCollector(LeafReaderContext context) throws IOException {
reader = context.reader();
docValues = DocValues.getBinary( reader, MetadataFields.rootIdFieldName() );
final int firstChildDoc;
if ( currentLeafChildDocs.docID() > currentLeafLastSeenParentDoc ) {
firstChildDoc = currentLeafChildDocs.docID();
}
else {
firstChildDoc = currentLeafChildDocs.advance( currentLeafLastSeenParentDoc + 1 );
}
currentLeafLastSeenParentDoc = parentDoc;

@Override
public void setScorer(Scorable scorer) throws IOException {
// we don't need any scorer
if ( firstChildDoc > parentDoc ) {
// No child
return;
}

@Override
public void collect(int doc) throws IOException {
if ( !docValues.advanceExact( doc ) ) {
return;
}

String parentId = docValues.binaryValue().utf8ToString();
if ( !children.containsKey( parentId ) ) {
children.put( parentId, new HashSet<>() );
}
children.get( parentId ).add( doc );
Set<Integer> childrenOfThisDoc = new HashSet<>();
children.put( parentDoc, childrenOfThisDoc );

for ( int childDoc = firstChildDoc; childDoc < parentDoc; childDoc = currentLeafChildDocs.nextDoc() ) {
childrenOfThisDoc.add( currentLeafDocBase + childDoc );
}
}

@Override
public String toString() {
final StringBuilder sb = new StringBuilder( "LuceneChildrenCollector{" );
sb.append( "children=" ).append( children );
sb.append( '}' );
return sb.toString();
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}

public Map<Integer, Set<Integer>> getChildren() {
return children;
}

@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
this.currentLeafDocBase = context.docBase;
this.currentLeafLastSeenParentDoc = -1;

this.currentLeafChildDocs = nestedDocsProvider.childDocs( childrenWeight, context );
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,6 @@ static <C extends Collector> CollectorKey<C> create() {

CollectorKey<TopDocsCollector> TOP_DOCS = create();

CollectorKey<ChildrenCollector> CHILDREN = create();

}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.io.IOException;
import java.util.Collections;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.query.impl.Queries;

Expand Down Expand Up @@ -37,8 +38,12 @@ public class NestedDocsProvider {
private final Query childQuery;

public NestedDocsProvider(String nestedDocumentPath, Query originalParentQuery) {
this( Collections.singleton( nestedDocumentPath ), originalParentQuery );
}

public NestedDocsProvider(Set<String> nestedDocumentPaths, Query originalParentQuery) {
this.parentFiler = new QueryBitSetProducer( originalParentQuery );
this.childQuery = Queries.findChildQuery( Collections.singleton( nestedDocumentPath ), originalParentQuery );
this.childQuery = Queries.findChildQuery( nestedDocumentPaths, originalParentQuery );
}

public BitSet parentDocs(LeafReaderContext context) throws IOException {
Expand All @@ -51,7 +56,15 @@ public DocIdSetIterator childDocs(LeafReaderContext context) throws IOException
// Maybe we can cache on shard-base. See Elasticsearch code.
IndexSearcher indexSearcher = new IndexSearcher( topLevelCtx );

Weight weight = indexSearcher.createWeight( indexSearcher.rewrite( childQuery ), ScoreMode.COMPLETE_NO_SCORES, 1f );
Weight weight = childDocsWeight( indexSearcher );
return childDocs( weight, context );
}

public Weight childDocsWeight(IndexSearcher indexSearcher) throws IOException {
return indexSearcher.createWeight( indexSearcher.rewrite( childQuery ), ScoreMode.COMPLETE_NO_SCORES, 1f );
}

public DocIdSetIterator childDocs(Weight weight, LeafReaderContext context) throws IOException {
Scorer s = weight.scorer( context );
return s == null ? null : s.iterator();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
package org.hibernate.search.backend.lucene.search.extraction.impl;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand All @@ -15,7 +16,6 @@
import java.util.Optional;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.collector.impl.HibernateSearchDocumentIdToLuceneDocIdMapCollector;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorExecutionContext;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorFactory;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
Expand All @@ -25,6 +25,7 @@
import org.hibernate.search.backend.lucene.search.timeout.impl.TimeoutManager;

import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
Expand Down Expand Up @@ -55,8 +56,9 @@ private ExtractionRequirements(Builder builder) {
storedFieldVisitor = builder.createStoredFieldVisitor();
}

public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
IndexReaderMetadataResolver metadataResolver, int maxDocs, TimeoutManager timeoutManager) {
public LuceneCollectors createCollectors(IndexSearcher indexSearcher, Query luceneQuery, Sort sort,
IndexReaderMetadataResolver metadataResolver, int maxDocs, TimeoutManager timeoutManager)
throws IOException {
TopDocsCollector<?> topDocsCollector = null;
Integer scoreSortFieldIndexForRescoring = null;
boolean requireFieldDocRescoring = false;
Expand Down Expand Up @@ -106,6 +108,7 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
luceneCollectors.put( CollectorKey.TOTAL_HIT_COUNT, totalHitCountCollector );

Map<String, NestedDocsProvider> nestedDocsProviders;
ChildrenCollector childrenCollector = null;
if ( requiredNestedDocumentExtractionPaths.isEmpty() ) {
nestedDocsProviders = Collections.emptyMap();
}
Expand All @@ -117,6 +120,11 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
new NestedDocsProvider( nestedDocumentPath, luceneQuery )
);
}

NestedDocsProvider nestedDocsProvider =
new NestedDocsProvider( requiredNestedDocumentExtractionPaths, luceneQuery );
childrenCollector = new ChildrenCollector( indexSearcher, nestedDocsProvider );
luceneCollectors.put( CollectorKey.CHILDREN, childrenCollector );
}

CollectorExecutionContext executionContext =
Expand All @@ -131,21 +139,12 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
MultiCollector.wrap( luceneCollectors.values() ), timeoutManager
);

ChildrenCollector childrenCollector = null;
Collector collectorForNestedDocuments = null;
if ( !requiredNestedDocumentExtractionPaths.isEmpty() ) {
childrenCollector = new ChildrenCollector();
collectorForNestedDocuments = wrapTimeLimitingCollectorIfNecessary(
childrenCollector, timeoutManager
);
}

return new LuceneCollectors(
indexSearcher,
luceneQuery,
requireFieldDocRescoring, scoreSortFieldIndexForRescoring,
requiredNestedDocumentExtractionPaths,
topDocsCollector, totalHitCountCollector, childrenCollector,
compositeCollector, collectorForNestedDocuments,
compositeCollector,
luceneCollectors,
timeoutManager
);
Expand Down Expand Up @@ -187,7 +186,6 @@ public <C extends Collector> void requireCollector(CollectorFactory<C> collector

public void requireNestedDocumentExtraction(String nestedDocumentPath) {
if ( nestedDocumentPath != null ) {
requireCollector( HibernateSearchDocumentIdToLuceneDocIdMapCollector.FACTORY );
this.requiredNestedDocumentExtractionPaths.add( nestedDocumentPath );
}
}
Expand Down
Loading

0 comments on commit 638091d

Please sign in to comment.