Skip to content

Commit

Permalink
HSEARCH-3786 Simplify the collection of child documents in the Lucene backend
Browse files Browse the repository at this point in the history

We still have the same problem as before (explained in HSEARCH-3786):
children are collected for all matching documents, not just topdocs,
which is ridiculously inefficient.

However, we will be fixing this problem in the next commits for
HSEARCH-3797 by differentiating collectors that must be applied to
all matching documents and collectors that must be applied to topdocs
only.
  • Loading branch information
yrodiere committed Jan 6, 2020
1 parent b109953 commit 638091d
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 178 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -531,9 +531,6 @@ SearchException conflictingIdentifierTypesForPredicate(ToDocumentIdentifierValue
value = "Invalid routing key: '%1$s'. Valid keys are: %2$s.")
SearchException invalidRoutingKeyForExplicitShardingStrategy(String invalidKey, Collection<String> validKeys);

@Message(id = ID_OFFSET_2 + 92, value = "Error fetching nested documents. Native Lucene query: '%1$s'.")
SearchException errorFetchingNestedDocuments(Query luceneQuery, @Cause Exception e);

@Message(id = ID_OFFSET_2 + 93, value = "Multiple conflicting nested document paths to build a projection for field '%1$s'. '%2$s' vs. '%3$s'.")
SearchException conflictingNestedDocumentPathsForProjection(String absoluteFieldPath, String nestedDocumentPath1, String nestedDocumentPath2, @Param EventContext context);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,69 +12,86 @@
import java.util.Map;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.common.impl.MetadataFields;
import org.hibernate.search.backend.lucene.lowlevel.join.impl.NestedDocsProvider;
import org.hibernate.search.util.common.AssertionFailure;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Weight;

public class ChildrenCollector implements Collector {
public class ChildrenCollector extends SimpleCollector {

private final Map<String, Set<Integer>> children = new HashMap<>();
private final NestedDocsProvider nestedDocsProvider;
private final Weight childrenWeight;

@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
return new FieldLeafCollector( context );
}
private int currentLeafDocBase;
private int currentLeafLastSeenParentDoc;
private DocIdSetIterator currentLeafChildDocs;

@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
private final Map<Integer, Set<Integer>> children = new HashMap<>();

public ChildrenCollector(IndexSearcher indexSearcher, NestedDocsProvider nestedDocsProvider) throws IOException {
this.childrenWeight = nestedDocsProvider.childDocsWeight( indexSearcher );
this.nestedDocsProvider = nestedDocsProvider;
}

public Map<String, Set<Integer>> getChildren() {
return children;
@Override
public String toString() {
final StringBuilder sb = new StringBuilder( "ChildrenCollector{" );
sb.append( "children=" ).append( children );
sb.append( '}' );
return sb.toString();
}

private class FieldLeafCollector implements LeafCollector {
@Override
public void collect(int parentDoc) throws IOException {
if ( currentLeafChildDocs == null ) {
return; // No children in this leaf
}

private final LeafReader reader;
private final BinaryDocValues docValues;
if ( parentDoc < currentLeafLastSeenParentDoc ) {
throw new AssertionFailure( "Collector.collect called in unexpected order" );
}

public FieldLeafCollector(LeafReaderContext context) throws IOException {
reader = context.reader();
docValues = DocValues.getBinary( reader, MetadataFields.rootIdFieldName() );
final int firstChildDoc;
if ( currentLeafChildDocs.docID() > currentLeafLastSeenParentDoc ) {
firstChildDoc = currentLeafChildDocs.docID();
}
else {
firstChildDoc = currentLeafChildDocs.advance( currentLeafLastSeenParentDoc + 1 );
}
currentLeafLastSeenParentDoc = parentDoc;

@Override
public void setScorer(Scorable scorer) throws IOException {
// we don't need any scorer
if ( firstChildDoc > parentDoc ) {
// No child
return;
}

@Override
public void collect(int doc) throws IOException {
if ( !docValues.advanceExact( doc ) ) {
return;
}

String parentId = docValues.binaryValue().utf8ToString();
if ( !children.containsKey( parentId ) ) {
children.put( parentId, new HashSet<>() );
}
children.get( parentId ).add( doc );
Set<Integer> childrenOfThisDoc = new HashSet<>();
children.put( parentDoc, childrenOfThisDoc );

for ( int childDoc = firstChildDoc; childDoc < parentDoc; childDoc = currentLeafChildDocs.nextDoc() ) {
childrenOfThisDoc.add( currentLeafDocBase + childDoc );
}
}

@Override
public String toString() {
final StringBuilder sb = new StringBuilder( "LuceneChildrenCollector{" );
sb.append( "children=" ).append( children );
sb.append( '}' );
return sb.toString();
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}

public Map<Integer, Set<Integer>> getChildren() {
return children;
}

@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
this.currentLeafDocBase = context.docBase;
this.currentLeafLastSeenParentDoc = -1;

this.currentLeafChildDocs = nestedDocsProvider.childDocs( childrenWeight, context );
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,6 @@ static <C extends Collector> CollectorKey<C> create() {

CollectorKey<TopDocsCollector> TOP_DOCS = create();

CollectorKey<ChildrenCollector> CHILDREN = create();

}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.io.IOException;
import java.util.Collections;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.query.impl.Queries;

Expand Down Expand Up @@ -37,8 +38,12 @@ public class NestedDocsProvider {
private final Query childQuery;

public NestedDocsProvider(String nestedDocumentPath, Query originalParentQuery) {
this( Collections.singleton( nestedDocumentPath ), originalParentQuery );
}

public NestedDocsProvider(Set<String> nestedDocumentPaths, Query originalParentQuery) {
this.parentFiler = new QueryBitSetProducer( originalParentQuery );
this.childQuery = Queries.findChildQuery( Collections.singleton( nestedDocumentPath ), originalParentQuery );
this.childQuery = Queries.findChildQuery( nestedDocumentPaths, originalParentQuery );
}

public BitSet parentDocs(LeafReaderContext context) throws IOException {
Expand All @@ -51,7 +56,15 @@ public DocIdSetIterator childDocs(LeafReaderContext context) throws IOException
// Maybe we can cache on shard-base. See Elasticsearch code.
IndexSearcher indexSearcher = new IndexSearcher( topLevelCtx );

Weight weight = indexSearcher.createWeight( indexSearcher.rewrite( childQuery ), ScoreMode.COMPLETE_NO_SCORES, 1f );
Weight weight = childDocsWeight( indexSearcher );
return childDocs( weight, context );
}

public Weight childDocsWeight(IndexSearcher indexSearcher) throws IOException {
return indexSearcher.createWeight( indexSearcher.rewrite( childQuery ), ScoreMode.COMPLETE_NO_SCORES, 1f );
}

public DocIdSetIterator childDocs(Weight weight, LeafReaderContext context) throws IOException {
Scorer s = weight.scorer( context );
return s == null ? null : s.iterator();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
package org.hibernate.search.backend.lucene.search.extraction.impl;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand All @@ -15,7 +16,6 @@
import java.util.Optional;
import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.collector.impl.HibernateSearchDocumentIdToLuceneDocIdMapCollector;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorExecutionContext;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorFactory;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
Expand All @@ -25,6 +25,7 @@
import org.hibernate.search.backend.lucene.search.timeout.impl.TimeoutManager;

import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
Expand Down Expand Up @@ -55,8 +56,9 @@ private ExtractionRequirements(Builder builder) {
storedFieldVisitor = builder.createStoredFieldVisitor();
}

public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
IndexReaderMetadataResolver metadataResolver, int maxDocs, TimeoutManager timeoutManager) {
public LuceneCollectors createCollectors(IndexSearcher indexSearcher, Query luceneQuery, Sort sort,
IndexReaderMetadataResolver metadataResolver, int maxDocs, TimeoutManager timeoutManager)
throws IOException {
TopDocsCollector<?> topDocsCollector = null;
Integer scoreSortFieldIndexForRescoring = null;
boolean requireFieldDocRescoring = false;
Expand Down Expand Up @@ -106,6 +108,7 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
luceneCollectors.put( CollectorKey.TOTAL_HIT_COUNT, totalHitCountCollector );

Map<String, NestedDocsProvider> nestedDocsProviders;
ChildrenCollector childrenCollector = null;
if ( requiredNestedDocumentExtractionPaths.isEmpty() ) {
nestedDocsProviders = Collections.emptyMap();
}
Expand All @@ -117,6 +120,11 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
new NestedDocsProvider( nestedDocumentPath, luceneQuery )
);
}

NestedDocsProvider nestedDocsProvider =
new NestedDocsProvider( requiredNestedDocumentExtractionPaths, luceneQuery );
childrenCollector = new ChildrenCollector( indexSearcher, nestedDocsProvider );
luceneCollectors.put( CollectorKey.CHILDREN, childrenCollector );
}

CollectorExecutionContext executionContext =
Expand All @@ -131,21 +139,12 @@ public LuceneCollectors createCollectors(Query luceneQuery, Sort sort,
MultiCollector.wrap( luceneCollectors.values() ), timeoutManager
);

ChildrenCollector childrenCollector = null;
Collector collectorForNestedDocuments = null;
if ( !requiredNestedDocumentExtractionPaths.isEmpty() ) {
childrenCollector = new ChildrenCollector();
collectorForNestedDocuments = wrapTimeLimitingCollectorIfNecessary(
childrenCollector, timeoutManager
);
}

return new LuceneCollectors(
indexSearcher,
luceneQuery,
requireFieldDocRescoring, scoreSortFieldIndexForRescoring,
requiredNestedDocumentExtractionPaths,
topDocsCollector, totalHitCountCollector, childrenCollector,
compositeCollector, collectorForNestedDocuments,
compositeCollector,
luceneCollectors,
timeoutManager
);
Expand Down Expand Up @@ -187,7 +186,6 @@ public <C extends Collector> void requireCollector(CollectorFactory<C> collector

public void requireNestedDocumentExtraction(String nestedDocumentPath) {
if ( nestedDocumentPath != null ) {
requireCollector( HibernateSearchDocumentIdToLuceneDocIdMapCollector.FACTORY );
this.requiredNestedDocumentExtractionPaths.add( nestedDocumentPath );
}
}
Expand Down
Loading

0 comments on commit 638091d

Please sign in to comment.