From feccb0b245bd901194bdf8f0d92769a19ed6ab4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoann=20Rodi=C3=A8re?= Date: Wed, 15 Apr 2020 17:01:38 +0200 Subject: [PATCH] HSEARCH-3894 Fix Lucene text aggregations: count each document only once per term This solution is probably not great performance-wise, but at least it will work. It'll have to do until someone has time to find something better. --- .../facet/impl/TextMultiValueFacetCounts.java | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/facet/impl/TextMultiValueFacetCounts.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/facet/impl/TextMultiValueFacetCounts.java index 8f98f17860f..aebfbe13332 100644 --- a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/facet/impl/TextMultiValueFacetCounts.java +++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/facet/impl/TextMultiValueFacetCounts.java @@ -13,6 +13,8 @@ import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.TextMultiValues; import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.TextMultiValuesSource; +import com.carrotsearch.hppc.IntHashSet; +import com.carrotsearch.hppc.procedures.IntProcedure; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; @@ -123,6 +125,7 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i // nothing to count return; } + IntHashSet uniqueOrdinalsForDocument = new IntHashSet(); DocIdSetIterator docs = hits.bits.iterator(); @@ -141,6 +144,7 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i int numSegOrds = (int) segValues.getValueCount(); if ( hits.totalHits < numSegOrds / 10 ) { + IntProcedure incrementCountForOrdinal = ord -> counts[ord]++; // Remap every ord to global ord as we iterate: for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) { if ( !segValues.advanceExact( doc ) ) { @@ -148,21 +152,27 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i } while ( segValues.hasNextValue() ) { int term = (int) segValues.nextOrd(); - counts[(int) ordMap.get( term )]++; + int globalOrd = (int) ordMap.get( term ); + uniqueOrdinalsForDocument.add( globalOrd ); } + uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal ); + uniqueOrdinalsForDocument.clear(); } } else { // First count in seg-ord space: final int[] segCounts = new int[numSegOrds]; + IntProcedure incrementCountForOrdinal = ord -> segCounts[ord]++; for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) { if ( !segValues.advanceExact( doc ) ) { continue; // No value for this doc } while ( segValues.hasNextValue() ) { int term = (int) segValues.nextOrd(); - segCounts[term]++; + uniqueOrdinalsForDocument.add( term ); } + uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal ); + uniqueOrdinalsForDocument.clear(); } // Then, migrate to global ords: @@ -176,15 +186,18 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i } else { // No ord mapping (e.g., single segment index): - // just aggregate directly into counts: + // just aggregate directly into counts. + IntProcedure incrementCountForOrdinal = ord -> counts[ord]++; for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) { if ( !segValues.advanceExact( doc ) ) { continue; // No value for this doc } while ( segValues.hasNextValue() ) { int term = (int) segValues.nextOrd(); - counts[term]++; + uniqueOrdinalsForDocument.add( term ); } + uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal ); + uniqueOrdinalsForDocument.clear(); } } }