Skip to content

Commit

Permalink
HSEARCH-3894 Fix Lucene text aggregations: count each document only once per term
Browse files Browse the repository at this point in the history

This solution is probably not great performance-wise, but at least it
will work. It'll have to do until someone has time to find something
better.
  • Loading branch information
yrodiere authored and fax4ever committed Apr 16, 2020
1 parent 297cfe9 commit feccb0b
Showing 1 changed file with 17 additions and 4 deletions.
Expand Up @@ -13,6 +13,8 @@
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.TextMultiValues;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.TextMultiValuesSource;

import com.carrotsearch.hppc.IntHashSet;
import com.carrotsearch.hppc.procedures.IntProcedure;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
Expand Down Expand Up @@ -123,6 +125,7 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i
// nothing to count
return;
}
IntHashSet uniqueOrdinalsForDocument = new IntHashSet();

DocIdSetIterator docs = hits.bits.iterator();

Expand All @@ -141,28 +144,35 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i
int numSegOrds = (int) segValues.getValueCount();

if ( hits.totalHits < numSegOrds / 10 ) {
IntProcedure incrementCountForOrdinal = ord -> counts[ord]++;
// Remap every ord to global ord as we iterate:
for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) {
if ( !segValues.advanceExact( doc ) ) {
continue; // No value for this doc
}
while ( segValues.hasNextValue() ) {
int term = (int) segValues.nextOrd();
counts[(int) ordMap.get( term )]++;
int globalOrd = (int) ordMap.get( term );
uniqueOrdinalsForDocument.add( globalOrd );
}
uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal );
uniqueOrdinalsForDocument.clear();
}
}
else {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
IntProcedure incrementCountForOrdinal = ord -> segCounts[ord]++;
for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) {
if ( !segValues.advanceExact( doc ) ) {
continue; // No value for this doc
}
while ( segValues.hasNextValue() ) {
int term = (int) segValues.nextOrd();
segCounts[term]++;
uniqueOrdinalsForDocument.add( term );
}
uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal );
uniqueOrdinalsForDocument.clear();
}

// Then, migrate to global ords:
Expand All @@ -176,15 +186,18 @@ private void countOneSegment(OrdinalMap ordinalMap, TextMultiValues segValues, i
}
else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
// just aggregate directly into counts.
IntProcedure incrementCountForOrdinal = ord -> counts[ord]++;
for ( int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc() ) {
if ( !segValues.advanceExact( doc ) ) {
continue; // No value for this doc
}
while ( segValues.hasNextValue() ) {
int term = (int) segValues.nextOrd();
counts[term]++;
uniqueOrdinalsForDocument.add( term );
}
uniqueOrdinalsForDocument.forEach( incrementCountForOrdinal );
uniqueOrdinalsForDocument.clear();
}
}
}
Expand Down

0 comments on commit feccb0b

Please sign in to comment.