Permalink
Browse files

The adaptive facet filter now has a configurable inverted index penalty

  • Loading branch information...
1 parent 58a9ea9 commit 4a486762fa67ed5e93036f40dc070886418b150f jhartman committed May 10, 2013
@@ -1,11 +1,13 @@
package com.browseengine.bobo.facets.filter;
import java.io.IOException;
+import java.lang.Math;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
+import com.browseengine.bobo.util.*;
import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
@@ -28,11 +30,14 @@
private static final long serialVersionUID = 1L;
private static Logger logger = Logger.getLogger(AdaptiveFacetFilter.class);
+
+ public static final int DEFAULT_INVERTED_INDEX_PENALTY = 32;
private final RandomAccessFilter _facetFilter;
private final FacetDataCacheBuilder _facetDataCacheBuilder;
private final Set<String> _valSet;
private boolean _takeComplement = false;
+ private final int _invertedIndexPenalty;
public interface FacetDataCacheBuilder{
FacetDataCache build(BoboIndexReader reader);
@@ -42,11 +47,16 @@
// If takeComplement is true, we still return the filter for the NotValues. Therefore, the caller of this class needs to apply a NotFilter on top
// of this filter when takeComplement is true.
- public AdaptiveFacetFilter(FacetDataCacheBuilder facetDataCacheBuilder,RandomAccessFilter facetFilter,String[] val, boolean takeComplement){
+ public AdaptiveFacetFilter(FacetDataCacheBuilder facetDataCacheBuilder,
+ RandomAccessFilter facetFilter,
+ String[] val,
+ boolean takeComplement,
+ int invertedIndexPenalty) {
_facetFilter = facetFilter;
_facetDataCacheBuilder = facetDataCacheBuilder;
_valSet = new HashSet<String>(Arrays.asList(val));
_takeComplement = takeComplement;
+ _invertedIndexPenalty = invertedIndexPenalty;
}
public double getFacetSelectivity(BoboIndexReader reader)
@@ -73,7 +83,7 @@ public RandomAccessDocIdSet getRandomAccessDocIdSet(BoboIndexReader reader)
ArrayList<String> validVals = new ArrayList<String>(_valSet.size());
- for (String val : _valSet){
+ for (String val : _valSet) {
int idx = valArray.indexOf(val);
if (idx>=0){
validVals.add(valArray.get(idx)); // get and format the value
@@ -87,23 +97,37 @@ public RandomAccessDocIdSet getRandomAccessDocIdSet(BoboIndexReader reader)
// takeComplement is only used to choose between TermListRandomAccessDocIdSet and innerDocSet
int validFreqCount = _takeComplement ? (totalCount - freqCount) : freqCount;
-
- if (_facetDataCacheBuilder.getIndexFieldName() != null && ((validFreqCount<<1) < totalCount)) {
+
+ int invertedIndexCost = estimateInvertedIndexCost(validFreqCount, _valSet.size(), totalCount);
+ int forwardIndexCost = estimateForwardIndexCost(validFreqCount, _valSet.size(), totalCount);
+
+ if (_facetDataCacheBuilder.getIndexFieldName() != null && invertedIndexCost < forwardIndexCost) {
return new TermListRandomAccessDocIdSet(_facetDataCacheBuilder.getIndexFieldName(), innerDocSet, validVals, reader);
- }
- else{
+ } else {
return innerDocSet;
}
}
+ /**
+  * Estimates the cost of answering this filter from the inverted index, which merges
+  * several streams from lucene.
+  *
+  * The estimate is {@code _invertedIndexPenalty * max(1, log2Ceiling(numQueries)) * hitCount}.
+  *
+  * @param hitCount   hit count supplied by the caller
+  * @param numQueries number of values being queried
+  * @param totalDocs  total document count (not used by this estimate)
+  * @return the estimated cost, saturated to the int range instead of wrapping around
+  */
+ private final int estimateInvertedIndexCost(int hitCount, int numQueries, int totalDocs) {
+ int log2 = BitMath.log2Ceiling(numQueries);
+ int numComparisons = Math.max(1, log2);
+ // Multiply in 64-bit: in int arithmetic the product wraps for large hit counts, and a
+ // wrapped (negative) cost would always compare as cheaper than the forward index cost.
+ long perHitCost = (long) _invertedIndexPenalty * numComparisons;
+ // Clamp the per-hit factor first so the second multiplication cannot overflow a long.
+ perHitCost = Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, perHitCost));
+ long cost = perHitCost * hitCount;
+ return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, cost));
+ }
+
+ /**
+  * Estimates the cost of answering this filter from the forward index.
+  * That implementation checks in a bitset for each doc, so the estimate is simply the
+  * total number of docs; {@code hitCount} and {@code numQueries} do not contribute.
+  */
+ private final int estimateForwardIndexCost(int hitCount, int numQueries, int totalDocs) {
+ final int perDocChecks = totalDocs;
+ return perDocChecks;
+ }
+
public static class TermListRandomAccessDocIdSet extends RandomAccessDocIdSet{
private final RandomAccessDocIdSet _innerSet;
private final ArrayList<String> _vals;
private final IndexReader _reader;
private final String _name;
- private final static int OR_THRESHOLD = 5;
- TermListRandomAccessDocIdSet(String name,RandomAccessDocIdSet innerSet,ArrayList<String> vals,IndexReader reader){
+
+ TermListRandomAccessDocIdSet(String name,RandomAccessDocIdSet innerSet,ArrayList<String> vals,IndexReader reader){
_name = name;
_innerSet = innerSet;
_vals = vals;
@@ -174,16 +198,11 @@ public DocIdSetIterator iterator() throws IOException {
return new TermDocIdSet(_reader, _name,_vals.get(0)).iterator();
}
else{
- if (_vals.size()<OR_THRESHOLD){
- ArrayList<DocIdSet> docSetList = new ArrayList<DocIdSet>(_vals.size());
- for (String val : _vals){
- docSetList.add(new TermDocIdSet(_reader, _name,val));
- }
- return new OrDocIdSet(docSetList).iterator();
- }
- else{
- return _innerSet.iterator();
- }
+ ArrayList<DocIdSet> docSetList = new ArrayList<DocIdSet>(_vals.size());
+ for (String val : _vals){
+ docSetList.add(new TermDocIdSet(_reader, _name,val));
+ }
+ return new OrDocIdSet(docSetList).iterator();
}
}
}
@@ -50,15 +50,17 @@
protected int _maxItems = BigNestedIntArray.MAX_ITEMS;
protected Term _sizePayloadTerm;
protected Set<String> _depends;
+ protected final int _invertedIndexPenalty;
- public MultiValueFacetHandler(String name,
- String indexFieldName,
- TermListFactory termListFactory,
+ public MultiValueFacetHandler(String name,
+ String indexFieldName,
+ TermListFactory termListFactory,
Term sizePayloadTerm,
- Set<String> depends)
+ Set<String> depends, int invertedIndexPenalty)
{
super(name, depends);
_depends = depends;
+ _invertedIndexPenalty = invertedIndexPenalty;
_indexFieldName = (indexFieldName != null ? indexFieldName : name);
_termListFactory = termListFactory;
_sizePayloadTerm = sizePayloadTerm;
@@ -71,40 +73,76 @@ public int getNumItems(BoboIndexReader reader, int id) {
return data.getNumItems(id);
}
+ public MultiValueFacetHandler(String name, String indexFieldName, TermListFactory termListFactory, Term sizePayloadTerm, int invertedIndexPenalty)
+ {
+ this(name, indexFieldName, termListFactory, sizePayloadTerm, null, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, TermListFactory termListFactory, Term sizePayloadTerm, int invertedIndexPenalty)
+ {
+ this(name, name, termListFactory, sizePayloadTerm, null, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, String indexFieldName, TermListFactory termListFactory, int invertedIndexPenalty)
+ {
+ this(name, indexFieldName, termListFactory, null, null, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, TermListFactory termListFactory, int invertedIndexPenalty)
+ {
+ this(name, name, termListFactory, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, String indexFieldName, int invertedIndexPenalty)
+ {
+ this(name, indexFieldName, null, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, int invertedIndexPenalty)
+ {
+ this(name, name, null, invertedIndexPenalty);
+ }
+
+ public MultiValueFacetHandler(String name, Set<String> depends, int invertedIndexPenalty)
+ {
+ this(name, name, null, null, depends, invertedIndexPenalty);
+ }
+
public MultiValueFacetHandler(String name, String indexFieldName, TermListFactory termListFactory, Term sizePayloadTerm)
{
- this(name, indexFieldName, termListFactory, sizePayloadTerm, null);
+ this(name, indexFieldName, termListFactory, sizePayloadTerm, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
- public MultiValueFacetHandler(String name, TermListFactory termListFactory, Term sizePayloadTerm)
+ public MultiValueFacetHandler(String name, TermListFactory termListFactory, Term sizePayloadTerm)
{
- this(name, name, termListFactory, sizePayloadTerm, null);
+ this(name, name, termListFactory, sizePayloadTerm, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
- public MultiValueFacetHandler(String name, String indexFieldName, TermListFactory termListFactory)
+ public MultiValueFacetHandler(String name, String indexFieldName, TermListFactory termListFactory)
{
- this(name, indexFieldName, termListFactory, null, null);
+ this(name, indexFieldName, termListFactory, null, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
public MultiValueFacetHandler(String name, TermListFactory termListFactory)
{
- this(name, name, termListFactory);
+ this(name, name, termListFactory, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
public MultiValueFacetHandler(String name, String indexFieldName)
{
- this(name, indexFieldName, null);
+ this(name, indexFieldName, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
public MultiValueFacetHandler(String name)
{
- this(name, name, null);
+ this(name, name, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
-
+
public MultiValueFacetHandler(String name, Set<String> depends)
{
- this(name, name, null, null, depends);
+ this(name, name, null, null, depends, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
+
@Override
public DocComparatorSource getDocComparatorSource()
{
@@ -181,7 +219,8 @@ public MultiValueFacetDataCache load(BoboIndexReader reader, WorkArea workArea)
public RandomAccessFilter buildRandomAccessFilter(String value, Properties prop) throws IOException
{
MultiValueFacetFilter f= new MultiValueFacetFilter(new MultiDataCacheBuilder(getName(), _indexFieldName), value);
- AdaptiveFacetFilter af = new AdaptiveFacetFilter(new SimpleDataCacheBuilder(getName(), _indexFieldName), f, new String[]{value}, false);
+ AdaptiveFacetFilter af = new AdaptiveFacetFilter(new SimpleDataCacheBuilder(getName(), _indexFieldName), f,
+ new String[]{value}, false, _invertedIndexPenalty);
return af;
}
@@ -215,7 +254,8 @@ public RandomAccessFilter buildRandomAccessOrFilter(String[] vals,Properties pro
{
MultiValueORFacetFilter f = new MultiValueORFacetFilter(this,vals,false); // catch the "not" case later
if (!isNot) {
- AdaptiveFacetFilter af = new AdaptiveFacetFilter(new SimpleDataCacheBuilder(getName(), _indexFieldName), f, vals, false);
+ AdaptiveFacetFilter af =
+ new AdaptiveFacetFilter(new SimpleDataCacheBuilder(getName(), _indexFieldName), f, vals, false, _invertedIndexPenalty);
return af;
}
else{
@@ -1,57 +1,49 @@
package com.browseengine.bobo.facets.impl;
-import it.unimi.dsi.fastutil.floats.FloatArrayList;
-import it.unimi.dsi.fastutil.floats.FloatList;
-
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Map;
import java.util.Properties;
-import java.util.Set;
-
-import org.apache.log4j.Logger;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Explanation;
import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.api.BoboIndexReader.WorkArea;
-import com.browseengine.bobo.api.BrowseSelection;
-import com.browseengine.bobo.api.FacetSpec;
-import com.browseengine.bobo.facets.FacetCountCollector;
-import com.browseengine.bobo.facets.FacetCountCollectorSource;
-import com.browseengine.bobo.facets.FacetHandler;
import com.browseengine.bobo.facets.data.MultiValueWithWeightFacetDataCache;
import com.browseengine.bobo.facets.data.TermListFactory;
import com.browseengine.bobo.facets.filter.AdaptiveFacetFilter;
-import com.browseengine.bobo.facets.filter.EmptyFilter;
import com.browseengine.bobo.facets.filter.MultiValueFacetFilter;
-import com.browseengine.bobo.facets.filter.MultiValueORFacetFilter;
-import com.browseengine.bobo.facets.filter.RandomAccessAndFilter;
import com.browseengine.bobo.facets.filter.RandomAccessFilter;
-import com.browseengine.bobo.facets.filter.RandomAccessNotFilter;
import com.browseengine.bobo.facets.range.MultiDataCacheBuilder;
-import com.browseengine.bobo.facets.range.SimpleDataCacheBuilder;
-import com.browseengine.bobo.query.scoring.BoboDocScorer;
-import com.browseengine.bobo.query.scoring.FacetScoreable;
-import com.browseengine.bobo.query.scoring.FacetTermScoringFunctionFactory;
-import com.browseengine.bobo.sort.DocComparatorSource;
-import com.browseengine.bobo.util.BigNestedIntArray;
public class MultiValueWithWeightFacetHandler extends MultiValueFacetHandler
{
- public MultiValueWithWeightFacetHandler(String name, String indexFieldName, TermListFactory termListFactory)
+ public MultiValueWithWeightFacetHandler(String name, String indexFieldName, TermListFactory termListFactory, int invertedIndexPenalty)
+ {
+ super(name, indexFieldName, termListFactory, null, null, invertedIndexPenalty);
+ }
+
+ public MultiValueWithWeightFacetHandler(String name, String indexFieldName, int invertedIndexPenalty)
+ {
+ super(name, indexFieldName, null, null, null, invertedIndexPenalty);
+ }
+
+ public MultiValueWithWeightFacetHandler(String name, int invertedIndexPenalty)
+ {
+ super(name, name, null, null, null, invertedIndexPenalty);
+ }
+
+ public MultiValueWithWeightFacetHandler(String name,
+ String indexFieldName,
+ TermListFactory termListFactory)
{
- super(name, indexFieldName, termListFactory, null, null);
+ super(name, indexFieldName, termListFactory, null, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
- public MultiValueWithWeightFacetHandler(String name, String indexFieldName)
+ public MultiValueWithWeightFacetHandler(String name, String indexFieldName)
{
- super(name, indexFieldName, null, null, null);
+ super(name, indexFieldName, null, null, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
- public MultiValueWithWeightFacetHandler(String name)
+ public MultiValueWithWeightFacetHandler(String name)
{
- super(name, name, null, null, null);
+ super(name, name, null, null, null, AdaptiveFacetFilter.DEFAULT_INVERTED_INDEX_PENALTY);
}
@Override
Oops, something went wrong.

0 comments on commit 4a48676

Please sign in to comment.