forked from senseidb/bobo
/
AdaptiveFacetFilter.java
189 lines (161 loc) · 5.36 KB
/
AdaptiveFacetFilter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
package com.browseengine.bobo.facets.filter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.docidset.EmptyDocIdSet;
import com.browseengine.bobo.docidset.RandomAccessDocIdSet;
import com.browseengine.bobo.facets.data.FacetDataCache;
import com.browseengine.bobo.facets.data.TermValueList;
/**
 * A {@link RandomAccessFilter} wrapper that adaptively picks how to iterate the
 * documents matching a set of facet values.
 *
 * <p>Random access ({@code get(docId)}) is always delegated to the wrapped facet
 * filter. For iteration, when the selected values cover less than half of the
 * reader's {@code maxDoc()} and an index field name is available, the postings
 * of the selected terms are read directly from the index instead of iterating
 * the wrapped filter's doc id set.
 */
public class AdaptiveFacetFilter extends RandomAccessFilter {
  private static final long serialVersionUID = 1L;

  private static final Logger logger = Logger.getLogger(AdaptiveFacetFilter.class);

  private final RandomAccessFilter _facetFilter;
  private final FacetDataCacheBuilder _facetDataCacheBuilder;
  private final Set<String> _valSet;
  private final boolean _takeComplement;

  /**
   * Supplies the facet data cache and naming information for a reader.
   */
  public interface FacetDataCacheBuilder {
    /** Builds (or looks up) the facet data cache for the given reader. */
    FacetDataCache build(BoboIndexReader reader);

    /** Returns the facet name. */
    String getName();

    /**
     * Returns the name of the index field used for term lookups; when this is
     * {@code null} the adaptive term-based iteration is never used.
     */
    String getIndexFieldName();
  }

  /**
   * Creates the adaptive filter.
   *
   * <p>If {@code takeComplement} is true, this filter still returns the doc set
   * for the given (i.e. the "not") values themselves. The caller therefore has
   * to apply a NotFilter on top of this filter when {@code takeComplement} is
   * true; the flag is only used here for the selectivity estimate and for
   * choosing the iteration strategy.
   *
   * @param facetDataCacheBuilder source of the facet data cache and index field name
   * @param facetFilter the wrapped facet filter
   * @param val the selected facet values; {@code null} is treated as no values
   * @param takeComplement whether the caller will negate this filter
   */
  public AdaptiveFacetFilter(FacetDataCacheBuilder facetDataCacheBuilder, RandomAccessFilter facetFilter,
      String[] val, boolean takeComplement) {
    _facetFilter = facetFilter;
    _facetDataCacheBuilder = facetDataCacheBuilder;
    // A null array previously surfaced as a bare NullPointerException from
    // Arrays.asList; treat it as an empty selection instead.
    _valSet = (val == null) ? new HashSet<String>() : new HashSet<String>(Arrays.asList(val));
    _takeComplement = takeComplement;
  }

  /**
   * Returns the selectivity reported by the wrapped filter, or
   * {@code 1.0 - selectivity} when this filter stands for a complement.
   */
  public double getFacetSelectivity(BoboIndexReader reader) {
    double selectivity = _facetFilter.getFacetSelectivity(reader);
    if (_takeComplement) {
      return 1.0 - selectivity;
    }
    return selectivity;
  }

  /**
   * Returns the doc id set for the selected values, choosing between the wrapped
   * filter's set and a term-postings based set.
   *
   * @param reader the reader to build the doc id set for
   * @return the empty set if the wrapped filter is empty or none of the selected
   *         values exist in the facet data; otherwise either the wrapped set or a
   *         {@link TermListRandomAccessDocIdSet} around it
   * @throws IOException if the wrapped filter fails to produce its doc id set
   */
  @Override
  public RandomAccessDocIdSet getRandomAccessDocIdSet(BoboIndexReader reader) throws IOException {
    RandomAccessDocIdSet innerDocSet = _facetFilter.getRandomAccessDocIdSet(reader);
    if (innerDocSet == EmptyDocIdSet.getInstance()) {
      return innerDocSet;
    }

    FacetDataCache dataCache = _facetDataCacheBuilder.build(reader);
    int totalCount = reader.maxDoc();
    TermValueList valArray = dataCache.valArray;

    // Accumulate in long: summing per-value frequencies in an int can wrap
    // around on large indexes and corrupt the heuristic below.
    long freqCount = 0L;
    ArrayList<String> validVals = new ArrayList<String>(_valSet.size());
    for (String val : _valSet) {
      int idx = valArray.indexOf(val);
      if (idx >= 0) {
        validVals.add(valArray.get(idx)); // get and format the value
        freqCount += dataCache.freqs[idx];
      }
    }

    if (validVals.isEmpty()) {
      return EmptyDocIdSet.getInstance();
    }

    // takeComplement is only used to choose between TermListRandomAccessDocIdSet and innerDocSet
    long validFreqCount = _takeComplement ? (totalCount - freqCount) : freqCount;
    String indexFieldName = _facetDataCacheBuilder.getIndexFieldName();

    // The original "(validFreqCount << 1) < totalCount" on ints overflowed to a
    // negative number once the count reached 2^30, wrongly selecting the
    // term-list path; the doubling is done in long arithmetic instead.
    if (indexFieldName != null && (validFreqCount * 2L) < totalCount) {
      return new TermListRandomAccessDocIdSet(indexFieldName, innerDocSet, validVals, reader);
    }
    return innerDocSet;
  }

  /**
   * Doc id set that answers random access through a wrapped set but iterates,
   * for small value lists, directly over the term postings of the index.
   */
  public static class TermListRandomAccessDocIdSet extends RandomAccessDocIdSet {
    /** Value counts at or above this threshold fall back to the wrapped set's iterator. */
    private final static int OR_THRESHOLD = 5;

    private final RandomAccessDocIdSet _innerSet;
    private final ArrayList<String> _vals;
    private final IndexReader _reader;
    private final String _name;

    /**
     * @param name index field name used to build the terms
     * @param innerSet wrapped set used for random access and as iteration fallback
     * @param vals the selected values to look up as terms
     * @param reader the index reader the term postings are read from
     */
    TermListRandomAccessDocIdSet(String name, RandomAccessDocIdSet innerSet, ArrayList<String> vals,
        IndexReader reader) {
      _name = name;
      _innerSet = innerSet;
      _vals = vals;
      _reader = reader;
    }

    /**
     * Doc id set over the postings of a single term.
     */
    public static class TermDocIdSet extends DocIdSet {
      final Term term;
      private final IndexReader reader;

      public TermDocIdSet(IndexReader reader, String name, String val) {
        this.reader = reader;
        term = new Term(name, val);
      }

      /**
       * Opens the term's postings and exposes them as an iterator. The
       * underlying {@link TermDocs} is closed once the iterator is exhausted.
       */
      @Override
      public DocIdSetIterator iterator() throws IOException {
        final TermDocs td = reader.termDocs(term);
        if (td == null) {
          return EmptyDocIdSet.getInstance().iterator();
        }
        return new DocIdSetIterator() {
          private int _doc = -1;

          /**
           * Marks the iterator as exhausted, closing the postings exactly once.
           */
          private int exhaust() throws IOException {
            if (_doc != DocIdSetIterator.NO_MORE_DOCS) {
              td.close();
              _doc = DocIdSetIterator.NO_MORE_DOCS;
            }
            return _doc;
          }

          @Override
          public int advance(int target) throws IOException {
            // Once exhausted the postings are closed; never touch them again.
            if (_doc == DocIdSetIterator.NO_MORE_DOCS) {
              return _doc;
            }
            if (td.skipTo(target)) {
              _doc = td.doc();
              return _doc;
            }
            return exhaust();
          }

          @Override
          public int docID() {
            return _doc;
          }

          @Override
          public int nextDoc() throws IOException {
            // Once exhausted the postings are closed; never touch them again.
            if (_doc == DocIdSetIterator.NO_MORE_DOCS) {
              return _doc;
            }
            if (td.next()) {
              _doc = td.doc();
              return _doc;
            }
            return exhaust();
          }
        };
      }
    }

    /** Random access is always answered by the wrapped set. */
    @Override
    public boolean get(int docId) {
      return _innerSet.get(docId);
    }

    /**
     * Picks the iteration strategy by number of values: none yields the empty
     * iterator, one yields that term's postings, fewer than
     * {@code OR_THRESHOLD} yields an OR over the per-term sets, and anything
     * larger falls back to the wrapped set's iterator.
     */
    @Override
    public DocIdSetIterator iterator() throws IOException {
      int valueCount = _vals.size();
      if (valueCount == 0) {
        return EmptyDocIdSet.getInstance().iterator();
      }
      if (valueCount == 1) {
        return new TermDocIdSet(_reader, _name, _vals.get(0)).iterator();
      }
      if (valueCount < OR_THRESHOLD) {
        ArrayList<DocIdSet> docSetList = new ArrayList<DocIdSet>(valueCount);
        for (String val : _vals) {
          docSetList.add(new TermDocIdSet(_reader, _name, val));
        }
        return new OrDocIdSet(docSetList).iterator();
      }
      return _innerSet.iterator();
    }
  }
}