Permalink
Browse files

Use different kinds of BigSegmentedArray based on the dictionary size

  • Loading branch information...
Volodymyr Zhabiuk
Volodymyr Zhabiuk committed Oct 22, 2012
1 parent de81741 commit b7154503770d9a03b256ba10709c4172bb160d33
Showing with 28 additions and 16 deletions.
  1. +28 −16 bobo-browse/src/main/java/com/browseengine/bobo/facets/data/FacetDataCache.java
@@ -35,7 +35,6 @@
public int[] freqs;
public int[] minIDs;
public int[] maxIDs;
- private final TermCountSize _termCountSize;
public FacetDataCache(BigSegmentedArray orderArray, TermValueList<T> valArray, int[] freqs, int[] minIDs,
int[] maxIDs, TermCountSize termCountSize) {
@@ -44,7 +43,6 @@ public FacetDataCache(BigSegmentedArray orderArray, TermValueList<T> valArray, i
this.freqs = freqs;
this.minIDs = minIDs;
this.maxIDs = maxIDs;
- _termCountSize = termCountSize;
}
public FacetDataCache() {
@@ -53,51 +51,67 @@ public FacetDataCache() {
this.maxIDs = null;
this.minIDs = null;
this.freqs = null;
- _termCountSize = TermCountSize.large;
}
public int getNumItems(int docid) {
int valIdx = orderArray.get(docid);
return valIdx <= 0 ? 0 : 1;
}
- private final static BigSegmentedArray newInstance(TermCountSize termCountSize, int maxDoc) {
- if (termCountSize == TermCountSize.small) {
+ private final static BigSegmentedArray newInstance(int termCount, int maxDoc) {
+ // we use < instead of <= to take into consideration "missing" value (zero element in the dictionary)
+ if (termCount < Byte.MAX_VALUE) {
return new BigByteArray(maxDoc);
- } else if (termCountSize == TermCountSize.medium) {
+ } else if (termCount < Short.MAX_VALUE) {
return new BigShortArray(maxDoc);
} else
return new BigIntArray(maxDoc);
}
- protected int getNegativeValueCount(IndexReader reader, String field) throws IOException {
+ protected int getDictValueCount(IndexReader reader, String field) throws IOException {
int ret = 0;
TermEnum termEnum = null;
try {
termEnum = reader.terms(new Term(field, ""));
do {
Term term = termEnum.term();
- if (term == null || term.field() != field)
- break;
- if (!term.text().startsWith("-")) {
+ if (term == null || !term.field().equals(field))
break;
- }
ret++;
} while (termEnum.next());
} finally {
termEnum.close();
}
return ret;
}
-
+ protected int getNegativeValueCount(IndexReader reader, String field) throws IOException {
+ int ret = 0;
+ TermEnum termEnum = null;
+ try {
+ termEnum = reader.terms(new Term(field, ""));
+ do {
+ Term term = termEnum.term();
+ if (term == null || term.field() != field)
+ break;
+ if (!term.text().startsWith("-")) {
+ break;
+ }
+ ret++;
+ } while (termEnum.next());
+ } finally {
+ termEnum.close();
+ }
+ return ret;
+ }
public void load(String fieldName, IndexReader reader, TermListFactory<T> listFactory) throws IOException {
String field = fieldName.intern();
int maxDoc = reader.maxDoc();
BigSegmentedArray order = this.orderArray;
if (order == null) // we want to reuse the memory
{
- order = newInstance(_termCountSize, maxDoc);
+ int dictValueCount = getDictValueCount(reader, fieldName);
+ order = newInstance(dictValueCount, maxDoc);
} else {
order.ensureCapacity(maxDoc); // no need to fill to 0, we are reseting the
// data anyway
@@ -130,9 +144,7 @@ public void load(String fieldName, IndexReader reader, TermListFactory<T> listFa
if (term == null || term.field() != field)
break;
- if (t > order.maxValue()) {
- throw new IOException("maximum number of value cannot exceed: " + order.maxValue());
- }
+
// store term text
// we expect that there is at most one term per document
if (t >= length)

0 comments on commit b715450

Please sign in to comment.