Permalink
Browse files

Merge branch 'master' of github.com:javasoze/kamikaze

  • Loading branch information...
2 parents 6b0636b + c78a9cc commit 73b48da5bba6d9c2748948bad7b35debb6fbffbf @javasoze committed Oct 2, 2012
View
@@ -5,3 +5,4 @@ ivy
lib
build-test
SerialDocSet
+output
View
@@ -9,7 +9,7 @@
<groupId>com.linkedin.kamikaze</groupId>
<artifactId>kamikaze</artifactId>
<packaging>jar</packaging>
- <version>3.0.6</version>
+ <version>3.0.7-SNAPSHOT</version>
<name>kamikaze</name>
<description>information retrival utility package for enhancing Lucene</description>
<url>http://sna-projects.com/kamikaze</url>
@@ -70,6 +70,7 @@
-Xms256m -Xmx4g <!-- -Xdebug -Xrunjdwp:transport=dt_socket,address=8002,server=y,suspend=y -->
</argLine>
<excludes>
+ <exclude>com/kamikaze/lucenecode/test/*.java</exclude>
<exclude>com/kamikaze/test/perf/*.java</exclude>
<exclude>**/*$*</exclude>
</excludes>
@@ -1,76 +0,0 @@
-package com.kamikaze.docidset.compression;
-
-import java.io.Serializable;
-import com.kamikaze.docidset.utils.CompResult;
-import com.kamikaze.pfordelta.PForDelta;
-
-/**
- * Wrapper of PForDelta class. This class is used to compress/decompress data blocks of integers
- *
- * @author hao yan
- */
-public class PForDeltaWithBase implements PForDeltaCompressedSortedIntegerSegment, Serializable {
-
- private static final long serialVersionUID = 1L;
-
- private static final int INVALID = -1;
-
- // Max number of bits to store an uncompressed value
- private int _compressedBitSize = 0; // The compressed size in bits of the block
-
- /**
- * Get the compressed size in bits of the block
- * @return the compressed size in bits of the block
- */
- public int getCompressedBitSize()
- {
- return _compressedBitSize;
- }
-
- /**
- * Estimate the compressed size of a block
- *
- * @param inputBlock a block of non-negative integers to be compressed
- * @param bits the value of b in the PForDelta algorithm
- * @param blockSize the block size which is 256 by default
- * @return CompResult
- * @throws IllegalArgumentException
- */
- public int estimateCompSize(int[] inputBlock, int bits, int blockSize) throws IllegalArgumentException {
- return PForDelta.estimateCompressedSize(inputBlock, bits, blockSize);
- }
-
- @Override
- public CompResult compressOneBlock(int[] inputBlock, int bits, int blockSize, boolean flag) throws IllegalArgumentException {
- return compressOneBlock(inputBlock, blockSize);
- }
-
- /**
- * Compress an integer array
- *
- * @param inputBlock the integer input array
- * @param blockSize the block size which is 256 by default
- * @return CompResult which contains the compressed size in number of bits and the reference to the compressed data
- * @throws IllegalArgumentException
- */
- public CompResult compressOneBlock(int[] inputBlock, int blockSize) throws IllegalArgumentException {
-
- int[] compBlock = PForDelta.compressOneBlockOpt(inputBlock, blockSize);
- CompResult res = new CompResult();
- res.setCompressedSize(compBlock.length<<5);
- res.setCompressedBlock(compBlock);
- return res;
- }
-
- /**
- * Decompress a compressed block into an integer array
- *
- * @param compBlock the compressed input block
- * @param blockSize the block size which is 256 by default
- * @return the decompressed output block
- */
- public int decompressOneBlock(int[] decompBlock, int[] compBlock, int blockSize)
- {
- return PForDelta.decompressOneBlock(decompBlock, compBlock, blockSize);
- }
-}
@@ -56,7 +56,7 @@ public int advance(int target) throws java.io.IOException{
if (cursor >= _array.length || _array.length == -1) return DocIdSetIterator.NO_MORE_DOCS;
if (target <= _doc) target = _doc + 1;
int index = Arrays.binarySearch(_array, target);
- if (index > 0){
+ if (index >= 0){
cursor = index;
_doc = _array[cursor];
return _doc;
@@ -5,16 +5,18 @@
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;
+import java.util.zip.CRC32;
+import java.util.zip.Checksum;
import org.apache.lucene.search.DocIdSetIterator;
import com.kamikaze.docidset.api.DocSet;
import com.kamikaze.docidset.api.StatefulDSIterator;
-import com.kamikaze.docidset.compression.PForDeltaWithBase;
import com.kamikaze.docidset.utils.CompResult;
import com.kamikaze.docidset.utils.Conversion;
import com.kamikaze.docidset.utils.IntArray;
import com.kamikaze.docidset.utils.PForDeltaIntSegmentArray;
+import com.kamikaze.pfordelta.PForDelta;
/**
* This class implements the DocId set which is built on top of the optimized PForDelta algorithm (PForDeltaWithBase)
@@ -40,12 +42,11 @@
private int totalDocIdNum=0; // the total number of elemnts that have been inserted/accessed so far
private long compressedBitSize=0; // compressed size in bits
- transient private PForDeltaWithBase compBlockWithBase = new PForDeltaWithBase(); // the PForDelta algorithm to compress a block
transient private IntArray baseListForOnlyCompBlocks; // the base lists for skipping
transient private int[] currentNoCompBlock; // the memory used to store the uncompressed elements. Once the block is full, all its elements are compressed into sequencOfCompBlock and the block is cleared.
transient private int sizeOfCurrentNoCompBlock = 0; // the number of uncompressed elements that is hold in the currentNoCompBlock
- private int version = 1;
+ transient private int version = 1;
public PForDeltaDocIdSet() {
sequenceOfCompBlocks = new PForDeltaIntSegmentArray();
@@ -68,8 +69,6 @@ public PForDeltaDocIdSet(int batchSize) {
public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws IOException
{
PForDeltaDocIdSet res = new PForDeltaDocIdSet();
-// int totalNumInt = Conversion.byteArrayToInt(bytesData, offset);
-// offset += Conversion.BYTES_PER_INT;
// 1. version
res.version = Conversion.byteArrayToInt(bytesData, offset);
@@ -113,12 +112,16 @@ public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws
res.sequenceOfCompBlocks = PForDeltaIntSegmentArray.newInstanceFromBytes(bytesData, offset);
offset += (PForDeltaIntSegmentArray.getSerialIntNum(res.sequenceOfCompBlocks) * Conversion.BYTES_PER_INT);
- // 9. hashCode
- int expectedHashCode = 1;
- int hashCode = Conversion.byteArrayToInt(bytesData, offset);
- if(expectedHashCode != hashCode)
+ // 9. checksum
+ Checksum digest = new CRC32();
+ digest.update(bytesData, 0, offset);
+ long checksum = digest.getValue();
+
+ long receivedChecksum = Conversion.byteArrayToLong(bytesData, offset);
+
+ if(receivedChecksum != checksum)
{
- throw new IOException("serialization problem");
+ throw new IOException("serialization error: check sum does not match: ");
}
return res;
@@ -128,7 +131,7 @@ public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws
{
int versionNumInt = 1;
int blockSizeNumInt = 1;
- int hashCodeInt = 1;
+ int checksumInt = 2; // checksum is long = 2 ints
int lastAddedNumInt = 1;
int totalDocIdNumInt = 1;
int compressedBitsNumInt = 2; // long = 2 ints
@@ -140,7 +143,7 @@ public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws
// plus the hashCode for all data
int totalNumInt = versionNumInt + blockSizeNumInt + lastAddedNumInt + totalDocIdNumInt + compressedBitsNumInt +
- baseListForOnlyComnpBlocksNumInt + currentNoCompBlockBlockNumInt + seqCompBlockIntNum + hashCodeInt;
+ baseListForOnlyComnpBlocksNumInt + currentNoCompBlockBlockNumInt + seqCompBlockIntNum + checksumInt;
byte[] bytesData = new byte[(totalNumInt+1)*Conversion.BYTES_PER_INT]; // +1 because of totalNumInt itself
@@ -150,6 +153,7 @@ public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws
Conversion.intToByteArray(totalNumInt, bytesData, offset);
offset += Conversion.BYTES_PER_INT;
+ int startOffset = offset;
// 1. version
Conversion.intToByteArray(pForDeltaDocIdSet.version, bytesData, offset);
offset += Conversion.BYTES_PER_INT;
@@ -187,9 +191,12 @@ public static PForDeltaDocIdSet deserialize(byte[] bytesData, int offset) throws
PForDeltaIntSegmentArray.convertToBytes(pForDeltaDocIdSet.sequenceOfCompBlocks, bytesData, offset);
offset += (seqCompBlockIntNum*Conversion.BYTES_PER_INT);
- // 9. hashCode
- int hashCode = 1;;
- Conversion.intToByteArray(hashCode, bytesData, offset);
+ // 9. checksum
+ Checksum digest = new CRC32();
+ digest.update(bytesData, startOffset, offset-startOffset);
+ long checksum = digest.getValue();
+
+ Conversion.longToByteArray(checksum, bytesData, offset);
return bytesData;
}
@@ -223,19 +230,15 @@ private void readObject(ObjectInputStream inStrm) throws IOException, ClassNotFo
{
inStrm.defaultReadObject();
- compBlockWithBase = new PForDeltaWithBase();
-
int[] baseArray = (int[])inStrm.readObject();
baseListForOnlyCompBlocks = new IntArray();
for(int i=0; i<baseArray.length; ++i)
{
baseListForOnlyCompBlocks.add(baseArray[i]);
}
- int[] noCompBlock = (int[])inStrm.readObject();
- sizeOfCurrentNoCompBlock = noCompBlock.length;
- currentNoCompBlock = new int[sizeOfCurrentNoCompBlock];
- System.arraycopy(noCompBlock, 0, currentNoCompBlock, 0, sizeOfCurrentNoCompBlock);
+ currentNoCompBlock = (int[])inStrm.readObject();
+ sizeOfCurrentNoCompBlock = currentNoCompBlock.length;
}
@@ -293,7 +296,9 @@ public boolean find(int target)
return false;
// compBlockWithBase.decompressOneBlock(curDecompBlock, sequenceOfCompBlocks.get(iterDecompBlock), _blockSize);
- compBlockWithBase.decompressOneBlock(myDecompBlock, sequenceOfCompBlocks.get(iterDecompBlock), _blockSize);
+ //compBlockWithBase.decompressOneBlock(myDecompBlock, sequenceOfCompBlocks.get(iterDecompBlock), _blockSize);
+ PForDelta.decompressOneBlock(myDecompBlock, sequenceOfCompBlocks.get(iterDecompBlock), _blockSize);
+
int idx ;
lastId = myDecompBlock[0];
if (lastId == target) return true;
@@ -551,8 +556,11 @@ public void flush(int docId)
*/
private CompResult PForDeltaCompressOneBlock(int[] srcData)
{
- CompResult compRes = compBlockWithBase.compressOneBlock(srcData, _blockSize);
- return compRes;
+ int[] compBlock = PForDelta.compressOneBlockOpt(srcData, _blockSize);
+ CompResult res = new CompResult();
+ res.setCompressedSize(compBlock.length<<5);
+ res.setCompressedBlock(compBlock);
+ return res;
}
/**
@@ -561,7 +569,7 @@ private CompResult PForDeltaCompressOneBlock(int[] srcData)
*/
private int PForDeltaEstimateCompSize(int[] srcData, int b)
{
- return compBlockWithBase.estimateCompSize(srcData, b, _blockSize);
+ return PForDelta.estimateCompressedSize(srcData, b, _blockSize);
}
private void initSet() {
@@ -756,7 +764,6 @@ private void printBlock(int[] block, int size)
int compBlockNum=0; // the number of compressed blocks
transient int[] iterDecompBlock = new int[_blockSize]; // temporary storage for the decompressed data
- PForDeltaWithBase iterPForDeltaSetWithBase = new PForDeltaWithBase(); // PForDelta algorithm
PForDeltaDocIdIterator() {
super();
@@ -797,7 +804,7 @@ public int nextDoc()
// must be in one of the compressed blocks
else if(offset == 0) // case 2: the comp block has been decompressed;
{
- iterPForDeltaSetWithBase.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
+ PForDelta.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
lastAccessedDocId = iterDecompBlock[offset];
}
else // case 3: in the recently decompressed block
@@ -936,7 +943,7 @@ private int advanceToTargetInTheFollowingCompBlocks(int target, int startBlockIn
System.err.println("ERROR: advanceToTargetInTheFollowingCompBlocks(): Impossible, we must be able to find the block");
}
- iterPForDeltaSetWithBase.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
+ PForDelta.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
postProcessBlock(iterDecompBlock, _blockSize);
int offset = binarySearchForFirstElementEqualOrLargerThanTarget(iterDecompBlock, 0, _blockSize-1, target);
@@ -964,7 +971,7 @@ private int advanceToTargetInTheFollowingCompBlocksNoPostProcessing(int target,
System.err.println("ERROR: advanceToTargetInTheFollowingCompBlocks(): Impossible, we must be able to find the block");
}
- iterPForDeltaSetWithBase.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
+ PForDelta.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(iterBlockIndex), _blockSize);
lastAccessedDocId = iterDecompBlock[0];
if (lastAccessedDocId >= target)
{
@@ -1002,7 +1009,7 @@ private void printSet()
{
for (int i = 0; i < _blockSize; i++)
{
- iterPForDeltaSetWithBase.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(i), _blockSize);
+ PForDelta.decompressOneBlock(iterDecompBlock, sequenceOfCompBlocks.get(i), _blockSize);
postProcessBlock(iterDecompBlock, _blockSize);
System.out.print(iterDecompBlock + ",");
}
@@ -29,15 +29,15 @@ public static final void longToByteArray(long value, byte[] bytes, int offset) {
bytes[offset+7] = (byte)value;
}
-public static final long byteArrayToLong(byte [] b, int offset) {
- return (b[offset] << 56)
- + ((b[offset+1] & 0xFF) << 48)
- + ((b[offset+2] & 0xFF) << 40)
- + ((b[offset+3] & 0xFF) << 32)
- + ((b[offset+4] & 0xFF) << 24)
- + ((b[offset+5] & 0xFF) << 16)
- + ((b[offset+6] & 0xFF) << 8)
- + (b[offset+7] & 0xFF);
-}
+ public static final long byteArrayToLong(byte [] b, int offset) {
+ return ((long)b[offset] << 56)
+ + (((long)b[offset+1] & 0xFF) << 48)
+ + (((long)b[offset+2] & 0xFF) << 40)
+ + (((long)b[offset+3] & 0xFF) << 32)
+ + (((long)b[offset+4] & 0xFF) << 24)
+ + ((b[offset+5] & 0xFF) << 16)
+ + ((b[offset+6] & 0xFF) << 8)
+ + (b[offset+7] & 0xFF);
+ }
}
Oops, something went wrong.

0 comments on commit 73b48da

Please sign in to comment.