Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Reimplementation of huffman-rle encoding using a fixed unit size of 6…

…4 bits. This version is extremely hacky
  • Loading branch information...
commit 3d753c32518fe5675e43f27959b4a78162d8167c 1 parent 730c386
@jts authored
View
16 src/SuffixTools/BWTReaderBinary.cpp
@@ -84,7 +84,7 @@ void BWTReaderBinary::readHeader(size_t& num_strings, size_t& num_symbols, BWFla
m_stage = IOS_BWSTR;
}
-void BWTReaderBinary::readRuns(RLBWT* pBWT, RLRawData& out, size_t numRuns)
+void BWTReaderBinary::readRuns(RLBWT* pBWT, RLDataVector& out, size_t numRuns)
{
//out.resize(numRuns);
//m_pReader->read(reinterpret_cast<char*>(&out[0]), numRuns*sizeof(RLUnit));
@@ -202,22 +202,21 @@ void BWTReaderBinary::readRuns(RLBWT* pBWT, RLRawData& out, size_t numRuns)
pBWT->m_smallMarkers.back().encoderIdx = encoderIdx;
// Perform the actual encoding
- EncodedArray encodedBytes;
- size_t symbolsEncoded = StreamEncode::encodeStream(symbolBuffer, symbolEncoder, rlEncoder, encodedBytes);
+ EVVector encodedData;
+ size_t symbolsEncoded = StreamEncode2::encodeStream(symbolBuffer, symbolEncoder, rlEncoder, encodedData);
assert(symbolsEncoded == symbolBuffer.size());
// Copy the encoded bytes
- for(size_t i = 0; i < encodedBytes.size(); ++i)
+ for(size_t i = 0; i < encodedData.size(); ++i)
{
- pBWT->m_rlString.push_back(encodedBytes[i]);
+ pBWT->m_rlString.push_back(encodedData[i]);
}
numSymbolsWrote += symbolsEncoded;
- numBytesUsed += encodedBytes.size();
+ numBytesUsed += encodedData.size() * sizeof(RLData);
numSymbolsEncoded += symbolsEncoded;
- /*
std::string testOut;
- StreamEncode::decodeStream(symbolEncoder, rlEncoder, &encodedBytes[0], symbolsEncoded, testOut);
+ StreamEncode2::decodeStream(symbolEncoder, rlEncoder, &encodedData[0], symbolsEncoded, testOut);
// Validate the decoding
bool failedValidate = false;
@@ -248,7 +247,6 @@ void BWTReaderBinary::readRuns(RLBWT* pBWT, RLRawData& out, size_t numRuns)
std::cout << "\nDecoded: " << testOut << "\n";
assert(false);
}
- */
symbolBuffer.clear();
}
pBWT->initializeFMIndex(runningAC);
View
2  src/SuffixTools/BWTReaderBinary.h
@@ -31,7 +31,7 @@ class BWTReaderBinary : public IBWTReader
virtual void readHeader(size_t& num_strings, size_t& num_symbols, BWFlag& flag);
virtual char readBWChar();
- virtual void readRuns(RLBWT* pBWT, RLRawData& out, size_t numRuns);
+ virtual void readRuns(RLBWT* pBWT, RLDataVector& out, size_t numRuns);
private:
std::istream* m_pReader;
View
1  src/SuffixTools/Huffman.h
@@ -14,6 +14,7 @@
#include <map>
#include <vector>
#include <string>
+#include <inttypes.h>
#include "Alphabet.h"
#include "HuffmanTreeCodec.h"
View
7 src/SuffixTools/HuffmanForest.cpp
@@ -18,8 +18,15 @@ HuffmanForest::HuffmanForest()
countMap['G'] = 1;
countMap['T'] = 1;
countMap['$'] = 1;
+ countMap['\0'] = 1;
HuffmanTreeCodec<char> defaultTree(countMap);
+ defaultTree.hackCode('A', 0);
+ defaultTree.hackCode('C', 1);
+ defaultTree.hackCode('G', 2);
+ defaultTree.hackCode('T', 3);
+ defaultTree.hackCode('$', 4);
+ defaultTree.hackCode('\0', 7);
m_trees.push_back(defaultTree);
}
View
13 src/SuffixTools/HuffmanTreeCodec.h
@@ -152,15 +152,24 @@ class HuffmanTreeCodec
return iter->second;
}
- DecodePair decode(size_t code) const
+ const DecodePair& decode(size_t code) const
{
assert(code < m_decoder.size());
return m_decoder[code];
}
+ void hackCode(T sym, int code)
+ {
+ m_encoder[sym].code = code;
+ m_encoder[sym].bits = 3;
+
+ m_decoder[code].symbol = sym;
+ m_decoder[code].bits = 3;
+ }
+
size_t getMinBits() const { return m_minSymbolBits; }
size_t getMaxBits() const { return m_maxSymbolBits; }
-
+ size_t getMaxCode() const { return m_decoder.size(); }
// Returns the greatest symbol in the set of encoding values
// whose value is no greater than val
T getGreatestLowerBound(T val) const
View
4 src/SuffixTools/RLBWT.cpp
@@ -233,7 +233,7 @@ void RLBWT::printInfo() const
size_t large_m_size = m_largeMarkers.capacity() * sizeof(LargeMarker);
size_t total_marker_size = small_m_size + large_m_size;
- size_t bwStr_size = m_rlString.size();
+ size_t bwStr_size = m_rlString.size() * sizeof(RLData);
size_t other_size = sizeof(*this);
size_t total_size = total_marker_size + bwStr_size + other_size;
@@ -337,7 +337,7 @@ void RLBWT::decodeToFile(const std::string& filename)
const HuffmanTreeCodec<char>& symDecoder = HuffmanForest::Instance().getDecoder(decodeIdx);
size_t startingUnitIndex = currLargeMarker.unitIndex;
//std::cout << "Decoding " << numToDecode << " symbols from unit " << startingUnitIndex << "\n";
- size_t numDecoded = StreamEncode::decodeStream(symDecoder, rlEncoder, &m_rlString[startingUnitIndex], numToDecode, outStr);
+ size_t numDecoded = StreamEncode2::decodeStream(symDecoder, rlEncoder, &m_rlString[startingUnitIndex], numToDecode, outStr);
assert(numDecoded >= numToDecode);
for(size_t j = 0; j < outStr.size(); ++j)
View
9 src/SuffixTools/RLBWT.h
@@ -27,7 +27,8 @@
// Defines
#define RLBWT_VALIDATE 1
-typedef std::vector<uint8_t> RLRawData;
+typedef uint64_t RLData;
+typedef std::vector<RLData> RLDataVector;
//
// RLBWT
@@ -92,7 +93,7 @@ class RLBWT
assert(numToCount < m_smallSampleRate);
size_t running_count = marker.counts.get(b);
size_t symbol_index = marker.unitIndex;
- StreamEncode::countDecoded(HuffmanForest::Instance().getDecoder(encoderIdx), m_rlHuffman, &m_rlString[symbol_index], numToCount, b, running_count);
+ StreamEncode2::countDecoded(HuffmanForest::Instance().getDecoder(encoderIdx), m_rlHuffman, &m_rlString[symbol_index], numToCount, b, running_count);
return running_count;
}
@@ -114,7 +115,7 @@ class RLBWT
//std::cout << "IC: "<< running_count << "\n";
assert(numToCount < m_smallSampleRate);
size_t symbol_index = marker.unitIndex;
- StreamEncode::countDecoded(HuffmanForest::Instance().getDecoder(encoderIdx), m_rlHuffman, &m_rlString[symbol_index], numToCount, running_count);
+ StreamEncode2::countDecoded(HuffmanForest::Instance().getDecoder(encoderIdx), m_rlHuffman, &m_rlString[symbol_index], numToCount, running_count);
return running_count;
}
@@ -244,7 +245,7 @@ class RLBWT
HuffmanTreeCodec<int> m_rlHuffman;
// The run-length encoded string
- RLRawData m_rlString;
+ RLDataVector m_rlString;
// The marker vector
LargeMarkerVector m_largeMarkers;
View
313 src/SuffixTools/StreamEncoding.h
@@ -14,11 +14,14 @@
//#define DEBUG_ENCODING 1
#define BITS_PER_BYTE 8
+#define BITS_AVAILABLE 64
typedef std::pair<char, int> RLEPair;
typedef std::vector<RLEPair> RLEPairVector;
typedef std::deque<char> CharDeque;
typedef std::vector<uint8_t> EncodedArray;
+typedef uint64_t EncodedValue;
+typedef std::vector<EncodedValue> EVVector;
namespace StreamEncode
{
@@ -216,11 +219,11 @@ namespace StreamEncode
_readCode(numBitsDecoded, symbolReadLen, pInput, code);
// Parse the code
- HuffmanTreeCodec<char>::DecodePair sdp = symbolEncoder.decode(code);
+ const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(code);
numBitsDecoded += sdp.bits;
_readCode(numBitsDecoded, runReadLen, pInput, code);
- HuffmanTreeCodec<int>::DecodePair rdp = runEncoder.decode(code);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(code);
numBitsDecoded += rdp.bits;
numSymbolsDecoded += rdp.symbol;
@@ -251,11 +254,11 @@ namespace StreamEncode
_readCode(numBitsDecoded, symbolReadLen, pInput, code);
// Parse the code
- HuffmanTreeCodec<char>::DecodePair sdp = symbolEncoder.decode(code);
+ const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(code);
numBitsDecoded += sdp.bits;
_readCode(numBitsDecoded, runReadLen, pInput, code);
- HuffmanTreeCodec<int>::DecodePair rdp = runEncoder.decode(code);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(code);
numBitsDecoded += rdp.bits;
// Cap the run length to add at the number of symbols left to process
@@ -285,11 +288,11 @@ namespace StreamEncode
_readCode(numBitsDecoded, symbolReadLen, pInput, code);
// Parse the code
- HuffmanTreeCodec<char>::DecodePair sdp = symbolEncoder.decode(code);
+ const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(code);
numBitsDecoded += sdp.bits;
_readCode(numBitsDecoded, runReadLen, pInput, code);
- HuffmanTreeCodec<int>::DecodePair rdp = runEncoder.decode(code);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(code);
numBitsDecoded += rdp.bits;
// Cap the run length to add at the number of symbols left to process
@@ -303,4 +306,302 @@ namespace StreamEncode
}
};
+namespace StreamEncode2
+{
+ //
+ inline void printEncoding(const EncodedValue output)
+ {
+ std::cout << "Encoding: " << int2Binary(output, BITS_AVAILABLE) << "\n";
+ }
+
+ // Write the code into the output stream starting at currBit. Returns the number of bits written
+ inline size_t _writeCode(EncodePair& ep, size_t currBit, EncodedValue& output)
+ {
+#ifdef DEBUG_ENCODING
+ printEncoding(output);
+ std::cout << "Writing the code " << int2Binary(ep.code, ep.bits) << " at bit " << currBit << "\n";
+#endif
+ size_t code = ep.code;
+ int codeBits = ep.bits;
+
+ // Calculate the shift values and masks to apply
+ int shiftVal = BITS_AVAILABLE - currBit - codeBits;
+ // Set the mask and shift it into position
+ size_t mask = (1 << codeBits) - 1;
+ size_t inCode = (code & mask);
+
+#ifdef DEBUG_ENCODING
+ std::cout << "Mask: " << int2Binary(mask, codeBits) << "\n";
+ std::cout << "Masked: " << int2Binary(inCode,codeBits) << "\n";
+#endif
+ inCode <<= shiftVal;
+#ifdef DEBUG_ENCODING
+ std::cout << "inCode: " << int2Binary(inCode, BITS_AVAILABLE) << "\n";
+#endif
+ output |= inCode;
+ return codeBits;
+ }
+
+ // Read maxBits from the array starting at currBits and write the value to outCode
+ // Returns the number of bits read
+ inline size_t _readCode(size_t currBit, size_t maxBits, const EncodedValue& input, EncodedValue& outCode)
+ {
+
+#ifdef DEBUG_ENCODING
+ printEncoding(input);
+ std::cout << "Reading " << maxBits << " from array starting at " << currBit << "\n";
+#endif
+ outCode = 0;
+ // If maxBits is larger than the number of bits left in the unit, we
+ // only read bitsRemaining and shift the code up by the difference. This is
+ // to handle the case where a run was encoded at the end of a unit that is less than 8
+ // bits in length. The bits that were not directly read are implicitly assumed to be zero
+ size_t bitsRemaining = BITS_AVAILABLE - currBit;
+
+ if(bitsRemaining > maxBits)
+ {
+ size_t mask = ((1 << maxBits) - 1);
+ int shiftVal = (bitsRemaining - maxBits);
+
+#ifdef DEBUG_ENCODING
+ std::cout << "Mask " << int2Binary(mask, maxBits) << "\n";
+#endif
+
+ mask <<= shiftVal;
+
+#ifdef DEBUG_ENCODING
+ std::cout << "ShiftMask " << int2Binary(mask, maxBits) << "\n";
+#endif
+
+ outCode = (input & mask) >> shiftVal;
+
+#ifdef DEBUG_ENCODING
+ std::cout << "Outcode: " << int2Binary(outCode, maxBits) << "\n";
+#endif
+ return maxBits;
+ }
+ else
+ {
+ size_t mask = ((1 << bitsRemaining) - 1);
+
+#ifdef DEBUG_ENCODING
+ std::cout << "Mask " << int2Binary(mask, bitsRemaining) << "\n";
+#endif
+ int diff = maxBits - bitsRemaining;
+
+ outCode = (input & mask) << diff;
+ return maxBits;
+ }
+ return maxBits;
+ }
+
+ // Encode a stream of characters
+ inline size_t encodeStream(const CharDeque& input, const HuffmanTreeCodec<char>& symbolEncoder, HuffmanTreeCodec<int> & runEncoder, EVVector& outputArray)
+ {
+ // Require the encoder to emit at most 8-bit codes
+ assert(symbolEncoder.getMaxBits() <= BITS_PER_BYTE);
+ assert(runEncoder.getMaxBits() <= BITS_PER_BYTE);
+
+ // Re-construct the stream as a sequence of <symbol, run> pairs
+ // Only allow runs whose length is encoded in the runEncoder
+ CharDeque stream = input;
+ RLEPairVector rlePairVector;
+ while(!stream.empty())
+ {
+ // Count the current run
+ char currChar = stream.front();
+ size_t idx = 0;
+ while(idx != stream.size() && stream[idx] == currChar)
+ ++idx;
+
+ // A run of length idx has ended
+ size_t encodingLength = runEncoder.getGreatestLowerBound(idx);
+ RLEPair pair(currChar, encodingLength);
+ rlePairVector.push_back(pair);
+
+ // Extract idx characters from the stream
+ for(size_t i = 0; i < encodingLength; ++i)
+ stream.pop_front();
+ }
+
+ outputArray.clear();
+ // Perform the encoding
+ size_t currBit = 0;
+ EncodedValue currentValue = 0;
+ EncodePair terminationCode = symbolEncoder.encode('\0');
+
+ RLEPairVector::const_iterator iter = rlePairVector.begin();
+ while(iter != rlePairVector.end())
+ {
+#ifdef DEBUG_ENCODING
+ std::cout << "Encoding pair: " << iter->first << "," << iter->second << "\n";
+#endif
+ EncodePair symEP = symbolEncoder.encode(iter->first);
+ EncodePair rlEP = runEncoder.encode(iter->second);
+
+ if(BITS_AVAILABLE - currBit - terminationCode.bits > symEP.bits + rlEP.bits)
+ {
+ currBit += _writeCode(symEP, currBit, currentValue);
+ currBit += _writeCode(rlEP, currBit, currentValue);
+ ++iter;
+
+ }
+ else
+ {
+ // Add a termination symbol to the value
+ //std::cout << "writing termination code!\n";
+ assert(BITS_AVAILABLE - currBit > terminationCode.bits);
+ currBit += _writeCode(terminationCode, currBit, currentValue);
+ outputArray.push_back(currentValue);
+ currentValue = 0;
+ currBit = 0;
+ }
+ }
+
+ // Terminate the last code if it has any data and push it
+ if(currBit > 0)
+ {
+ assert(BITS_AVAILABLE - currBit > terminationCode.bits);
+ currBit += _writeCode(terminationCode, currBit, currentValue);
+ outputArray.push_back(currentValue);
+ }
+ return input.size();
+ }
+
+ // Decode an encodedarray into a string
+ inline size_t decodeStream(const HuffmanTreeCodec<char>& symbolEncoder, const HuffmanTreeCodec<int>& runEncoder, const EncodedValue* pInput, size_t numSymbols, std::string& decoded)
+ {
+ // Require the encoder to emit at most 8-bit codes
+ assert(symbolEncoder.getMaxBits() <= BITS_PER_BYTE);
+ assert(runEncoder.getMaxBits() <= BITS_PER_BYTE);
+
+ size_t symbolReadLen = symbolEncoder.getMaxBits();
+ size_t runReadLen = runEncoder.getMaxBits();
+
+ size_t currBitsDecoded = 0;
+ size_t numSymbolsDecoded = 0;
+ while(numSymbolsDecoded < numSymbols)
+ {
+ // Read a symbol then a run
+ EncodedValue code = 0;
+ _readCode(currBitsDecoded, symbolReadLen, *pInput, code);
+
+ // Parse the code
+ const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(code);
+ currBitsDecoded += sdp.bits;
+
+ if(sdp.symbol == '\0')
+ {
+ // termination code, go to the next unit
+ pInput += 1;
+ currBitsDecoded = 0;
+ continue;
+ }
+
+ _readCode(currBitsDecoded, runReadLen, *pInput, code);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(code);
+
+ currBitsDecoded += rdp.bits;
+ numSymbolsDecoded += rdp.symbol;
+
+#ifdef DEBUG_ENCODING
+ std::cout << "Decoded pair: " << sdp.symbol << "," << rdp.symbol << "\n";
+#endif
+ decoded.append(rdp.symbol, sdp.symbol);
+#ifdef DEBUG_ENCODING
+ std::cout << "Decoding so far: " << decoded << "\n";
+#endif
+ }
+ return numSymbols;
+ }
+
+ // Decode from the array pInput up to numSymbols. Add the counts of symbols to counts
+ inline size_t countDecoded(const HuffmanTreeCodec<char>& symbolEncoder, const HuffmanTreeCodec<int>& runEncoder, const EncodedValue* pInput, size_t numSymbols, AlphaCount64& counts)
+ {
+ // Require the encoder to emit at most 8-bit codes
+ assert(symbolEncoder.getMaxBits() <= BITS_PER_BYTE);
+ assert(runEncoder.getMaxBits() <= BITS_PER_BYTE);
+
+ size_t symbolReadLen = symbolEncoder.getMaxBits();
+ size_t runReadLen = runEncoder.getMaxBits();
+
+ size_t currBitsDecoded = 0;
+ size_t numSymbolsDecoded = 0;
+ while(numSymbolsDecoded < numSymbols)
+ {
+ // Read a symbol then a run
+ EncodedValue symCode = 0;
+ _readCode(currBitsDecoded, symbolReadLen, *pInput, symCode);
+
+ // Parse the code
+ //const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(symCode);
+ currBitsDecoded += 3;//sdp.bits;
+
+ if(symCode == 7)
+ {
+ // termination code, go to the next unit
+ pInput += 1;
+ currBitsDecoded = 0;
+ continue;
+ }
+
+ EncodedValue rlCode = 0;
+ _readCode(currBitsDecoded, runReadLen, *pInput, rlCode);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(rlCode);
+ int rl = rdp.symbol;
+ currBitsDecoded += rdp.bits;
+
+ // Cap the run length to add at the number of symbols left to process
+ size_t addLength = std::min(rl, (int)numSymbols - (int)numSymbolsDecoded);
+ numSymbolsDecoded += addLength;
+ counts.add(ALPHABET[symCode], addLength);
+ }
+ return numSymbolsDecoded;
+ }
+
+ // Decode from the array pInput up to numSymbols. Add the counts of symbols to counts
+ inline size_t countDecoded(const HuffmanTreeCodec<char>& symbolEncoder, const HuffmanTreeCodec<int>& runEncoder, const EncodedValue* pInput, size_t numSymbols, char b, size_t& count)
+ {
+ // Require the encoder to emit at most 8-bit codes
+ assert(symbolEncoder.getMaxBits() <= BITS_PER_BYTE);
+ assert(runEncoder.getMaxBits() <= BITS_PER_BYTE);
+
+ size_t symbolReadLen = symbolEncoder.getMaxBits();
+ size_t runReadLen = runEncoder.getMaxBits();
+
+ size_t currBitsDecoded = 0;
+ size_t numSymbolsDecoded = 0;
+ while(numSymbolsDecoded < numSymbols)
+ {
+ // Read a symbol then a run
+ EncodedValue code = 0;
+ _readCode(currBitsDecoded, symbolReadLen, *pInput, code);
+
+ // Parse the code
+ const HuffmanTreeCodec<char>::DecodePair& sdp = symbolEncoder.decode(code);
+ currBitsDecoded += sdp.bits;
+
+ if(sdp.symbol == '\0')
+ {
+ // termination code, go to the next unit
+ pInput += 1;
+ currBitsDecoded = 0;
+ continue;
+ }
+
+ _readCode(currBitsDecoded, runReadLen, *pInput, code);
+ const HuffmanTreeCodec<int>::DecodePair& rdp = runEncoder.decode(code);
+ currBitsDecoded += rdp.bits;
+
+ // Cap the run length to add at the number of symbols left to process
+ size_t addLength = std::min(rdp.symbol, (int)numSymbols - (int)numSymbolsDecoded);
+ numSymbolsDecoded += addLength;
+
+ if(sdp.symbol == b)
+ count += addLength;
+ }
+ return numSymbolsDecoded;
+ }
+};
+
#endif
View
7 src/Util/Util.cpp
@@ -144,14 +144,15 @@ std::string getFileExtension(const std::string& filename)
}
// slow function to convert an integer to a binary string
-std::string int2Binary(int v, int numBits)
+std::string int2Binary(size_t v, int numBits)
{
std::string tmp;
- int bits = sizeof(int) * 8;
+ int bits = sizeof(size_t) * 8;
for(int i = bits - 1; i >= 0; --i)
{
// test if the i-th bit is set
- int mask = 1 << i;
+ size_t mask = 1;
+ mask <<= i;
char b = (v & mask) ? '1' : '0';
tmp.append(1, b);
}
View
2  src/Util/Util.h
@@ -98,7 +98,7 @@ std::string stripFilename(const std::string& filename);
std::string stripExtension(const std::string& filename);
std::string stripDirectories(const std::string& filename);
std::string getFileExtension(const std::string& filename);
-std::string int2Binary(int v, int numBits = 0);
+std::string int2Binary(size_t v, int numBits = 0);
bool isGzip(const std::string& filename);
Please sign in to comment.
Something went wrong with that request. Please try again.