Skip to content

Commit

Permalink
huff0: Add size estimation function. (#405)
Browse files Browse the repository at this point in the history
  • Loading branch information
klauspost committed Jul 9, 2021
1 parent e9c9800 commit 94867dd
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 3 deletions.
64 changes: 64 additions & 0 deletions huff0/compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,70 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
return s.Out, false, nil
}

// EstimateSizes estimates, in bytes, how large in would be when encoded,
// without producing any encoded output.
//
// On success it returns:
//   - tableSz: the estimated size of a table newly built for in.
//   - dataSz: the estimated size of in encoded with that new table.
//   - reuseSz: the estimated size of in encoded with s.prevTable, or -1 when
//     the previous table cannot be used for this input.
//
// If s.maxCount is non-zero the histogram already held by s is used as-is;
// otherwise the histogram is counted from in. In both cases s is flagged so
// the histogram is cleared on the next run and s.maxCount is reset to 0.
//
// On error all sizes are 0. ErrUseRLE is returned when in consists of one
// repeated symbol, and ErrIncompressible when in is a single byte, when no
// symbol occurs more than once, or when the most frequent symbol occurs fewer
// than len(in)>>7 times.
func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
	s, err = s.prepare(in)
	if err != nil {
		return 0, 0, 0, err
	}

	// Create histogram, if none was provided.
	maxCount := s.maxCount
	canReuse := false
	if maxCount == 0 {
		maxCount, canReuse = s.countSimple(in)
	} else {
		canReuse = s.canUseTable(s.prevTable)
	}

	// Reset for next run.
	s.clearCount = true
	s.maxCount = 0
	if maxCount >= len(in) {
		if maxCount > len(in) {
			return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
		}
		if len(in) == 1 {
			return 0, 0, 0, ErrIncompressible
		}
		// One symbol, use RLE
		return 0, 0, 0, ErrUseRLE
	}
	if maxCount == 1 || maxCount < (len(in)>>7) {
		// Each symbol present maximum once or too well distributed.
		return 0, 0, 0, ErrIncompressible
	}

	// Calculate new table.
	err = s.buildCTable()
	if err != nil {
		return 0, 0, 0, err
	}

	tableSz, err = s.cTable.estTableSize(s)
	if err != nil {
		return 0, 0, 0, err
	}

	// -1 tells the caller that the previous table is not usable for this input.
	reuseSz = -1
	if canReuse {
		reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
	}
	dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])

	return tableSz, dataSz, reuseSz, nil
}

// compress1X encodes src by delegating to compress1xDo, passing the scratch
// output buffer s.Out as the destination argument. The result and error of
// compress1xDo are returned unchanged.
func (s *Scratch) compress1X(src []byte) ([]byte, error) {
	dst := s.Out
	return s.compress1xDo(dst, src)
}
Expand Down
11 changes: 8 additions & 3 deletions huff0/compress_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ func TestCompress1X(t *testing.T) {
if len(buf0) > BlockSizeMax {
buf0 = buf0[:BlockSizeMax]
}
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
b, re, err := Compress1X(buf0, &s)
if err != test.err1X {
t.Errorf("want error %v (%T), got %v (%T)", test.err1X, test.err1X, err, err)
Expand All @@ -256,6 +257,7 @@ func TestCompress1X(t *testing.T) {
if len(s.OutData) == 0 {
t.Error("got no data output")
}
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
t.Logf("%s: %d -> %d bytes (%.2f:1) re:%t (table: %d bytes)", test.name, len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
s.Out = nil
bRe, _, err := Compress1X(b, &s)
Expand Down Expand Up @@ -406,7 +408,7 @@ func TestCompress4XReuse(t *testing.T) {
for j := range buf0 {
buf0[j] = byte(int64(i) + (rng.Int63() & 3))
}

tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
b, re, err := Compress4X(buf0, &s)
if err != nil {
t.Fatal(err)
Expand All @@ -421,7 +423,7 @@ func TestCompress4XReuse(t *testing.T) {
if re {
t.Error("claimed to have re-used. Unlikely.")
}

t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
})
}
Expand All @@ -441,6 +443,7 @@ func TestCompress4XReuseActually(t *testing.T) {
buf0[j] = byte(rng.Int63() & 7)
}

tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
b, re, err := Compress4X(buf0, &s)
if err != nil {
t.Fatal(err)
Expand All @@ -458,7 +461,7 @@ func TestCompress4XReuseActually(t *testing.T) {
if !re && i > 0 {
t.Error("Expected table to be reused")
}

t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
})
}
Expand Down Expand Up @@ -488,6 +491,7 @@ func TestCompress1XReuse(t *testing.T) {
}
firstData := len(s.OutData)
s.Reuse = ReusePolicyAllow
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
b, re, err := Compress1X(buf0, &s)
if err != nil {
t.Errorf("got secondary error %v (%T)", err, err)
Expand All @@ -505,6 +509,7 @@ func TestCompress1XReuse(t *testing.T) {
if len(b) != firstData {
t.Errorf("data length did not match first: %d, second:%d", firstData, len(b))
}
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
t.Logf("%s: %d -> %d bytes (%.2f:1) %t", test.name, len(buf0), len(b), float64(len(buf0))/float64(len(b)), re)
})
}
Expand Down
62 changes: 62 additions & 0 deletions huff0/huff0.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,68 @@ func (c cTable) write(s *Scratch) error {
return nil
}

// estTableSize estimates the size in bytes of the weight description for
// table c.
//
// The bit length of each symbol except the last is converted to a weight.
// When there are at least two such weights they are FSE-compressed; if that
// succeeds and the result is shorter than s.symbolLen>>1 bytes, the estimate is
// one header byte plus the compressed length. Otherwise the estimate is one
// header byte plus the weights packed two per byte.
//
// ErrIncompressible is returned when more than 128 weights would have to be
// stored raw.
//
// Side effects: s.huffWeight, the histogram owned by s.fse and s.fse.TableLog
// are overwritten, and fse.Compress is run with s.fse as its scratch.
func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
	var (
		// precomputed conversion table
		bitsToWeight [tableLogMax + 1]byte
		huffLog      = s.actualTableLog
		// last weight is not saved.
		maxSymbolValue = uint8(s.symbolLen - 1)
		huffWeight     = s.huffWeight[:256]
	)
	const (
		maxFSETableLog = 6
	)
	// convert to weight
	bitsToWeight[0] = 0
	for n := uint8(1); n < huffLog+1; n++ {
		bitsToWeight[n] = huffLog + 1 - n
	}

	// Acquire histogram for FSE.
	hist := s.fse.Histogram()
	hist = hist[:256]
	for i := range hist[:16] {
		hist[i] = 0
	}
	for n := uint8(0); n < maxSymbolValue; n++ {
		v := bitsToWeight[c[n].nBits] & 15
		huffWeight[n] = v
		hist[v]++
	}

	// FSE compress if feasible.
	if maxSymbolValue >= 2 {
		huffMaxCnt := uint32(0)
		huffMax := uint8(0)
		for i, v := range hist[:16] {
			if v == 0 {
				continue
			}
			// Highest weight value in use; i only increases, so the last hit wins.
			huffMax = byte(i)
			if v > huffMaxCnt {
				huffMaxCnt = v
			}
		}
		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
		s.fse.TableLog = maxFSETableLog
		compressed, fseErr := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
		if fseErr == nil && len(compressed) < int(s.symbolLen>>1) {
			// One header byte followed by the FSE-compressed weights.
			return 1 + len(compressed), nil
		}
		// Unable to compress (RLE/uncompressible)
	}
	// write raw values as 4-bits (max : 15)
	if maxSymbolValue > (256 - 128) {
		// should not happen : likely means source cannot be compressed
		return 0, ErrIncompressible
	}
	// special case, pack weights 4 bits/weight: one header byte, then two
	// weights per byte. Round up so an odd number of weights still accounts
	// for the final, half-filled byte.
	return 1 + (int(maxSymbolValue)+1)/2, nil
}

// estimateSize returns the estimated size in bytes of the input represented in the
// histogram supplied.
func (c cTable) estimateSize(hist []uint32) int {
Expand Down

0 comments on commit 94867dd

Please sign in to comment.