Skip to content

Commit

Permalink
inflate: Limit variable shifts (#274)
Browse files Browse the repository at this point in the history
* inflate: Limit variable shifts

Use bitwise AND operations (masks on the shift count) to speed up variable shifts.

Faster on AMD64:

```
benchmark                               old ns/op     new ns/op     delta
BenchmarkDecodeDigitsSpeed1e4-32        57027         56892         -0.24%
BenchmarkDecodeDigitsSpeed1e5-32        657866        650408        -1.13%
BenchmarkDecodeDigitsSpeed1e6-32        6679774       6425893       -3.80%
BenchmarkDecodeDigitsDefault1e4-32      62810         61858         -1.52%
BenchmarkDecodeDigitsDefault1e5-32      657865        628677        -4.44%
BenchmarkDecodeDigitsDefault1e6-32      6486343       6211232       -4.24%
BenchmarkDecodeDigitsCompress1e4-32     62169         61555         -0.99%
BenchmarkDecodeDigitsCompress1e5-32     677789        668714        -1.34%
BenchmarkDecodeDigitsCompress1e6-32     6851431       6685226       -2.43%
BenchmarkDecodeTwainSpeed1e4-32         60606         59003         -2.64%
BenchmarkDecodeTwainSpeed1e5-32         628151        609357        -2.99%
BenchmarkDecodeTwainSpeed1e6-32         6238098       6015035       -3.58%
BenchmarkDecodeTwainDefault1e4-32       59901         59167         -1.23%
BenchmarkDecodeTwainDefault1e5-32       576772        561311        -2.68%
BenchmarkDecodeTwainDefault1e6-32       5701418       5479259       -3.90%
BenchmarkDecodeTwainCompress1e4-32      58582         56825         -3.00%
BenchmarkDecodeTwainCompress1e5-32      535572        515826        -3.69%
BenchmarkDecodeTwainCompress1e6-32      5265486       5090632       -3.32%
BenchmarkDecodeRandomSpeed1e4-32        323           319           -1.24%
BenchmarkDecodeRandomSpeed1e5-32        1954          1945          -0.46%
BenchmarkDecodeRandomSpeed1e6-32        20016         20026         +0.05%

benchmark                               old MB/s     new MB/s     speedup
BenchmarkDecodeDigitsSpeed1e4-32        175.35       175.77       1.00x
BenchmarkDecodeDigitsSpeed1e5-32        152.01       153.75       1.01x
BenchmarkDecodeDigitsSpeed1e6-32        149.71       155.62       1.04x
BenchmarkDecodeDigitsDefault1e4-32      159.21       161.66       1.02x
BenchmarkDecodeDigitsDefault1e5-32      152.01       159.06       1.05x
BenchmarkDecodeDigitsDefault1e6-32      154.17       161.00       1.04x
BenchmarkDecodeDigitsCompress1e4-32     160.85       162.46       1.01x
BenchmarkDecodeDigitsCompress1e5-32     147.54       149.54       1.01x
BenchmarkDecodeDigitsCompress1e6-32     145.95       149.58       1.02x
BenchmarkDecodeTwainSpeed1e4-32         165.00       169.48       1.03x
BenchmarkDecodeTwainSpeed1e5-32         159.20       164.11       1.03x
BenchmarkDecodeTwainSpeed1e6-32         160.31       166.25       1.04x
BenchmarkDecodeTwainDefault1e4-32       166.94       169.01       1.01x
BenchmarkDecodeTwainDefault1e5-32       173.38       178.15       1.03x
BenchmarkDecodeTwainDefault1e6-32       175.39       182.51       1.04x
BenchmarkDecodeTwainCompress1e4-32      170.70       175.98       1.03x
BenchmarkDecodeTwainCompress1e5-32      186.72       193.86       1.04x
BenchmarkDecodeTwainCompress1e6-32      189.92       196.44       1.03x
BenchmarkDecodeRandomSpeed1e4-32        30915.66     31375.28     1.01x
BenchmarkDecodeRandomSpeed1e5-32        51177.19     51408.19     1.00x
BenchmarkDecodeRandomSpeed1e6-32        49958.99     49936.11     1.00x
```

* Avoid regressing other platforms.
  • Loading branch information
klauspost committed Aug 18, 2020
1 parent e031915 commit f5ee0f4
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 102 deletions.
10 changes: 5 additions & 5 deletions flate/fast_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (e *fastGen) addBlock(src []byte) int32 {
// hash4u returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
//
// The hash is u multiplied by prime4bytes; for 0 < h < 32 the right shift by
// 32-h keeps the top h bits of the 32-bit product. The shift count is ANDed
// with a mask so it is explicitly bounded. Note that with h == 0 the masked
// count (32 & 31) is 0, so the full 32-bit product is returned.
//
// This is a diff view: the first return is the removed line (& 31) and the
// second is its replacement (& reg8SizeMask32).
// NOTE(review): reg8SizeMask32 is declared outside this view — presumably a
// platform-dependent mask for uint8 shift counts on 32-bit values; confirm.
func hash4u(u uint32, h uint8) uint32 {
return (u * prime4bytes) >> ((32 - h) & 31)
return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32)
}

type tableEntryPrev struct {
Expand All @@ -138,25 +138,25 @@ type tableEntryPrev struct {
// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
//
// uint32(u) truncates u to its low 32 bits before the multiply by
// prime4bytes, so the upper 4 bytes of u never influence the result. For
// 0 < h < 32 the right shift by 32-h keeps the top h bits of the product.
//
// This is a diff view: the first return is the removed line (& 31) and the
// second is its replacement (& reg8SizeMask32).
// NOTE(review): reg8SizeMask32 is declared outside this view — presumably a
// platform-dependent mask for uint8 shift counts on 32-bit values; confirm.
func hash4x64(u uint64, h uint8) uint32 {
return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32)
}

// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
//
// u << (64 - 56) shifts the top byte of u out of the word, so only the lowest
// 7 bytes contribute to the product with prime7bytes. The right shift by 64-h
// keeps the top h bits of the 64-bit product, and the result is then
// converted to uint32, so at most the low 32 bits of that value are returned.
//
// This is a diff view: the first return is the removed line (& 63) and the
// second is its replacement (& reg8SizeMask64).
// NOTE(review): reg8SizeMask64 is declared outside this view — presumably a
// platform-dependent mask for uint8 shift counts on 64-bit values; confirm.
func hash7(u uint64, h uint8) uint32 {
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))
}

// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
//
// All 8 bytes of u take part in the multiply by prime8bytes. The right shift
// by 64-h keeps the top h bits of the 64-bit product, and the result is then
// converted to uint32, so at most the low 32 bits of that value are returned.
//
// This is a diff view: the first return is the removed line (& 63) and the
// second is its replacement (& reg8SizeMask64).
// NOTE(review): reg8SizeMask64 is declared outside this view — presumably a
// platform-dependent mask for uint8 shift counts on 64-bit values; confirm.
func hash8(u uint64, h uint8) uint32 {
return uint32((u * prime8bytes) >> ((64 - h) & 63))
return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64))
}

// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
//
// u << (64 - 48) shifts the top two bytes of u out of the word, so only the
// lowest 6 bytes contribute to the product with prime6bytes. The right shift
// by 64-h keeps the top h bits of the 64-bit product, and the result is then
// converted to uint32, so at most the low 32 bits of that value are returned.
//
// This is a diff view: the first return is the removed line (& 63) and the
// second is its replacement (& reg8SizeMask64).
// NOTE(review): reg8SizeMask64 is declared outside this view — presumably a
// platform-dependent mask for uint8 shift counts on 64-bit values; confirm.
func hash6(u uint64, h uint8) uint32 {
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64))
}

// matchlen will return the match length between offsets and t in src.
Expand Down
32 changes: 17 additions & 15 deletions flate/gen_inflate.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ readLiteral:
return
}
f.roffset++
b |= uint32(c) << (nb & 31)
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
}
chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
Expand All @@ -104,7 +104,7 @@ readLiteral:
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & 31)
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
v = int(chunk >> huffmanValueShift)
break
Expand Down Expand Up @@ -167,33 +167,35 @@ readLiteral:
return
}
}
length += int(f.b & uint32(1<<n-1))
f.b >>= n
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
f.b >>= n & regSizeMaskUint32
f.nb -= n
}
var dist int
var dist uint32
if f.hd == nil {
for f.nb < 5 {
if err = moreBits(); err != nil {
if err = f.moreBits(); err != nil {
if debugDecode {
fmt.Println("morebits f.nb<5:", err)
}
f.err = err
return
}
}
dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
f.b >>= 5
f.nb -= 5
} else {
if dist, err = f.huffSym(f.hd); err != nil {
sym, err := f.huffSym(f.hd)
if err != nil {
if debugDecode {
fmt.Println("huffsym:", err)
}
f.err = err
return
}
dist = uint32(sym)
}
switch {
Expand All @@ -202,20 +204,20 @@ readLiteral:
case dist < maxNumDist:
nb := uint(dist-2) >> 1
// have 1 bit in bottom of dist, need nb more.
extra := (dist & 1) << nb
extra := (dist & 1) << (nb & regSizeMaskUint32)
for f.nb < nb {
if err = moreBits(); err != nil {
if err = f.moreBits(); err != nil {
if debugDecode {
fmt.Println("morebits f.nb<nb:", err)
}
f.err = err
return
}
}
extra |= int(f.b & uint32(1<<nb-1))
f.b >>= nb
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
f.b >>= nb & regSizeMaskUint32
f.nb -= nb
dist = 1<<(nb+1) + 1 + extra
dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
default:
if debugDecode {
fmt.Println("dist too big:", dist, maxNumDist)
Expand All @@ -225,15 +227,15 @@ readLiteral:
}
// No check on length; encoding can be prescient.
if dist > f.dict.histSize() {
if dist > uint32(f.dict.histSize()) {
if debugDecode {
fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
}
f.copyLen, f.copyDist = length, dist
f.copyLen, f.copyDist = length, int(dist)
goto copyHistory
}
Expand Down
8 changes: 4 additions & 4 deletions flate/huffman_bit_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func (w *huffmanBitWriter) write(b []byte) {
}

func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
w.bits |= uint64(b) << (w.nbits & 63)
w.bits |= uint64(b) << (w.nbits & reg16SizeMask64)
w.nbits += nb
if w.nbits >= 48 {
w.writeOutBits()
Expand Down Expand Up @@ -759,7 +759,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
} else {
// inlined
c := lengths[lengthCode&31]
w.bits |= uint64(c.code) << (w.nbits & 63)
w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
w.nbits += c.len
if w.nbits >= 48 {
w.writeOutBits()
Expand All @@ -779,7 +779,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
} else {
// inlined
c := offs[offsetCode&31]
w.bits |= uint64(c.code) << (w.nbits & 63)
w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
w.nbits += c.len
if w.nbits >= 48 {
w.writeOutBits()
Expand Down Expand Up @@ -878,7 +878,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
for _, t := range input {
// Bitwriting inlined, ~30% speedup
c := encoding[t]
w.bits |= uint64(c.code) << ((w.nbits) & 63)
w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64)
w.nbits += c.len
if w.nbits >= 48 {
bits := w.bits
Expand Down
38 changes: 20 additions & 18 deletions flate/inflate.go
Original file line number Diff line number Diff line change
Expand Up @@ -522,8 +522,8 @@ func (f *decompressor) readHuffman() error {
return err
}
}
rep += int(f.b & uint32(1<<nb-1))
f.b >>= nb
rep += int(f.b & uint32(1<<(nb&regSizeMaskUint32)-1))
f.b >>= nb & regSizeMaskUint32
f.nb -= nb
if i+rep > n {
if debugDecode {
Expand Down Expand Up @@ -603,7 +603,7 @@ readLiteral:
return
}
f.roffset++
b |= uint32(c) << (nb & 31)
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
}
chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
Expand All @@ -622,7 +622,7 @@ readLiteral:
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & 31)
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
v = int(chunk >> huffmanValueShift)
break
Expand Down Expand Up @@ -685,12 +685,12 @@ readLiteral:
return
}
}
length += int(f.b & uint32(1<<n-1))
f.b >>= n
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
f.b >>= n & regSizeMaskUint32
f.nb -= n
}

var dist int
var dist uint32
if f.hd == nil {
for f.nb < 5 {
if err = f.moreBits(); err != nil {
Expand All @@ -701,17 +701,19 @@ readLiteral:
return
}
}
dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
f.b >>= 5
f.nb -= 5
} else {
if dist, err = f.huffSym(f.hd); err != nil {
sym, err := f.huffSym(f.hd)
if err != nil {
if debugDecode {
fmt.Println("huffsym:", err)
}
f.err = err
return
}
dist = uint32(sym)
}

switch {
Expand All @@ -720,7 +722,7 @@ readLiteral:
case dist < maxNumDist:
nb := uint(dist-2) >> 1
// have 1 bit in bottom of dist, need nb more.
extra := (dist & 1) << nb
extra := (dist & 1) << (nb & regSizeMaskUint32)
for f.nb < nb {
if err = f.moreBits(); err != nil {
if debugDecode {
Expand All @@ -730,10 +732,10 @@ readLiteral:
return
}
}
extra |= int(f.b & uint32(1<<nb-1))
f.b >>= nb
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
f.b >>= nb & regSizeMaskUint32
f.nb -= nb
dist = 1<<(nb+1) + 1 + extra
dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
default:
if debugDecode {
fmt.Println("dist too big:", dist, maxNumDist)
Expand All @@ -743,15 +745,15 @@ readLiteral:
}

// No check on length; encoding can be prescient.
if dist > f.dict.histSize() {
if dist > uint32(f.dict.histSize()) {
if debugDecode {
fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
}
f.err = CorruptInputError(f.roffset)
return
}

f.copyLen, f.copyDist = length, dist
f.copyLen, f.copyDist = length, int(dist)
goto copyHistory
}

Expand Down Expand Up @@ -869,7 +871,7 @@ func (f *decompressor) moreBits() error {
return noEOF(err)
}
f.roffset++
f.b |= uint32(c) << f.nb
f.b |= uint32(c) << (f.nb & regSizeMaskUint32)
f.nb += 8
return nil
}
Expand All @@ -894,7 +896,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
return 0, noEOF(err)
}
f.roffset++
b |= uint32(c) << (nb & 31)
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
}
chunk := h.chunks[b&(huffmanNumChunks-1)]
Expand All @@ -913,7 +915,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
f.err = CorruptInputError(f.roffset)
return 0, f.err
}
f.b = b >> (n & 31)
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
return int(chunk >> huffmanValueShift), nil
}
Expand Down
Loading

0 comments on commit f5ee0f4

Please sign in to comment.