Skip to content

Commit

Permalink
huff0: Speed up compress1xDo (#774)
Browse files Browse the repository at this point in the history
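A specialized encFourSymbols produces better code than two inlined
encTwoSymbols calls in a row. Its arguments need to differ from those of
encTwoSymbols (already looked-up cTableEntry values instead of the table
plus symbol bytes) to make the inlining work. Benchmark results on amd64: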

name                                      old speed      new speed      delta
Compress1XReuseNone/digits-8               438MB/s ± 1%   461MB/s ± 1%   +5.18%  (p=0.000 n=10+9)
Compress1XReuseNone/gettysburg-8           254MB/s ± 1%   254MB/s ± 1%     ~     (p=0.412 n=10+9)
Compress1XReuseNone/twain-8                363MB/s ± 1%   367MB/s ± 0%   +1.05%  (p=0.000 n=10+10)
Compress1XReuseNone/low-ent.10k-8          466MB/s ± 0%   485MB/s ± 1%   +4.01%  (p=0.000 n=9+10)
Compress1XReuseNone/superlow-ent-10k-8     305MB/s ± 0%   313MB/s ± 1%   +2.49%  (p=0.000 n=8+9)
Compress1XReuseNone/crash2-8              11.4MB/s ± 0%  11.4MB/s ± 1%     ~     (p=0.458 n=9+8)
Compress1XReuseNone/endzerobits-8         15.7MB/s ± 2%  15.8MB/s ± 1%     ~     (p=0.085 n=10+8)
Compress1XReuseNone/endnonzero-8          7.64MB/s ± 1%  7.65MB/s ± 1%     ~     (p=0.957 n=10+10)
Compress1XReuseNone/case1-8               14.6MB/s ± 1%  14.7MB/s ± 1%     ~     (p=0.381 n=10+10)
Compress1XReuseNone/case2-8               12.3MB/s ± 1%  12.3MB/s ± 0%     ~     (p=0.097 n=9+9)
Compress1XReuseNone/case3-8               13.2MB/s ± 1%  13.1MB/s ± 1%     ~     (p=0.540 n=10+10)
Compress1XReuseNone/pngdata.001-8          302MB/s ± 0%   302MB/s ± 1%     ~     (p=0.815 n=8+9)
Compress1XReuseNone/normcount2-8          34.9MB/s ± 0%  34.9MB/s ± 1%     ~     (p=0.646 n=9+10)
Compress1XReuseAllow/digits-8              444MB/s ± 1%   465MB/s ± 1%   +4.71%  (p=0.000 n=10+10)
Compress1XReuseAllow/gettysburg-8          282MB/s ± 0%   283MB/s ± 1%   +0.39%  (p=0.002 n=9+8)
Compress1XReuseAllow/twain-8               366MB/s ± 1%   369MB/s ± 1%   +1.01%  (p=0.000 n=10+10)
Compress1XReuseAllow/low-ent.10k-8         470MB/s ± 1%   488MB/s ± 0%   +3.82%  (p=0.000 n=9+9)
Compress1XReuseAllow/superlow-ent-10k-8    308MB/s ± 1%   313MB/s ± 1%   +1.83%  (p=0.000 n=10+10)
Compress1XReuseAllow/crash2-8             16.0MB/s ± 1%  16.0MB/s ± 0%     ~     (p=0.356 n=10+10)
Compress1XReuseAllow/endzerobits-8        17.0MB/s ± 0%  17.1MB/s ± 0%   +0.43%  (p=0.001 n=10+9)
Compress1XReuseAllow/endnonzero-8         12.0MB/s ± 1%  12.0MB/s ± 1%     ~     (p=0.858 n=10+9)
Compress1XReuseAllow/case1-8              18.2MB/s ± 1%  18.2MB/s ± 1%     ~     (p=0.724 n=10+10)
Compress1XReuseAllow/case2-8              15.5MB/s ± 1%  15.3MB/s ± 3%   -1.02%  (p=0.049 n=10+10)
Compress1XReuseAllow/case3-8              16.4MB/s ± 0%  16.4MB/s ± 1%     ~     (p=0.887 n=9+10)
Compress1XReuseAllow/pngdata.001-8         303MB/s ± 0%   304MB/s ± 0%   +0.35%  (p=0.000 n=9+9)
Compress1XReuseAllow/normcount2-8         45.0MB/s ± 1%  45.1MB/s ± 0%     ~     (p=0.075 n=9+10)
Compress1XReusePrefer/digits-8             447MB/s ± 1%   467MB/s ± 0%   +4.42%  (p=0.000 n=9+9)
Compress1XReusePrefer/gettysburg-8         425MB/s ± 1%   429MB/s ± 0%   +0.87%  (p=0.000 n=10+10)
Compress1XReusePrefer/twain-8              367MB/s ± 1%   371MB/s ± 0%   +1.11%  (p=0.000 n=10+10)
Compress1XReusePrefer/low-ent.10k-8        474MB/s ± 1%   494MB/s ± 0%   +4.22%  (p=0.000 n=10+10)
Compress1XReusePrefer/superlow-ent-10k-8   313MB/s ± 0%   320MB/s ± 0%   +2.09%  (p=0.000 n=10+9)
Compress1XReusePrefer/crash2-8            63.2MB/s ± 1%  62.9MB/s ± 1%     ~     (p=0.159 n=10+10)
Compress1XReusePrefer/endzerobits-8       24.8MB/s ± 2%  24.9MB/s ± 1%     ~     (p=0.674 n=9+10)
Compress1XReusePrefer/endnonzero-8        33.8MB/s ± 0%  33.9MB/s ± 0%   +0.27%  (p=0.004 n=10+9)
Compress1XReusePrefer/case1-8              150MB/s ± 7%   152MB/s ± 2%     ~     (p=0.175 n=10+9)
Compress1XReusePrefer/case2-8              144MB/s ± 0%   146MB/s ± 1%   +1.11%  (p=0.000 n=10+9)
Compress1XReusePrefer/case3-8              160MB/s ± 0%   160MB/s ± 0%     ~     (p=0.593 n=10+10)
Compress1XReusePrefer/pngdata.001-8        313MB/s ± 1%   314MB/s ± 1%     ~     (p=0.110 n=10+10)
Compress1XReusePrefer/normcount2-8         212MB/s ± 1%   215MB/s ± 0%   +1.24%  (p=0.000 n=10+10)
Compress4XReuseNone/digits-8               444MB/s ± 0%   461MB/s ± 6%   +3.99%  (p=0.008 n=7+9)
Compress4XReuseNone/gettysburg-8           252MB/s ± 1%   251MB/s ± 2%     ~     (p=0.604 n=10+9)
Compress4XReuseNone/twain-8                364MB/s ± 0%   367MB/s ± 1%     ~     (p=0.243 n=9+10)
Compress4XReuseNone/low-ent.10k-8          469MB/s ± 0%   489MB/s ± 1%   +4.18%  (p=0.000 n=9+10)
Compress4XReuseNone/superlow-ent-10k-8     304MB/s ± 1%   315MB/s ± 1%   +3.38%  (p=0.000 n=10+10)
Compress4XReuseNone/case1-8               14.5MB/s ± 0%  14.4MB/s ± 3%     ~     (p=0.619 n=9+9)
Compress4XReuseNone/case2-8               12.1MB/s ± 0%  11.9MB/s ± 2%   -1.44%  (p=0.004 n=10+10)
Compress4XReuseNone/case3-8               12.9MB/s ± 0%  12.9MB/s ± 3%     ~     (p=0.827 n=9+10)
Compress4XReuseNone/pngdata.001-8          301MB/s ± 0%   300MB/s ± 2%     ~     (p=1.000 n=10+10)
Compress4XReuseNone/normcount2-8          34.2MB/s ± 1%  34.0MB/s ± 4%     ~     (p=0.698 n=10+10)
Compress4XReuseAllow/digits-8              445MB/s ± 0%   470MB/s ± 0%   +5.43%  (p=0.000 n=10+10)
Compress4XReuseAllow/gettysburg-8          278MB/s ± 0%   280MB/s ± 1%   +0.48%  (p=0.006 n=9+10)
Compress4XReuseAllow/twain-8               365MB/s ± 0%   368MB/s ± 1%   +0.95%  (p=0.000 n=10+10)
Compress4XReuseAllow/low-ent.10k-8         471MB/s ± 1%   497MB/s ± 0%   +5.62%  (p=0.000 n=10+8)
Compress4XReuseAllow/superlow-ent-10k-8    307MB/s ± 1%   316MB/s ± 1%   +3.03%  (p=0.000 n=10+10)
Compress4XReuseAllow/case1-8              17.8MB/s ± 1%  17.8MB/s ± 0%   +0.36%  (p=0.006 n=10+9)
Compress4XReuseAllow/case2-8              15.0MB/s ± 0%  15.0MB/s ± 1%   -0.35%  (p=0.032 n=8+9)
Compress4XReuseAllow/case3-8              15.9MB/s ± 0%  15.9MB/s ± 0%     ~     (p=0.556 n=9+9)
Compress4XReuseAllow/pngdata.001-8         302MB/s ± 0%   303MB/s ± 1%   +0.40%  (p=0.003 n=10+10)
Compress4XReuseAllow/normcount2-8         42.3MB/s ± 7%  43.4MB/s ± 0%     ~     (p=0.108 n=9+10)
Compress4XReusePrefer/digits-8             428MB/s ± 7%   472MB/s ± 0%  +10.29%  (p=0.000 n=10+8)
Compress4XReusePrefer/gettysburg-8         417MB/s ± 1%   421MB/s ± 1%   +1.03%  (p=0.000 n=9+9)
Compress4XReusePrefer/twain-8              362MB/s ± 4%   370MB/s ± 0%   +2.14%  (p=0.000 n=9+9)
Compress4XReusePrefer/low-ent.10k-8        470MB/s ± 1%   501MB/s ± 0%   +6.67%  (p=0.000 n=9+9)
Compress4XReusePrefer/superlow-ent-10k-8   307MB/s ± 3%   322MB/s ± 0%   +4.79%  (p=0.000 n=10+9)
Compress4XReusePrefer/case1-8              129MB/s ± 3%   134MB/s ± 1%   +3.70%  (p=0.000 n=10+10)
Compress4XReusePrefer/case2-8              120MB/s ± 2%   122MB/s ± 1%   +1.65%  (p=0.001 n=9+10)
Compress4XReusePrefer/case3-8              130MB/s ± 1%   131MB/s ± 0%   +0.79%  (p=0.005 n=10+7)
Compress4XReusePrefer/pngdata.001-8        312MB/s ± 0%   313MB/s ± 0%   +0.34%  (p=0.043 n=10+9)
Compress4XReusePrefer/normcount2-8         183MB/s ± 2%   184MB/s ± 0%   +0.72%  (p=0.011 n=10+10)
Compress1XSizes/digits-100-8              63.0MB/s ± 2%  63.2MB/s ± 2%     ~     (p=0.684 n=10+10)
Compress1XSizes/digits-200-8               111MB/s ± 2%   112MB/s ± 1%   +1.68%  (p=0.000 n=10+10)
Compress1XSizes/digits-500-8               204MB/s ± 2%   207MB/s ± 1%   +1.73%  (p=0.002 n=9+9)
Compress1XSizes/digits-1000-8              287MB/s ± 3%   295MB/s ± 1%   +2.66%  (p=0.000 n=10+10)
Compress1XSizes/digits-5000-8              423MB/s ± 1%   441MB/s ± 1%   +4.34%  (p=0.000 n=9+10)
Compress1XSizes/digits-10000-8             443MB/s ± 1%   460MB/s ± 1%   +3.96%  (p=0.000 n=9+10)
Compress1XSizes/digits-50000-8             442MB/s ± 0%   461MB/s ± 0%   +4.49%  (p=0.000 n=8+10)
Compress4XSizes/digits-100-8              61.6MB/s ± 0%  61.5MB/s ± 1%     ~     (p=0.310 n=9+8)
Compress4XSizes/digits-200-8               108MB/s ± 1%   108MB/s ± 1%   +0.51%  (p=0.033 n=8+10)
Compress4XSizes/digits-500-8               202MB/s ± 1%   206MB/s ± 1%   +2.03%  (p=0.000 n=9+9)
Compress4XSizes/digits-1000-8              280MB/s ± 2%   292MB/s ± 1%   +4.47%  (p=0.000 n=10+10)
Compress4XSizes/digits-5000-8              419MB/s ± 0%   448MB/s ± 1%   +6.98%  (p=0.000 n=8+10)
Compress4XSizes/digits-10000-8             442MB/s ± 1%   474MB/s ± 0%   +7.24%  (p=0.000 n=8+9)
Compress4XSizes/digits-50000-8             437MB/s ± 2%   471MB/s ± 0%   +7.70%  (p=0.000 n=10+10)
  • Loading branch information
greatroar committed Mar 12, 2023
1 parent d900f26 commit 34dac29
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
16 changes: 16 additions & 0 deletions huff0/bitwriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}

// encFourSymbols adds up to 32 bits from four symbols.
// The four codes are first packed into one 64-bit word, with encA in the
// lowest bits followed by encB, encC and encD, and that word is then ORed
// into b.bitContainer after being shifted left by b.nBits.
// Unlike encTwoSymbols, it takes the table entries themselves
// rather than the table and the symbol bytes.
// It will not check if there is space for them,
// so the caller must ensure that b has been flushed recently.
func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
// Running totals of the code lengths: bitsA, bitsB and bitsC are the bit
// offsets at which encB, encC and encD start inside combined, and bitsD is
// the total number of bits contributed by all four symbols.
bitsA := encA.nBits
bitsB := bitsA + encB.nBits
bitsC := bitsB + encC.nBits
bitsD := bitsC + encD.nBits
// Each value is shifted past the codes that precede it and ORed together.
// NOTE(review): the "& 63" masks presumably let the compiler emit plain
// 64-bit shifts without handling out-of-range shift counts — confirm.
combined := uint64(encA.val) |
(uint64(encB.val) << (bitsA & 63)) |
(uint64(encC.val) << (bitsB & 63)) |
(uint64(encD.val) << (bitsC & 63))
// Place the packed group at bit offset b.nBits in the container, then
// advance the bit count by the group's total width.
b.bitContainer |= combined << (b.nBits & 63)
b.nBits += bitsD
}

// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
Expand Down
3 changes: 1 addition & 2 deletions huff0/compress.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
tmp := src[n : n+4]
// tmp should be len 4
bw.flush32()
bw.encTwoSymbols(cTable, tmp[3], tmp[2])
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
}
} else {
for ; n >= 0; n -= 4 {
Expand Down

0 comments on commit 34dac29

Please sign in to comment.