From 34dac29a9a3fa3bc6d1bd7735695c4d0e9486754 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sun, 12 Mar 2023 10:00:02 +0100 Subject: [PATCH] huff0: Speed up compress1xDo (#774) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A specialized encFourSymbols produces better code than two inlined encTwoSymbols calls in a row. Its arguments need to be different to make the inlining work. Benchmark results on amd64: name old speed new speed delta Compress1XReuseNone/digits-8 438MB/s ± 1% 461MB/s ± 1% +5.18% (p=0.000 n=10+9) Compress1XReuseNone/gettysburg-8 254MB/s ± 1% 254MB/s ± 1% ~ (p=0.412 n=10+9) Compress1XReuseNone/twain-8 363MB/s ± 1% 367MB/s ± 0% +1.05% (p=0.000 n=10+10) Compress1XReuseNone/low-ent.10k-8 466MB/s ± 0% 485MB/s ± 1% +4.01% (p=0.000 n=9+10) Compress1XReuseNone/superlow-ent-10k-8 305MB/s ± 0% 313MB/s ± 1% +2.49% (p=0.000 n=8+9) Compress1XReuseNone/crash2-8 11.4MB/s ± 0% 11.4MB/s ± 1% ~ (p=0.458 n=9+8) Compress1XReuseNone/endzerobits-8 15.7MB/s ± 2% 15.8MB/s ± 1% ~ (p=0.085 n=10+8) Compress1XReuseNone/endnonzero-8 7.64MB/s ± 1% 7.65MB/s ± 1% ~ (p=0.957 n=10+10) Compress1XReuseNone/case1-8 14.6MB/s ± 1% 14.7MB/s ± 1% ~ (p=0.381 n=10+10) Compress1XReuseNone/case2-8 12.3MB/s ± 1% 12.3MB/s ± 0% ~ (p=0.097 n=9+9) Compress1XReuseNone/case3-8 13.2MB/s ± 1% 13.1MB/s ± 1% ~ (p=0.540 n=10+10) Compress1XReuseNone/pngdata.001-8 302MB/s ± 0% 302MB/s ± 1% ~ (p=0.815 n=8+9) Compress1XReuseNone/normcount2-8 34.9MB/s ± 0% 34.9MB/s ± 1% ~ (p=0.646 n=9+10) Compress1XReuseAllow/digits-8 444MB/s ± 1% 465MB/s ± 1% +4.71% (p=0.000 n=10+10) Compress1XReuseAllow/gettysburg-8 282MB/s ± 0% 283MB/s ± 1% +0.39% (p=0.002 n=9+8) Compress1XReuseAllow/twain-8 366MB/s ± 1% 369MB/s ± 1% +1.01% (p=0.000 n=10+10) Compress1XReuseAllow/low-ent.10k-8 470MB/s ± 1% 488MB/s ± 0% +3.82% (p=0.000 n=9+9) Compress1XReuseAllow/superlow-ent-10k-8 308MB/s ± 1% 313MB/s ± 1% +1.83% (p=0.000 n=10+10) 
Compress1XReuseAllow/crash2-8 16.0MB/s ± 1% 16.0MB/s ± 0% ~ (p=0.356 n=10+10) Compress1XReuseAllow/endzerobits-8 17.0MB/s ± 0% 17.1MB/s ± 0% +0.43% (p=0.001 n=10+9) Compress1XReuseAllow/endnonzero-8 12.0MB/s ± 1% 12.0MB/s ± 1% ~ (p=0.858 n=10+9) Compress1XReuseAllow/case1-8 18.2MB/s ± 1% 18.2MB/s ± 1% ~ (p=0.724 n=10+10) Compress1XReuseAllow/case2-8 15.5MB/s ± 1% 15.3MB/s ± 3% -1.02% (p=0.049 n=10+10) Compress1XReuseAllow/case3-8 16.4MB/s ± 0% 16.4MB/s ± 1% ~ (p=0.887 n=9+10) Compress1XReuseAllow/pngdata.001-8 303MB/s ± 0% 304MB/s ± 0% +0.35% (p=0.000 n=9+9) Compress1XReuseAllow/normcount2-8 45.0MB/s ± 1% 45.1MB/s ± 0% ~ (p=0.075 n=9+10) Compress1XReusePrefer/digits-8 447MB/s ± 1% 467MB/s ± 0% +4.42% (p=0.000 n=9+9) Compress1XReusePrefer/gettysburg-8 425MB/s ± 1% 429MB/s ± 0% +0.87% (p=0.000 n=10+10) Compress1XReusePrefer/twain-8 367MB/s ± 1% 371MB/s ± 0% +1.11% (p=0.000 n=10+10) Compress1XReusePrefer/low-ent.10k-8 474MB/s ± 1% 494MB/s ± 0% +4.22% (p=0.000 n=10+10) Compress1XReusePrefer/superlow-ent-10k-8 313MB/s ± 0% 320MB/s ± 0% +2.09% (p=0.000 n=10+9) Compress1XReusePrefer/crash2-8 63.2MB/s ± 1% 62.9MB/s ± 1% ~ (p=0.159 n=10+10) Compress1XReusePrefer/endzerobits-8 24.8MB/s ± 2% 24.9MB/s ± 1% ~ (p=0.674 n=9+10) Compress1XReusePrefer/endnonzero-8 33.8MB/s ± 0% 33.9MB/s ± 0% +0.27% (p=0.004 n=10+9) Compress1XReusePrefer/case1-8 150MB/s ± 7% 152MB/s ± 2% ~ (p=0.175 n=10+9) Compress1XReusePrefer/case2-8 144MB/s ± 0% 146MB/s ± 1% +1.11% (p=0.000 n=10+9) Compress1XReusePrefer/case3-8 160MB/s ± 0% 160MB/s ± 0% ~ (p=0.593 n=10+10) Compress1XReusePrefer/pngdata.001-8 313MB/s ± 1% 314MB/s ± 1% ~ (p=0.110 n=10+10) Compress1XReusePrefer/normcount2-8 212MB/s ± 1% 215MB/s ± 0% +1.24% (p=0.000 n=10+10) Compress4XReuseNone/digits-8 444MB/s ± 0% 461MB/s ± 6% +3.99% (p=0.008 n=7+9) Compress4XReuseNone/gettysburg-8 252MB/s ± 1% 251MB/s ± 2% ~ (p=0.604 n=10+9) Compress4XReuseNone/twain-8 364MB/s ± 0% 367MB/s ± 1% ~ (p=0.243 n=9+10) Compress4XReuseNone/low-ent.10k-8 469MB/s ± 0% 
489MB/s ± 1% +4.18% (p=0.000 n=9+10) Compress4XReuseNone/superlow-ent-10k-8 304MB/s ± 1% 315MB/s ± 1% +3.38% (p=0.000 n=10+10) Compress4XReuseNone/case1-8 14.5MB/s ± 0% 14.4MB/s ± 3% ~ (p=0.619 n=9+9) Compress4XReuseNone/case2-8 12.1MB/s ± 0% 11.9MB/s ± 2% -1.44% (p=0.004 n=10+10) Compress4XReuseNone/case3-8 12.9MB/s ± 0% 12.9MB/s ± 3% ~ (p=0.827 n=9+10) Compress4XReuseNone/pngdata.001-8 301MB/s ± 0% 300MB/s ± 2% ~ (p=1.000 n=10+10) Compress4XReuseNone/normcount2-8 34.2MB/s ± 1% 34.0MB/s ± 4% ~ (p=0.698 n=10+10) Compress4XReuseAllow/digits-8 445MB/s ± 0% 470MB/s ± 0% +5.43% (p=0.000 n=10+10) Compress4XReuseAllow/gettysburg-8 278MB/s ± 0% 280MB/s ± 1% +0.48% (p=0.006 n=9+10) Compress4XReuseAllow/twain-8 365MB/s ± 0% 368MB/s ± 1% +0.95% (p=0.000 n=10+10) Compress4XReuseAllow/low-ent.10k-8 471MB/s ± 1% 497MB/s ± 0% +5.62% (p=0.000 n=10+8) Compress4XReuseAllow/superlow-ent-10k-8 307MB/s ± 1% 316MB/s ± 1% +3.03% (p=0.000 n=10+10) Compress4XReuseAllow/case1-8 17.8MB/s ± 1% 17.8MB/s ± 0% +0.36% (p=0.006 n=10+9) Compress4XReuseAllow/case2-8 15.0MB/s ± 0% 15.0MB/s ± 1% -0.35% (p=0.032 n=8+9) Compress4XReuseAllow/case3-8 15.9MB/s ± 0% 15.9MB/s ± 0% ~ (p=0.556 n=9+9) Compress4XReuseAllow/pngdata.001-8 302MB/s ± 0% 303MB/s ± 1% +0.40% (p=0.003 n=10+10) Compress4XReuseAllow/normcount2-8 42.3MB/s ± 7% 43.4MB/s ± 0% ~ (p=0.108 n=9+10) Compress4XReusePrefer/digits-8 428MB/s ± 7% 472MB/s ± 0% +10.29% (p=0.000 n=10+8) Compress4XReusePrefer/gettysburg-8 417MB/s ± 1% 421MB/s ± 1% +1.03% (p=0.000 n=9+9) Compress4XReusePrefer/twain-8 362MB/s ± 4% 370MB/s ± 0% +2.14% (p=0.000 n=9+9) Compress4XReusePrefer/low-ent.10k-8 470MB/s ± 1% 501MB/s ± 0% +6.67% (p=0.000 n=9+9) Compress4XReusePrefer/superlow-ent-10k-8 307MB/s ± 3% 322MB/s ± 0% +4.79% (p=0.000 n=10+9) Compress4XReusePrefer/case1-8 129MB/s ± 3% 134MB/s ± 1% +3.70% (p=0.000 n=10+10) Compress4XReusePrefer/case2-8 120MB/s ± 2% 122MB/s ± 1% +1.65% (p=0.001 n=9+10) Compress4XReusePrefer/case3-8 130MB/s ± 1% 131MB/s ± 0% +0.79% (p=0.005 
n=10+7) Compress4XReusePrefer/pngdata.001-8 312MB/s ± 0% 313MB/s ± 0% +0.34% (p=0.043 n=10+9) Compress4XReusePrefer/normcount2-8 183MB/s ± 2% 184MB/s ± 0% +0.72% (p=0.011 n=10+10) Compress1XSizes/digits-100-8 63.0MB/s ± 2% 63.2MB/s ± 2% ~ (p=0.684 n=10+10) Compress1XSizes/digits-200-8 111MB/s ± 2% 112MB/s ± 1% +1.68% (p=0.000 n=10+10) Compress1XSizes/digits-500-8 204MB/s ± 2% 207MB/s ± 1% +1.73% (p=0.002 n=9+9) Compress1XSizes/digits-1000-8 287MB/s ± 3% 295MB/s ± 1% +2.66% (p=0.000 n=10+10) Compress1XSizes/digits-5000-8 423MB/s ± 1% 441MB/s ± 1% +4.34% (p=0.000 n=9+10) Compress1XSizes/digits-10000-8 443MB/s ± 1% 460MB/s ± 1% +3.96% (p=0.000 n=9+10) Compress1XSizes/digits-50000-8 442MB/s ± 0% 461MB/s ± 0% +4.49% (p=0.000 n=8+10) Compress4XSizes/digits-100-8 61.6MB/s ± 0% 61.5MB/s ± 1% ~ (p=0.310 n=9+8) Compress4XSizes/digits-200-8 108MB/s ± 1% 108MB/s ± 1% +0.51% (p=0.033 n=8+10) Compress4XSizes/digits-500-8 202MB/s ± 1% 206MB/s ± 1% +2.03% (p=0.000 n=9+9) Compress4XSizes/digits-1000-8 280MB/s ± 2% 292MB/s ± 1% +4.47% (p=0.000 n=10+10) Compress4XSizes/digits-5000-8 419MB/s ± 0% 448MB/s ± 1% +6.98% (p=0.000 n=8+10) Compress4XSizes/digits-10000-8 442MB/s ± 1% 474MB/s ± 0% +7.24% (p=0.000 n=8+9) Compress4XSizes/digits-50000-8 437MB/s ± 2% 471MB/s ± 0% +7.70% (p=0.000 n=10+10) --- huff0/bitwriter.go | 16 ++++++++++++++++ huff0/compress.go | 3 +-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/huff0/bitwriter.go b/huff0/bitwriter.go index ec71f7a349..aed2347ced 100644 --- a/huff0/bitwriter.go +++ b/huff0/bitwriter.go @@ -60,6 +60,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) { b.nBits += encA.nBits + encB.nBits } +// encFourSymbols adds up to 32 bits from four symbols. +// It will not check if there is space for them, +// so the caller must ensure that b has been flushed recently. 
+func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
+	bitsA := encA.nBits
+	bitsB := bitsA + encB.nBits
+	bitsC := bitsB + encC.nBits
+	bitsD := bitsC + encD.nBits
+	combined := uint64(encA.val) |
+		(uint64(encB.val) << (bitsA & 63)) |
+		(uint64(encC.val) << (bitsB & 63)) |
+		(uint64(encD.val) << (bitsC & 63))
+	b.bitContainer |= combined << (b.nBits & 63)
+	b.nBits += bitsD
+}
+
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
diff --git a/huff0/compress.go b/huff0/compress.go
index cdc94856f2..4ee4fa18dd 100644
--- a/huff0/compress.go
+++ b/huff0/compress.go
@@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
 			tmp := src[n : n+4]
 			// tmp should be len 4
 			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
-			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
+			bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
 		}
 	} else {
 		for ; n >= 0; n -= 4 {