s2: Avoid Decode size checks (#328)

Add another decode loop that doesn't check sizes as long as src is big enough. For now, this is only in the Go version.

Streams:
```
Before: Decompressing. 426243297 -> 1000000000 [234.61%]; 1.941s, 491.3MB/s
After:  Decompressing. 426243297 -> 1000000000 [234.61%]; 1.895s, 503.3MB/s
```

Some blocks like it a lot:
```
BenchmarkDecodeS2Block/4-pdf/block-32             379     336     -11.35%
BenchmarkDecodeS2Block/4-pdf/block-better-32      691     630     -8.83%
BenchmarkDecodeS2Block/4-pdf/block-best-32        816     702     -13.97%
BenchmarkDecodeS2Block/5-html4/block-32           13645   8727    -36.04%
BenchmarkDecodeS2Block/5-html4/block-better-32    14059   9204    -34.53%
BenchmarkDecodeS2Block/5-html4/block-best-32      13729   8975    -34.63%
```
klauspost · Mar 2, 2021 · 0d78ef0 · 0d78ef0
1 parent a576225
commit 0d78ef0
Showing 1 changed file with 105 additions and 0 deletions.
diff --git a/s2/decode_other.go b/s2/decode_other.go
@@ -21,6 +21,110 @@ func s2Decode(dst, src []byte) int {
 	}
 	var d, s, length int
 	offset := 0
+
+	// As long as we can read at least 5 bytes...
+	for s < len(src)-5 {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			if debug {
+				fmt.Println("literals, length:", length, "d-after:", d+length)
+			}
+
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			length = int(src[s-2]) >> 2 & 0x7
+			toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+			if toffset == 0 {
+				if debug {
+					fmt.Print("(repeat) ")
+				}
+				// keep last offset
+				switch length {
+				case 5:
+					s += 1
+					length = int(uint32(src[s-1])) + 4
+				case 6:
+					s += 2
+					length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
+				case 7:
+					s += 3
+					length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
+				default: // 0-> 4
+				}
+			} else {
+				offset = toffset
+			}
+			length += 4
+		case tagCopy2:
+			s += 3
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+
+		if debug {
+			fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
+		}
+
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If no overlap, use the built-in copy:
+		if offset > length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
+		}
+		d += length
+	}
+
+	// Remaining with extra checks...
 	for s < len(src) {
 		switch src[s] & 0x03 {
 		case tagLiteral:
@@ -151,6 +255,7 @@ func s2Decode(dst, src []byte) int {
 		}
 		d += length
 	}
+
 	if d != len(dst) {
 		return decodeErrCodeCorrupt
 	}