diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go
index a42aa65b90..884c5c8c0f 100644
--- a/zstd/_generate/gen.go
+++ b/zstd/_generate/gen.go
@@ -14,6 +14,7 @@ import (
 	. "github.com/mmcloughlin/avo/build"
 	"github.com/mmcloughlin/avo/buildtags"
+	"github.com/mmcloughlin/avo/gotypes"
 	. "github.com/mmcloughlin/avo/operand"
 	"github.com/mmcloughlin/avo/reg"
 )
@@ -27,6 +28,12 @@ const errorMatchLenOfsMismatch = 1
 // error reported when ml > maxMatchLen
 const errorMatchLenTooBig = 2
 
+// error reported when mo > t or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
 const maxMatchLen = 131074
 
 // size of struct seqVals
@@ -50,6 +57,7 @@ func main() {
 	o := options{
 		bmi2:     false,
 		fiftysix: false,
+		useSeqs:  true,
 	}
 	o.genDecodeSeqAsm("sequenceDecs_decode_amd64")
 	o.fiftysix = true
@@ -60,9 +68,17 @@ func main() {
 	o.fiftysix = true
 	o.genDecodeSeqAsm("sequenceDecs_decode_56_bmi2")
 
-	exec := executeSimple{}
+	exec := executeSimple{
+		useSeqs: true,
+	}
 	exec.generateProcedure("sequenceDecs_executeSimple_amd64")
 
+	decodeSync := decodeSync{}
+	decodeSync.setBMI2(false)
+	decodeSync.generateProcedure("sequenceDecs_decodeSync_amd64")
+	decodeSync.setBMI2(true)
+	decodeSync.generateProcedure("sequenceDecs_decodeSync_bmi2")
+
 	Generate()
 	b, err := ioutil.ReadFile(out.Value.String())
 	if err != nil {
@@ -111,6 +127,7 @@ func assert(fn func(ok LabelRef)) {
 type options struct {
 	bmi2     bool
 	fiftysix bool // Less than max 56 bits/loop
+	useSeqs  bool // Generate code that uses the `seqs` auxiliary table
 }
 
 func (o options) genDecodeSeqAsm(name string) {
@@ -119,13 +136,23 @@ func (o options) genDecodeSeqAsm(name string) {
 	Doc(name+" decodes a sequence", "")
 	Pragma("noescape")
 
+	nop := func(ctx *executeSingleTripleContext, handleLoop func()) {}
+
+	o.generateBody(name, nop)
+}
+
+func (o options) generateBody(name string, executeSingleTriple func(ctx *executeSingleTripleContext, handleLoop func())) {
+	// registers used by `decode`
 	brValue := GP64()
 	brBitsRead := GP64()
 	brOffset := GP64()
 	llState := GP64()
 	mlState := GP64()
 	ofState := GP64()
-	seqBase := GP64()
+	var seqBase reg.GPVirtual // allocated only when o.useSeqs is true
+
+	// values used by `execute` (allocated only when o.useSeqs is false)
+	ec := executeSingleTripleContext{}
 
 	// 1. load bitReader (done once)
 	brPointerStash := AllocLocal(8)
@@ -141,26 +168,74 @@ func (o options) genDecodeSeqAsm(name string) {
 	}
 
 	// 2. load states (done once)
+	var moP Mem
+	var mlP Mem
+	var llP Mem
+
 	{
 		ctx := Dereference(Param("ctx"))
 		Load(ctx.Field("llState"), llState)
 		Load(ctx.Field("mlState"), mlState)
 		Load(ctx.Field("ofState"), ofState)
-		Load(ctx.Field("seqs").Base(), seqBase)
-	}
 
-	moP := Mem{Base: seqBase, Disp: 2 * 8} // Pointer to current mo
-	mlP := Mem{Base: seqBase, Disp: 1 * 8} // Pointer to current ml
-	llP := Mem{Base: seqBase, Disp: 0 * 8} // Pointer to current ll
+		if o.useSeqs {
+			seqBase = GP64()
+			Load(ctx.Field("seqs").Base(), seqBase)
+			moP = Mem{Base: seqBase, Disp: 2 * 8} // Pointer to current mo
+			mlP = Mem{Base: seqBase, Disp: 1 * 8} // Pointer to current ml
+			llP = Mem{Base: seqBase, Disp: 0 * 8} // Pointer to current ll
+		} else {
+			moP = AllocLocal(8)
+			mlP = AllocLocal(8)
+			llP = AllocLocal(8)
+			ec.moPtr = moP
+			ec.mlPtr = mlP
+			ec.llPtr = llP
+
+			ec.outBase = GP64()
+			ec.literals = GP64()
+			ec.outPosition = GP64()
+			ec.histLenPtr = AllocLocal(8)
+			ec.histBasePtr = AllocLocal(8)
+			ec.windowSizePtr = AllocLocal(8)
+
+			loadField := func(field gotypes.Component, target Mem) {
+				tmp := GP64()
+				Load(field, tmp)
+				MOVQ(tmp, target)
+			}
+
+			Load(ctx.Field("out").Base(), ec.outBase)
+			Load(ctx.Field("literals").Base(), ec.literals)
+			Load(ctx.Field("outPosition"), ec.outPosition)
+			loadField(ctx.Field("windowSize"), ec.windowSizePtr)
+			loadField(ctx.Field("history").Base(), ec.histBasePtr)
+			loadField(ctx.Field("history").Len(), ec.histLenPtr)
+
+			{
+				tmp := GP64()
+				MOVQ(ec.histLenPtr, tmp)
+				ADDQ(tmp, ec.histBasePtr) // Note: we always copy from &hist[len(hist) - v]
+			}
+
+			Comment("outBase += outPosition")
+			ADDQ(ec.outPosition, ec.outBase)
+		}
+	}
 
 	// Store previous offsets in registers.
 	var offsets [3]reg.GPVirtual
-	s := Dereference(Param("s"))
-	for i := range offsets {
-		offsets[i] = GP64()
-		po, _ := s.Field("prevOffset").Index(i).Resolve()
-
-		MOVQ(po.Addr, offsets[i])
+	if o.useSeqs {
+		s := Dereference(Param("s"))
+		for i := range offsets {
+			offsets[i] = GP64()
+			po, err := s.Field("prevOffset").Index(i).Resolve()
+			if err != nil {
+				panic(err)
+			}
+
+			MOVQ(po.Addr, offsets[i])
+		}
 	}
 
 	// MAIN LOOP:
@@ -226,11 +301,14 @@ func (o options) genDecodeSeqAsm(name string) {
 	}
 	Label(name + "_skip_update")
 
-	// mo = s.adjustOffset(mo, ll, moB)
 	Comment("Adjust offset")
-	offset := o.adjustOffset(name+"_adjust", moP, llP, R14, &offsets)
+
+	var offset reg.GPVirtual
+	if o.useSeqs {
+		offset = o.adjustOffset(name+"_adjust", moP, llP, R14, &offsets)
+	} else {
+		offset = o.adjustOffsetInMemory(name+"_adjust", moP, llP, R14)
+	}
 	MOVQ(offset, moP) // Store offset
 
 	Comment("Check values")
@@ -258,6 +336,7 @@ func (o options) genDecodeSeqAsm(name string) {
 		panic(err)
 	}
 	SUBQ(ll, litRemainP.Addr) // ctx.litRemain -= ll
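+	// If the subtraction went negative the sign flag is set: the sequence
+	// claims more literals than the buffer holds, so bail out early.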
+	JS(LabelRef("error_not_enough_literals"))
 
 	{
 		// if ml > maxMatchLen {
 		//    return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
@@ -276,7 +355,19 @@
 	}
 
 	Label(name + "_match_len_ofs_ok")
-	ADDQ(U8(seqValsSize), seqBase)
+
+	if !o.useSeqs {
+		handleLoop := func() {
+			JMP(LabelRef("handle_loop"))
+		}
+
+		executeSingleTriple(&ec, handleLoop)
+	}
+
+	Label("handle_loop")
+	if o.useSeqs {
+		ADDQ(U8(seqValsSize), seqBase)
+	}
 	ctx = Dereference(Param("ctx"))
 	iterationP, err := ctx.Field("iteration").Resolve()
 	if err != nil {
@@ -286,11 +377,15 @@
 	DECQ(iterationP.Addr)
 	JNS(LabelRef(name + "_main_loop"))
 
+	Label("loop_finished")
+
 	// Store offsets
-	s = Dereference(Param("s"))
-	for i := range offsets {
-		po, _ := s.Field("prevOffset").Index(i).Resolve()
-		MOVQ(offsets[i], po.Addr)
+	if o.useSeqs {
+		s := Dereference(Param("s"))
+		for i := range offsets {
+			po, _ := s.Field("prevOffset").Index(i).Resolve()
+			MOVQ(offsets[i], po.Addr)
+		}
 	}
 
 	// update bitreader state before returning
@@ -299,16 +394,71 @@ func (o options) genDecodeSeqAsm(name string) {
 	Store(brBitsRead.As8(), br.Field("bitsRead"))
 	Store(brOffset, br.Field("off"))
 
+	if !o.useSeqs {
+		Comment("Update the context")
+		ctx := Dereference(Param("ctx"))
+		Store(ec.outPosition, ctx.Field("outPosition"))
+
+		// compute litPosition
+		tmp := GP64()
+		Load(ctx.Field("literals").Base(), tmp)
+		SUBQ(tmp, ec.literals) // litPosition := current - initial literals pointer
+		Store(ec.literals, ctx.Field("litPosition"))
+	}
+
 	Comment("Return success")
 	o.returnWithCode(0)
 
 	Comment("Return with match length error")
-	Label(name + "_error_match_len_ofs_mismatch")
-	o.returnWithCode(errorMatchLenOfsMismatch)
+	{
+		Label(name + "_error_match_len_ofs_mismatch")
+		if !o.useSeqs {
+			tmp := GP64()
+			MOVQ(mlP, tmp)
+			ctx := Dereference(Param("ctx"))
+			Store(tmp, ctx.Field("ml"))
+		}
+		o.returnWithCode(errorMatchLenOfsMismatch)
+	}
 
 	Comment("Return with match too long error")
-	Label(name + "_error_match_len_too_big")
-	o.returnWithCode(errorMatchLenTooBig)
+	{
+		Label(name + "_error_match_len_too_big")
+		if !o.useSeqs {
+			ctx := Dereference(Param("ctx"))
+			tmp := GP64()
+			MOVQ(mlP, tmp)
+			Store(tmp, ctx.Field("ml"))
+		}
+		o.returnWithCode(errorMatchLenTooBig)
+	}
+
+	Comment("Return with match offset too long error")
+	{
+		Label("error_match_off_too_big")
+		if !o.useSeqs {
+			ctx := Dereference(Param("ctx"))
+			tmp := GP64()
+			MOVQ(moP, tmp)
+			Store(tmp, ctx.Field("mo"))
+			Store(ec.outPosition, ctx.Field("outPosition"))
+		}
+		o.returnWithCode(errorMatchOffTooBig)
+	}
+
+	Comment("Return with not enough literals error")
+	{
+		Label("error_not_enough_literals")
+		if !o.useSeqs {
+			ctx := Dereference(Param("ctx"))
+			tmp := GP64()
+			MOVQ(llP, tmp)
+			Store(tmp, ctx.Field("ll"))
+		}
+		// Note: the `litRemain` field is updated in-place (for both useSeqs values)
+
+		o.returnWithCode(errorNotEnoughLiterals)
+	}
 }
 
 func (o options) returnWithCode(returnCode uint32) {
@@ -594,7 +744,123 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual,
 	return offset
 }
 
-type executeSimple struct{}
+// adjustOffsetInMemory is an adjustOffset version that does not cache prevOffset values in registers.
+// It fetches and stores values directly into the fields of the `sequenceDecs` structure.
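+// The three prevOffset fields hold the repeat-offset history; the fast path
+// (offsetB > 1) rotates that history with a single 16-byte load/store pair.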
+func (o options) adjustOffsetInMemory(name string, moP, llP Mem, offsetB reg.GPVirtual) (offset reg.GPVirtual) {
+	s := Dereference(Param("s"))
+
+	po0, _ := s.Field("prevOffset").Index(0).Resolve()
+	po1, _ := s.Field("prevOffset").Index(1).Resolve()
+	po2, _ := s.Field("prevOffset").Index(2).Resolve()
+	offset = GP64()
+	MOVQ(moP, offset)
+	{
+		// if offsetB > 1 {
+		//     s.prevOffset[2] = s.prevOffset[1]
+		//     s.prevOffset[1] = s.prevOffset[0]
+		//     s.prevOffset[0] = offset
+		//     return offset
+		// }
+		CMPQ(offsetB, U8(1))
+		JBE(LabelRef(name + "_offsetB_1_or_0"))
+
+		tmp := XMM()
+		MOVUPS(po0.Addr, tmp)  // tmp = (s.prevOffset[0], s.prevOffset[1])
+		MOVQ(offset, po0.Addr) // s.prevOffset[0] = offset
+		MOVUPS(tmp, po1.Addr)  // s.prevOffset[1], s.prevOffset[2] = s.prevOffset[0], s.prevOffset[1]
+		JMP(LabelRef(name + "_end"))
+	}
+
+	Label(name + "_offsetB_1_or_0")
+	// if litLen == 0 {
+	//     offset++
+	// }
+	{
+		if true {
+			CMPQ(llP, U32(0))
+			JNE(LabelRef(name + "_offset_maybezero"))
+			INCQ(offset)
+			JMP(LabelRef(name + "_offset_nonzero"))
+		} else {
+			// No idea why this doesn't work:
+			tmp := GP64()
+			LEAQ(Mem{Base: offset, Disp: 1}, tmp)
+			CMPQ(llP, U32(0))
+			CMOVQEQ(tmp, offset)
+		}
+
+		// if offset == 0 {
+		//     return s.prevOffset[0]
+		// }
+		{
+			Label(name + "_offset_maybezero")
+			TESTQ(offset, offset)
+			JNZ(LabelRef(name + "_offset_nonzero"))
+			MOVQ(po0.Addr, offset)
+			JMP(LabelRef(name + "_end"))
+		}
+	}
+	Label(name + "_offset_nonzero")
+	{
+		// if offset == 3 {
+		//     temp = s.prevOffset[0] - 1
+		// } else {
+		//     temp = s.prevOffset[offset]
+		// }
+		//
+		// this if got transformed into:
+		//
+		// ofs   := offset
+		// shift := 0
+		// if offset == 3 {
+		//     ofs   = 0
+		//     shift = -1
+		// }
+		// temp := s.prevOffset[ofs] + shift
+		// TODO: This should be easier...
+		CX, DX, R15 := GP64(), GP64(), GP64()
+		MOVQ(offset, CX)
+		XORQ(DX, DX)
+		MOVQ(I32(-1), R15)
+		CMPQ(offset, U8(3))
+		CMOVQEQ(DX, CX)
+		CMOVQEQ(R15, DX)
+		prevOffset := GP64()
+		LEAQ(po0.Addr, prevOffset) // &prevOffset[0]
+		ADDQ(Mem{Base: prevOffset, Index: CX, Scale: 8}, DX)
+		temp := DX
+		// if temp == 0 {
+		//     temp = 1
+		// }
+		JNZ(LabelRef(name + "_temp_valid"))
+		MOVQ(U32(1), temp)
+
+		Label(name + "_temp_valid")
+		// if offset != 1 {
+		//     s.prevOffset[2] = s.prevOffset[1]
+		// }
+		CMPQ(offset, U8(1))
+		JZ(LabelRef(name + "_skip"))
+		tmp := GP64()
+		MOVQ(po1.Addr, tmp)
+		MOVQ(tmp, po2.Addr) // s.prevOffset[2] = s.prevOffset[1]
+
+		Label(name + "_skip")
+		// s.prevOffset[1] = s.prevOffset[0]
+		// s.prevOffset[0] = temp
+		tmp = GP64()
+		MOVQ(po0.Addr, tmp)
+		MOVQ(tmp, po1.Addr)  // s.prevOffset[1] = s.prevOffset[0]
+		MOVQ(temp, po0.Addr) // s.prevOffset[0] = temp
+		MOVQ(temp, offset)   // return temp
+	}
+	Label(name + "_end")
+	return offset
+}
+
+type executeSimple struct {
+	useSeqs bool // Generate code that uses the `seqs` auxiliary table
+}
 
 // copySize returns register size used to fast copy.
 //
@@ -613,7 +879,6 @@ func (e executeSimple) generateProcedure(name string) {
 	seqsLen := GP64()
 	seqIndex := GP64()
 	outBase := GP64()
-	outLen := GP64()
 	literals := GP64()
 	outPosition := GP64()
 	windowSize := GP64()
@@ -622,13 +887,13 @@ func (e executeSimple) generateProcedure(name string) {
 
 	{
 		ctx := Dereference(Param("ctx"))
+		tmp := GP64()
 		Load(ctx.Field("seqs").Len(), seqsLen)
 		TESTQ(seqsLen, seqsLen)
 		JZ(LabelRef("empty_seqs"))
 		Load(ctx.Field("seqs").Base(), seqsBase)
 		Load(ctx.Field("seqIndex"), seqIndex)
 		Load(ctx.Field("out").Base(), outBase)
-		Load(ctx.Field("out").Len(), outLen)
 		Load(ctx.Field("literals").Base(), literals)
 		Load(ctx.Field("outPosition"), outPosition)
 		Load(ctx.Field("windowSize"), windowSize)
@@ -637,7 +902,6 @@ func (e executeSimple) generateProcedure(name string) {
 		ADDQ(histLen, histBase) // Note: we always copy from &hist[len(hist) - v]
 
-		tmp := GP64()
 		Comment("seqsBase += 24 * seqIndex")
 		LEAQ(Mem{Base: seqIndex, Index: seqIndex, Scale: 2}, tmp) // * 3
 		SHLQ(U8(3), tmp)                                          // * 8
@@ -649,18 +913,10 @@ func (e executeSimple) generateProcedure(name string) {
 
 	Label("main_loop")
 
-	ml := GP64()
-	mo := GP64()
-	ll := GP64()
-
 	moPtr := Mem{Base: seqsBase, Disp: 2 * 8}
 	mlPtr := Mem{Base: seqsBase, Disp: 1 * 8}
 	llPtr := Mem{Base: seqsBase, Disp: 0 * 8}
 
-	MOVQ(llPtr, ll)
-	MOVQ(mlPtr, ml)
-	MOVQ(moPtr, mo)
-
 	// generates the loop tail
 	handleLoop := func() {
 		ADDQ(U8(seqValsSize), seqsBase) // seqs += sizeof(seqVals)
@@ -669,27 +925,120 @@ func (e executeSimple) generateProcedure(name string) {
 		JB(LabelRef("main_loop"))
 	}
 
+	ctx := executeSingleTripleContext{
+		llPtr:       llPtr,
+		moPtr:       moPtr,
+		mlPtr:       mlPtr,
+		literals:    literals,
+		outBase:     outBase,
+		outPosition: outPosition,
+		histBase:    histBase,
+		histLen:     histLen,
+		windowSize:  windowSize,
+	}
+
+	e.executeSingleTriple(&ctx, handleLoop)
+
+	Label("handle_loop")
+	handleLoop()
+
+	ret, err := ReturnIndex(0).Resolve()
+	if err != nil {
+		panic(err)
+	}
+
+	returnValue := func(val int) {
+
+		Comment("Return value")
+		MOVB(U8(val), ret.Addr)
+
+		Comment("Update the context")
+		ctx := Dereference(Param("ctx"))
+		Store(seqIndex, ctx.Field("seqIndex"))
+		Store(outPosition, ctx.Field("outPosition"))
+
+		// compute litPosition
+		tmp := GP64()
+		Load(ctx.Field("literals").Base(), tmp)
+		SUBQ(tmp, literals) // litPosition := current - initial literals pointer
+		Store(literals, ctx.Field("litPosition"))
+	}
+	Label("loop_finished")
+	returnValue(1)
+	RET()
+
+	Label("error_match_off_too_big")
+	returnValue(0)
+	RET()
+
+	Label("empty_seqs")
+	Comment("Return value")
+	MOVB(U8(1), ret.Addr)
+	RET()
+}
+
+type executeSingleTripleContext struct {
+	// common values
+	llPtr Mem
+	moPtr Mem
+	mlPtr Mem
+
+	literals    reg.GPVirtual
+	outBase     reg.GPVirtual
+	outPosition reg.GPVirtual
+
+	// values used when useSeqs is true
+	histBase   reg.GPVirtual
+	histLen    reg.GPVirtual
+	windowSize reg.GPVirtual
+
+	// values used when useSeqs is false
+	histBasePtr   Mem
+	histLenPtr    Mem
+	windowSizePtr Mem
+}
+
+// executeSingleTriple performs the copy from literals and history according
+// to the decoded values ll, mo and ml.
+func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handleLoop func()) {
+	ll := GP64()
+	MOVQ(c.llPtr, ll)
+	mo := GP64()
+	MOVQ(c.moPtr, mo)
+	ml := GP64()
+	MOVQ(c.mlPtr, ml)
+
 	Comment("Copy literals")
 	Label("copy_literals")
 	{
 		TESTQ(ll, ll)
 		JZ(LabelRef("check_offset"))
 		// TODO: Investigate if it is possible to consistently overallocate literals.
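+		// copyMemoryPrecise copies exactly ll bytes, as the output buffer is
+		// not guaranteed to have slack for a wider block copy.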
- e.copyMemoryPrecise("1", literals, outBase, ll) - - ADDQ(ll, literals) - ADDQ(ll, outBase) - ADDQ(ll, outPosition) + e.copyMemoryPrecise("1", c.literals, c.outBase, ll) + ADDQ(ll, c.literals) + ADDQ(ll, c.outBase) + ADDQ(ll, c.outPosition) } Comment("Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)") { Label("check_offset") + tmp := GP64() - LEAQ(Mem{Base: outPosition, Index: histLen, Scale: 1}, tmp) + if e.useSeqs { + LEAQ(Mem{Base: c.outPosition, Index: c.histLen, Scale: 1}, tmp) + } else { + MOVQ(c.outPosition, tmp) + ADDQ(c.histLenPtr, tmp) + } CMPQ(mo, tmp) JG(LabelRef("error_match_off_too_big")) - CMPQ(mo, windowSize) + + if e.useSeqs { + CMPQ(mo, c.windowSize) + } else { + CMPQ(mo, c.windowSizePtr) + } JG(LabelRef("error_match_off_too_big")) } @@ -697,7 +1046,9 @@ func (e executeSimple) generateProcedure(name string) { { v := GP64() MOVQ(mo, v) - SUBQ(outPosition, v) // v := seq.mo - outPosition + + // v := seq.mo - outPosition + SUBQ(c.outPosition, v) JLS(LabelRef("copy_match")) // do nothing if v <= 0 // v := seq.mo - t; v > 0 { @@ -705,11 +1056,21 @@ func (e executeSimple) generateProcedure(name string) { // ... // } assert(func(ok LabelRef) { - TESTQ(histLen, histLen) + if e.useSeqs { + TESTQ(c.histLen, c.histLen) + } else { + t := GP64() + MOVQ(c.histLenPtr, t) + TESTQ(t, t) + } JNZ(ok) }) ptr := GP64() - MOVQ(histBase, ptr) + if e.useSeqs { + MOVQ(c.histBase, ptr) + } else { + MOVQ(c.histBasePtr, ptr) + } SUBQ(v, ptr) // ptr := &hist[len(hist) - v] CMPQ(ml, v) JGE(LabelRef("copy_all_from_history")) @@ -719,10 +1080,9 @@ func (e executeSimple) generateProcedure(name string) { continue } */ - e.copyMemoryPrecise("4", ptr, outBase, ml) - - ADDQ(ml, outPosition) - ADDQ(ml, outBase) + e.copyMemoryPrecise("4", ptr, c.outBase, ml) + ADDQ(ml, c.outPosition) + ADDQ(ml, c.outBase) // Note: for the current go tests this branch is taken in 99.53% cases, // this is why we repeat a little code here. 
 		handleLoop()
@@ -737,9 +1097,9 @@ func (e executeSimple) generateProcedure(name string) {
 		    seq.ml -= v
 		}
 		*/
-		e.copyMemoryPrecise("5", ptr, outBase, v)
-		ADDQ(v, outBase)
-		ADDQ(v, outPosition)
+		e.copyMemoryPrecise("5", ptr, c.outBase, v)
+		ADDQ(v, c.outBase)
+		ADDQ(v, c.outPosition)
 		SUBQ(v, ml) // fallback to the next block
 	}
 
@@ -751,7 +1111,7 @@ func (e executeSimple) generateProcedure(name string) {
 		JZ(LabelRef("handle_loop"))
 
 		src := GP64()
-		MOVQ(outBase, src)
+		MOVQ(c.outBase, src)
 		SUBQ(mo, src) // src = &s.out[t - mo]
 
 		// start := t - mo
@@ -770,57 +1130,20 @@ func (e executeSimple) generateProcedure(name string) {
 
 		Comment("Copy non-overlapping match")
 		{
-			e.copyMemory("2", src, outBase, ml)
-			ADDQ(ml, outBase)
-			ADDQ(ml, outPosition)
+			e.copyMemoryPrecise("2", src, c.outBase, ml)
+			ADDQ(ml, c.outBase)
+			ADDQ(ml, c.outPosition)
 			JMP(LabelRef("handle_loop"))
 		}
 
 		Comment("Copy overlapping match")
 		Label("copy_overlapping_match")
 		{
-			e.copyOverlappedMemory("3", src, outBase, ml)
-			ADDQ(ml, outBase)
-			ADDQ(ml, outPosition)
+			e.copyOverlappedMemory("3", src, c.outBase, ml)
+			ADDQ(ml, c.outBase)
+			ADDQ(ml, c.outPosition)
 		}
 	}
-
-	Label("handle_loop")
-	handleLoop()
-
-	ret, err := ReturnIndex(0).Resolve()
-	if err != nil {
-		panic(err)
-	}
-
-	returnValue := func(val int) {
-
-		Comment("Return value")
-		MOVB(U8(val), ret.Addr)
-
-		Comment("Update the context")
-		ctx := Dereference(Param("ctx"))
-		Store(seqIndex, ctx.Field("seqIndex"))
-		Store(outPosition, ctx.Field("outPosition"))
-
-		// compute litPosition
-		tmp := GP64()
-		Load(ctx.Field("literals").Base(), tmp)
-		SUBQ(tmp, literals) // litPosition := current - initial literals pointer
-		Store(literals, ctx.Field("litPosition"))
-	}
-	Label("loop_finished")
-	returnValue(1)
-	RET()
-
-	Label("error_match_off_too_big")
-	returnValue(0)
-	RET()
-
-	Label("empty_seqs")
-	Comment("Return value")
-	MOVB(U8(1), ret.Addr)
-	RET()
 }
 
 // copyMemory will copy memory in blocks of 16 bytes,
@@ -916,3 +1239,21 @@ func (e executeSimple) copyOverlappedMemory(suffix string, src, dst, length reg.
 	CMPQ(ofs, length)
 	JB(LabelRef(label))
 }
+
+type decodeSync struct {
+	decode  options
+	execute executeSimple
+}
+
+func (d *decodeSync) setBMI2(flag bool) {
+	d.decode.bmi2 = flag
+}
+
+func (d *decodeSync) generateProcedure(name string) {
+	Package("github.com/klauspost/compress/zstd")
+	TEXT(name, 0, "func (s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int")
+	Doc(name+" implements the main loop of sequenceDecs.decodeSync in x86 asm", "")
+	Pragma("noescape")
+
+	d.decode.generateBody(name, d.execute.executeSingleTriple)
+}
diff --git a/zstd/decoder_test.go b/zstd/decoder_test.go
index c567be1da6..6db26a2369 100644
--- a/zstd/decoder_test.go
+++ b/zstd/decoder_test.go
@@ -806,6 +806,7 @@ func TestDecoder_Reset(t *testing.T) {
 		t.Error(err, len(decoded))
 	}
 	if !bytes.Equal(decoded, in) {
+		t.Logf("size = %d, got = %d", len(decoded), len(in))
 		t.Fatal("Decoded does not match")
 	}
 	t.Log("Encoded content matched")
@@ -1059,7 +1060,12 @@ func testDecoderFile(t *testing.T, fn string, newDec func() (*Decoder, error)) {
 			}
 			wantB := want[tt.Name]
-			if !bytes.Equal(wantB, got) {
+
+			compareWith := func(got []byte, displayName, name string) bool {
+				if bytes.Equal(wantB, got) {
+					return false
+				}
+
 				if len(wantB)+len(got) < 1000 {
 					t.Logf(" got: %v\nwant: %v", got, wantB)
 				} else {
@@ -1068,32 +1074,24 @@ func testDecoderFile(t *testing.T, fn string, newDec func() (*Decoder, error)) {
 					err := ioutil.WriteFile(fileName, wantB, os.ModePerm)
 					t.Log("Wrote file", fileName, err)
-					fileName, _ = filepath.Abs(filepath.Join("testdata", t.Name()+"-got.bin"))
+					fileName, _ = filepath.Abs(filepath.Join("testdata", t.Name()+"-"+name+".bin"))
 					_ = os.MkdirAll(filepath.Dir(fileName), os.ModePerm)
 					err = ioutil.WriteFile(fileName, got, os.ModePerm)
 					t.Log("Wrote file", fileName, err)
 				}
 				t.Logf("Length, want: %d, got: %d", len(wantB), len(got))
-				t.Error("Output mismatch")
+				t.Errorf("%s mismatch", displayName)
+				return true
+			}
+
+			if compareWith(got, "Output", "got") {
 				return
 			}
-			if !bytes.Equal(wantB, gotDecAll) {
-				if len(wantB)+len(got) < 1000 {
-					t.Logf(" got: %v\nwant: %v", got, wantB)
-				} else {
-					fileName, _ := filepath.Abs(filepath.Join("testdata", t.Name()+"-want.bin"))
-					_ = os.MkdirAll(filepath.Dir(fileName), os.ModePerm)
-					err := ioutil.WriteFile(fileName, wantB, os.ModePerm)
-					t.Log("Wrote file", fileName, err)
-					fileName, _ = filepath.Abs(filepath.Join("testdata", t.Name()+"-got.bin"))
-					_ = os.MkdirAll(filepath.Dir(fileName), os.ModePerm)
-					err = ioutil.WriteFile(fileName, got, os.ModePerm)
-					t.Log("Wrote file", fileName, err)
-				}
-				t.Logf("Length, want: %d, got: %d", len(wantB), len(got))
-				t.Error("DecodeAll Output mismatch")
+			if compareWith(gotDecAll, "DecodeAll Output", "decoded") {
 				return
 			}
+
 			t.Log(len(got), "bytes returned, matches input, ok!")
 		})
 	}
diff --git a/zstd/seqdec.go b/zstd/seqdec.go
index 37d17661e3..6e7dd7f1df 100644
--- a/zstd/seqdec.go
+++ b/zstd/seqdec.go
@@ -202,6 +202,12 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 
 // decode sequences from the stream with the provided history.
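+// The assembler fast path is tried first; decodeSyncSimple reports whether it
+// handled the input, and when it did not we fall back to the generic loop below.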
 func (s *sequenceDecs) decodeSync(hist []byte) error {
+	if true {
+		supported, err := s.decodeSyncSimple(hist)
+		if supported {
+			return err
+		}
+	}
 	br := s.br
 	seqs := s.nSeqs
 	startSize := len(s.out)
diff --git a/zstd/seqdec_amd64.go b/zstd/seqdec_amd64.go
index 68a602ae63..f1326ec13c 100644
--- a/zstd/seqdec_amd64.go
+++ b/zstd/seqdec_amd64.go
@@ -9,6 +9,124 @@ import (
 	"github.com/klauspost/compress/internal/cpuinfo"
 )
 
+type decodeSyncAsmContext struct {
+	llTable     []decSymbol
+	mlTable     []decSymbol
+	ofTable     []decSymbol
+	llState     uint64
+	mlState     uint64
+	ofState     uint64
+	iteration   int
+	litRemain   int
+	out         []byte
+	outPosition int
+	literals    []byte
+	litPosition int
+	history     []byte
+	windowSize  int
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
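+	// Dictionaries and output buffers without spare capacity for a full
+	// compressed block are left to the generic decoder.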
+	if len(s.dict) > 0 || cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+		return false, nil
+	}
+
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeSyncAsmContext{
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
+		llState:     uint64(s.litLengths.state.state),
+		mlState:     uint64(s.matchLengths.state.state),
+		ofState:     uint64(s.offsets.state.state),
+		iteration:   s.nSeqs - 1,
+		litRemain:   len(s.literals),
+		out:         s.out,
+		outPosition: len(s.out),
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+		history:     hist,
+	}
+
+	s.seqSize = 0
+	startSize := len(s.out)
+
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+	} else {
+		errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+	}
+	switch errCode {
+	case noError:
+		break
+
+	case errorMatchLenOfsMismatch:
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+	case errorMatchLenTooBig:
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+	case errorMatchOffTooBig:
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+	case errorNotEnoughLiterals:
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d are available",
+			ctx.ll, ctx.litRemain+ctx.ll)
+
+	default:
+		return true, fmt.Errorf("sequenceDecs_decode returned erroneous code %d", errCode)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+		return true, err
+	}
+
+	s.literals = s.literals[ctx.litPosition:]
+	t := ctx.outPosition
+	s.out = s.out[:t]
+
+	// Add final literals
+	s.out = append(s.out, s.literals...)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(s.out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+		}
+	}
+
+	return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
 type decodeAsmContext struct {
 	llTable   []decSymbol
 	mlTable   []decSymbol
@@ -21,12 +139,20 @@ type decodeAsmContext struct {
 	litRemain int
 }
 
+const noError = 0
+
 // error reported when mo == 0 && ml > 0
 const errorMatchLenOfsMismatch = 1
 
 // error reported when ml > maxMatchLen
 const errorMatchLenTooBig = 2
 
+// error reported when mo > t or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
@@ -94,6 +220,10 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	case errorMatchLenTooBig:
 		ml := ctx.seqs[i].ml
 		return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+	case errorNotEnoughLiterals:
+		ll := ctx.seqs[i].ll
+		return fmt.Errorf("unexpected literal count, want %d bytes, but only %d are available", ll, ctx.litRemain+ll)
 	}
 
 	return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode
@@ -115,6 +245,8 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	return err
 }
 
+// --------------------------------------------------------------------------------
+
 type executeAsmContext struct {
 	seqs     []seqVals
 	seqIndex int
diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s
index e12bece2e6..70e302b543 100644
--- a/zstd/seqdec_amd64.s
+++ b/zstd/seqdec_amd64.s
@@ -261,6 +261,7 @@ sequenceDecs_decode_amd64_adjust_end:
 	ADDQ R15, 256(BP)
 	MOVQ ctx+16(FP), R15
 	SUBQ R14, 128(R15)
+	JS error_not_enough_literals
 	CMPQ AX, $0x00020002
 	JA sequenceDecs_decode_amd64_error_match_len_too_big
 	TESTQ CX, CX
@@ -296,6 +297,15 @@ sequenceDecs_decode_amd64_error_match_len_too_big:
 	MOVQ $0x00000002, ret+24(FP)
 	RET
 
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
 // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: CMOV
 TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
@@ -529,6 +539,7 @@ sequenceDecs_decode_56_amd64_adjust_end:
 	ADDQ R15, 256(BP)
 	MOVQ ctx+16(FP), R15
 	SUBQ R14, 128(R15)
+	JS error_not_enough_literals
 	CMPQ AX, $0x00020002
 	JA sequenceDecs_decode_56_amd64_error_match_len_too_big
 	TESTQ CX, CX
@@ -564,6 +575,15 @@ sequenceDecs_decode_56_amd64_error_match_len_too_big:
 	MOVQ $0x00000002, ret+24(FP)
 	RET
 
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
 // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: BMI, BMI2, CMOV
 TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
@@ -800,6 +820,7 @@ sequenceDecs_decode_bmi2_adjust_end:
 	ADDQ R15, 256(BP)
 	MOVQ ctx+16(FP), R15
 	SUBQ R14, 128(R15)
+	JS error_not_enough_literals
 	CMPQ R13, $0x00020002
 	JA sequenceDecs_decode_bmi2_error_match_len_too_big
 	TESTQ CX, CX
@@ -835,6 +856,15 @@ sequenceDecs_decode_bmi2_error_match_len_too_big:
 	MOVQ $0x00000002, ret+24(FP)
 	RET
 
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
 // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: BMI, BMI2, CMOV
 TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
@@ -1046,6 +1076,7 @@ sequenceDecs_decode_56_bmi2_adjust_end:
 	ADDQ R15, 256(BP)
 	MOVQ ctx+16(FP), R15
 	SUBQ R14, 128(R15)
+	JS error_not_enough_literals
 	CMPQ R13, $0x00020002
 	JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
 	TESTQ CX, CX
@@ -1081,23 +1112,31 @@ sequenceDecs_decode_56_bmi2_error_match_len_too_big:
 	MOVQ $0x00000002, ret+24(FP)
 	RET
 
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
 // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
 // Requires: SSE
 TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
-	MOVQ ctx+0(FP), DI
-	MOVQ 8(DI), CX
+	MOVQ ctx+0(FP), R10
+	MOVQ 8(R10), CX
 	TESTQ CX, CX
 	JZ empty_seqs
-	MOVQ (DI), AX
-	MOVQ 24(DI), DX
-	MOVQ 32(DI), BX
-	MOVQ 40(DI), SI
-	MOVQ 80(DI), SI
-	MOVQ 104(DI), R8
-	MOVQ 120(DI), R9
-	MOVQ 56(DI), R10
-	MOVQ 64(DI), DI
-	ADDQ DI, R10
+	MOVQ (R10), AX
+	MOVQ 24(R10), DX
+	MOVQ 32(R10), BX
+	MOVQ 80(R10), SI
+	MOVQ 104(R10), DI
+	MOVQ 120(R10), R8
+	MOVQ 56(R10), R9
+	MOVQ 64(R10), R10
+	ADDQ R10, R9
 
 	// seqsBase += 24 * seqIndex
 	LEAQ (DX)(DX*2), R11
@@ -1105,39 +1144,39 @@ TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
 	ADDQ R11, AX
 
 	// outBase += outPosition
-	ADDQ R8, BX
+	ADDQ DI, BX
 
main_loop:
-	MOVQ (AX), R13
-	MOVQ 8(AX), R11
+	MOVQ (AX), R11
 	MOVQ 16(AX), R12
+	MOVQ 8(AX), R13
 
 	// Copy literals
-	TESTQ R13, R13
+	TESTQ R11, R11
 	JZ check_offset
 	XORQ R14, R14
-	TESTQ $0x00000001, R13
+	TESTQ $0x00000001, R11
 	JZ copy_1_word
 	MOVB (SI)(R14*1), R15
 	MOVB R15, (BX)(R14*1)
 	ADDQ $0x01, R14
 
copy_1_word:
-	TESTQ $0x00000002, R13
+	TESTQ $0x00000002, R11
 	JZ copy_1_dword
 	MOVW (SI)(R14*1), R15
 	MOVW R15, (BX)(R14*1)
 	ADDQ $0x02, R14
 
copy_1_dword:
-	TESTQ $0x00000004, R13
+	TESTQ $0x00000004, R11
 	JZ copy_1_qword
 	MOVL (SI)(R14*1), R15
 	MOVL R15, (BX)(R14*1)
 	ADDQ $0x04, R14
 
copy_1_qword:
-	TESTQ $0x00000008, R13
+	TESTQ $0x00000008, R11
 	JZ copy_1_test
 	MOVQ (SI)(R14*1), R15
 	MOVQ R15, (BX)(R14*1)
@@ -1150,67 +1189,67 @@ copy_1:
 	ADDQ $0x10, R14
 
copy_1_test:
-	CMPQ R14, R13
+	CMPQ R14, R11
 	JB copy_1
-	ADDQ R13, SI
-	ADDQ R13, BX
-	ADDQ R13, R8
+	ADDQ R11, SI
+	ADDQ R11, BX
+	ADDQ R11, DI
 
 	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
-	LEAQ (R8)(DI*1), R13
-	CMPQ R12, R13
+	LEAQ (DI)(R10*1), R11
+	CMPQ R12, R11
 	JG error_match_off_too_big
-	CMPQ R12, R9
+	CMPQ R12, R8
 	JG error_match_off_too_big
 
 	// Copy match from history
-	MOVQ R12, R13
-	SUBQ R8, R13
+	MOVQ R12, R11
+	SUBQ DI, R11
 	JLS copy_match
-	MOVQ R10, R14
-	SUBQ R13, R14
-	CMPQ R11, R13
+	MOVQ R9, R14
+	SUBQ R11, R14
+	CMPQ R13, R11
 	JGE copy_all_from_history
-	XORQ R12, R12
-	TESTQ $0x00000001, R11
+	XORQ R11, R11
+	TESTQ $0x00000001, R13
 	JZ copy_4_word
-	MOVB (R14)(R12*1), R13
-	MOVB R13, (BX)(R12*1)
-	ADDQ $0x01, R12
+	MOVB (R14)(R11*1), R12
+	MOVB R12, (BX)(R11*1)
+	ADDQ $0x01, R11
 
copy_4_word:
-	TESTQ $0x00000002, R11
+	TESTQ $0x00000002, R13
 	JZ copy_4_dword
-	MOVW (R14)(R12*1), R13
-	MOVW R13, (BX)(R12*1)
-	ADDQ $0x02, R12
+	MOVW (R14)(R11*1), R12
+	MOVW R12, (BX)(R11*1)
+	ADDQ $0x02, R11
 
copy_4_dword:
-	TESTQ $0x00000004, R11
+	TESTQ $0x00000004, R13
 	JZ copy_4_qword
-	MOVL (R14)(R12*1), R13
-	MOVL R13, (BX)(R12*1)
-	ADDQ $0x04, R12
+	MOVL (R14)(R11*1), R12
+	MOVL R12, (BX)(R11*1)
+	ADDQ $0x04, R11
 
copy_4_qword:
-	TESTQ $0x00000008, R11
+	TESTQ $0x00000008, R13
 	JZ copy_4_test
-	MOVQ (R14)(R12*1), R13
-	MOVQ R13, (BX)(R12*1)
-	ADDQ $0x08, R12
+	MOVQ (R14)(R11*1), R12
+	MOVQ R12, (BX)(R11*1)
+	ADDQ $0x08, R11
 	JMP copy_4_test
 
copy_4:
-	MOVUPS (R14)(R12*1), X0
-	MOVUPS X0, (BX)(R12*1)
-	ADDQ $0x10, R12
+	MOVUPS (R14)(R11*1), X0
+	MOVUPS X0, (BX)(R11*1)
+	ADDQ $0x10, R11
 
copy_4_test:
-	CMPQ R12, R11
+	CMPQ R11, R13
 	JB copy_4
-	ADDQ R11, R8
-	ADDQ R11, BX
+	ADDQ R13, DI
+	ADDQ R13, BX
 	ADDQ $0x18, AX
 	INCQ DX
 	CMPQ DX, CX
@@ -1219,28 +1258,28 @@ copy_4_test:
 
copy_all_from_history:
 	XORQ R15, R15
-	TESTQ $0x00000001, R13
+	TESTQ $0x00000001, R11
 	JZ copy_5_word
 	MOVB (R14)(R15*1), BP
 	MOVB BP, (BX)(R15*1)
 	ADDQ $0x01, R15
 
copy_5_word:
-	TESTQ $0x00000002, R13
+	TESTQ $0x00000002, R11
 	JZ copy_5_dword
 	MOVW (R14)(R15*1), BP
 	MOVW BP, (BX)(R15*1)
 	ADDQ $0x02, R15
 
copy_5_dword:
-	TESTQ $0x00000004, R13
+	TESTQ $0x00000004, R11
 	JZ copy_5_qword
 	MOVL (R14)(R15*1), BP
 	MOVL BP, (BX)(R15*1)
 	ADDQ $0x04, R15
 
copy_5_qword:
-	TESTQ $0x00000008, R13
+	TESTQ $0x00000008, R11
 	JZ copy_5_test
 	MOVQ (R14)(R15*1), BP
 	MOVQ BP, (BX)(R15*1)
@@ -1253,48 +1292,77 @@ copy_5:
 	ADDQ $0x10, R15
 
copy_5_test:
-	CMPQ R15, R13
+	CMPQ R15, R11
 	JB copy_5
-	ADDQ R13, BX
-	ADDQ R13, R8
-	SUBQ R13, R11
+	ADDQ R11, BX
+	ADDQ R11, DI
+	SUBQ R11, R13
 
 	// Copy match from the current buffer
copy_match:
-	TESTQ R11, R11
+	TESTQ R13, R13
 	JZ handle_loop
-	MOVQ BX, R13
-	SUBQ R12, R13
+	MOVQ BX, R11
+	SUBQ R12, R11
 
 	// ml <= mo
-	CMPQ R11, R12
+	CMPQ R13, R12
 	JA copy_overlapping_match
 
 	// Copy non-overlapping match
-	XORQ R12, R12
+	XORQ R12, R12
+	TESTQ $0x00000001, R13
+	JZ copy_2_word
+	MOVB (R11)(R12*1), R14
+	MOVB R14, (BX)(R12*1)
+	ADDQ $0x01, R12
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ copy_2_dword
+	MOVW (R11)(R12*1), R14
+	MOVW R14, (BX)(R12*1)
+	ADDQ $0x02, R12
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ copy_2_qword
+	MOVL (R11)(R12*1), R14
+	MOVL R14, (BX)(R12*1)
+	ADDQ $0x04, R12
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ copy_2_test
+	MOVQ (R11)(R12*1), R14
+	MOVQ R14, (BX)(R12*1)
+	ADDQ $0x08, R12
+	JMP copy_2_test
 
copy_2:
-	MOVUPS (R13)(R12*1), X0
+	MOVUPS (R11)(R12*1), X0
 	MOVUPS X0, (BX)(R12*1)
 	ADDQ $0x10, R12
-	CMPQ R12, R11
-	JB copy_2
-	ADDQ R11, BX
-	ADDQ R11, R8
-	JMP handle_loop
+
+copy_2_test:
+	CMPQ R12, R13
+	JB copy_2
+	ADDQ R13, BX
+	ADDQ R13, DI
+	JMP handle_loop
 
 	// Copy overlapping match
copy_overlapping_match:
 	XORQ R12, R12
 
copy_slow_3:
-	MOVB (R13)(R12*1), R14
+	MOVB (R11)(R12*1), R14
 	MOVB R14, (BX)(R12*1)
 	INCQ R12
-	CMPQ R12, R11
+	CMPQ R12, R13
 	JB copy_slow_3
-	ADDQ R11, BX
-	ADDQ R11, R8
+	ADDQ R13, BX
+	ADDQ R13, DI
 
handle_loop:
 	ADDQ $0x18, AX
@@ -1309,7 +1377,7 @@ loop_finished:
 	// Update the context
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
-	MOVQ R8, 104(AX)
+	MOVQ DI, 104(AX)
 	MOVQ 80(AX), CX
 	SUBQ CX, SI
 	MOVQ SI, 112(AX)
@@ -1322,7 +1390,7 @@ error_match_off_too_big:
 	// Update the context
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
-	MOVQ R8, 104(AX)
+	MOVQ DI, 104(AX)
 	MOVQ 80(AX), CX
 	SUBQ CX, SI
 	MOVQ SI, 112(AX)
@@ -1332,3 +1400,1053 @@ empty_seqs:
 	// Return value
 	MOVB $0x01, ret+8(FP)
 	RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $56-32
+	MOVQ br+8(FP), AX
+	MOVQ 32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ 24(AX), SI
+	MOVQ (AX), AX
+	ADDQ SI, AX
+	MOVQ AX, (SP)
+	MOVQ ctx+16(FP), AX
+	MOVQ 72(AX), DI
+	MOVQ 80(AX), R8
+	MOVQ 88(AX), R9
+	MOVQ 112(AX), R10
+	MOVQ 144(AX), R11
+	MOVQ 136(AX), R12
+	MOVQ 200(AX), CX
+	MOVQ CX, 48(SP)
+	MOVQ 176(AX), CX
+	MOVQ CX, 40(SP)
+	MOVQ 184(AX), AX
+	MOVQ AX, 32(SP)
+	MOVQ 32(SP), AX
+	ADDQ AX, 40(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+	CMPQ SI, $0x00
+	JLE sequenceDecs_decodeSync_amd64_fill_end
+	CMPQ BX, $0x07
+	JLE sequenceDecs_decodeSync_amd64_fill_end
+	SHLQ $0x08, DX
+	SUBQ $0x01, R13
+	SUBQ $0x01, SI
+	SUBQ $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ AX, DX
+	JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+	// Update offset
+	MOVQ R9, AX
+	MOVQ BX, CX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVB AH, CL
+	ADDQ CX, BX
+	NEGL CX
+	SHRQ CL, R14
+	SHRQ $0x20, AX
+	TESTQ CX, CX
+	CMOVQEQ CX, R14
+	ADDQ R14, AX
+	MOVQ AX, 8(SP)
+
+	// Update match length
+	MOVQ R8, AX
+	MOVQ BX, CX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVB AH, CL
+	ADDQ CX, BX
+	NEGL CX
+	SHRQ CL, R14
+	SHRQ $0x20, AX
+	TESTQ CX, CX
+	CMOVQEQ CX, R14
+	ADDQ R14, AX
+	MOVQ AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+	CMPQ SI, $0x00
+	JLE sequenceDecs_decodeSync_amd64_fill_2_end
+	CMPQ BX, $0x07
+	JLE sequenceDecs_decodeSync_amd64_fill_2_end
+	SHLQ $0x08, DX
+	SUBQ $0x01, R13
+	SUBQ $0x01, SI
+	SUBQ $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ AX, DX
+	JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+	// Update literal length
+	MOVQ DI, AX
+	MOVQ BX, CX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVB AH, CL
+	ADDQ CX, BX
+	NEGL CX
+	SHRQ CL, R14
+	SHRQ $0x20, AX
+	TESTQ CX, CX
+	CMOVQEQ CX, R14
+	ADDQ R14, AX
+	MOVQ AX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ R13, (SP)
+	MOVQ R9, AX
+	SHRQ $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ ctx+16(FP), CX
+	CMPQ 96(CX), $0x00
+	JZ sequenceDecs_decodeSync_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ R13, $0x00
+	JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
+	MOVQ BX, CX
+	ADDQ R13, BX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVQ R13, CX
+	NEGQ CX
+	SHRQ CL, R14
+	ADDQ R14, DI
+
+sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ R13, $0x00
+	JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
+	MOVQ BX, CX
+	ADDQ R13, BX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVQ R13, CX
+	NEGQ CX
+	SHRQ CL, R14
+	ADDQ R14, R8
+
+sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ R13, $0x00
+	JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
+	MOVQ BX, CX
+	ADDQ R13, BX
+	MOVQ DX, R14
+	SHLQ CL, R14
+	MOVQ R13, CX
+	NEGQ CX
+	SHRQ CL, R14
+	ADDQ R14, R9
+
+sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+	// Adjust offset
+	MOVQ s+0(FP), CX
+	MOVQ 8(SP), R13
+	CMPQ AX, $0x01
+	JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+	MOVQ 144(CX), R13
+	JMP sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+	MOVQ R13, AX
+	XORQ R14, R14
+	MOVQ $-1, R15
+	CMPQ R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	LEAQ 144(CX), R15
+	ADDQ (R15)(AX*8), R14
+	JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
+	MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ sequenceDecs_decodeSync_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ 16(SP), AX
+	MOVQ 24(SP), CX
+	LEAQ (AX)(CX*1), R14
+	MOVQ s+0(FP), R15
+	ADDQ R14, 256(R15)
+	MOVQ ctx+16(FP), R14
+	SUBQ CX, 104(R14)
+	JS error_not_enough_literals
+	CMPQ AX, $0x00020002
+	JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ check_offset
+	XORQ R14, R14
+	TESTQ $0x00000001, AX
+	JZ copy_1_word
+	MOVB (R11)(R14*1), R15
+	MOVB R15, (R10)(R14*1)
+	ADDQ $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ copy_1_dword
+	MOVW (R11)(R14*1), R15
+	MOVW R15, (R10)(R14*1)
+	ADDQ $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ copy_1_qword
+	MOVL (R11)(R14*1), R15
+	MOVL R15, (R10)(R14*1)
+	ADDQ $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ copy_1_test
+	MOVQ (R11)(R14*1), R15
+	MOVQ R15, (R10)(R14*1)
+	ADDQ $0x08, R14
+	JMP copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB copy_1
+	ADDQ AX, R11
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R12, AX
+	ADDQ 32(SP), AX
+	CMPQ CX, AX
+	JG error_match_off_too_big
+	CMPQ CX, 48(SP)
+	JG error_match_off_too_big
+
+	// Copy match from history
+	MOVQ CX, AX
+	SUBQ R12, AX
+	JLS copy_match
+	MOVQ 40(SP), R14
+	SUBQ AX, R14
+	CMPQ R13, AX
+	JGE copy_all_from_history
+	XORQ AX, AX
+	TESTQ $0x00000001, R13
+	JZ copy_4_word
+	MOVB (R14)(AX*1), CL
+	MOVB CL, (R10)(AX*1)
+	ADDQ $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ copy_4_dword
+	MOVW (R14)(AX*1), CX
+	MOVW CX, (R10)(AX*1)
+	ADDQ $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ copy_4_qword
+	MOVL (R14)(AX*1), CX
+	MOVL CX, (R10)(AX*1)
+	ADDQ $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ copy_4_test
+	MOVQ (R14)(AX*1), CX
+	MOVQ CX, (R10)(AX*1)
+	ADDQ $0x08, AX
+	JMP copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ R15, R15
+	TESTQ $0x00000001, AX
+	JZ copy_5_word
+	MOVB (R14)(R15*1), BP
+	MOVB BP, (R10)(R15*1)
+	ADDQ $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ copy_5_dword
+	MOVW (R14)(R15*1), BP
+	MOVW BP, (R10)(R15*1)
+	ADDQ $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ copy_5_qword
+	MOVL (R14)(R15*1), BP
+	MOVL BP, (R10)(R15*1)
+	ADDQ $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ copy_5_test
+	MOVQ (R14)(R15*1), BP
+	MOVQ BP, (R10)(R15*1)
+	ADDQ $0x08, R15
+	JMP copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ handle_loop
+	MOVQ R10, AX
+	SUBQ CX, AX
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA copy_overlapping_match
+
+	// Copy non-overlapping match
+	XORQ CX, CX
+	TESTQ $0x00000001, R13
+	JZ copy_2_word
+	MOVB (AX)(CX*1), R14
+	MOVB R14, (R10)(CX*1)
+	ADDQ $0x01, CX
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ copy_2_dword
+	MOVW (AX)(CX*1), R14
+	MOVW R14, (R10)(CX*1)
+	ADDQ $0x02, CX
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ copy_2_qword
+	MOVL (AX)(CX*1), R14
+	MOVL R14, (R10)(CX*1)
+	ADDQ $0x04, CX
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ copy_2_test
+	MOVQ (AX)(CX*1), R14
+	MOVQ R14, (R10)(CX*1)
+	ADDQ $0x08, CX
+	JMP copy_2_test
+
+copy_2:
+	MOVUPS (AX)(CX*1), X0
+	MOVUPS X0, (R10)(CX*1)
+	ADDQ $0x10, CX
+
+copy_2_test:
+	CMPQ CX, R13
+	JB copy_2
+	ADDQ R13, R10
+	ADDQ R13, R12
+	JMP handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	XORQ CX, CX
+
+copy_slow_3:
+	MOVB (AX)(CX*1), R14
+	MOVB R14, (R10)(CX*1)
+	INCQ CX
+	CMPQ CX, R13
+	JB copy_slow_3
+	ADDQ R13, R10
+	ADDQ R13, R12
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS sequenceDecs_decodeSync_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $56-32
+	MOVQ br+8(FP), CX
+	MOVQ 32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ 24(CX), BX
+	MOVQ (CX), CX
+	ADDQ BX, CX
+	MOVQ CX, (SP)
+	MOVQ ctx+16(FP), CX
+	MOVQ 72(CX), SI
+	MOVQ 80(CX), DI
+	MOVQ 88(CX), R8
+	MOVQ 112(CX), R9
+	MOVQ 144(CX), R10
+	MOVQ 136(CX), R11
+	MOVQ 200(CX), R12
+	MOVQ R12, 48(SP)
+	MOVQ 176(CX), R12
+	MOVQ R12, 40(SP)
+	MOVQ 184(CX), CX
+	MOVQ CX, 32(SP)
+	MOVQ 32(SP), CX
+	ADDQ CX, 40(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+	CMPQ BX, $0x00
+	JLE sequenceDecs_decodeSync_bmi2_fill_end
+	CMPQ DX, $0x07
+	JLE sequenceDecs_decodeSync_bmi2_fill_end
+	SHLQ $0x08, AX
+	SUBQ $0x01, R12
+	SUBQ $0x01, BX
+	SUBQ $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ CX, AX
+	JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+	// Update offset
+	MOVQ $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ AX, R14
+	LEAQ (DX)(R13*1), CX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	MOVQ CX, DX
+	MOVQ R8, CX
+	SHRQ $0x20, CX
+	ADDQ R14, CX
+	MOVQ CX, 8(SP)
+
+	// Update match length
+	MOVQ $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ AX, R14
+	LEAQ (DX)(R13*1), CX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	MOVQ CX, DX
+	MOVQ DI, CX
+	SHRQ $0x20, CX
+	ADDQ R14, CX
+	MOVQ CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+	CMPQ BX, $0x00
+	JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+	CMPQ DX, $0x07
+	JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+	SHLQ $0x08, AX
+	SUBQ $0x01, R12
+	SUBQ $0x01, BX
+	SUBQ $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ CX, AX
+	JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ AX, R14
+	LEAQ (DX)(R13*1), CX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	MOVQ CX, DX
+	MOVQ SI, CX
+	SHRQ $0x20, CX
+	ADDQ R14, CX
+	MOVQ CX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ R12, (SP)
+	MOVQ $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ ctx+16(FP), CX
+	CMPQ 96(CX), $0x00
+	JZ sequenceDecs_decodeSync_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ $0x00001010, CX
+	BEXTRQ CX, SI, SI
+	LEAQ (DX)(R13*1), CX
+	MOVQ AX, R14
+	MOVQ CX, DX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	ADDQ R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ $0x00001010, CX
+	BEXTRQ CX, DI, DI
+	LEAQ (DX)(R13*1), CX
+	MOVQ AX, R14
+	MOVQ CX, DX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	ADDQ R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ $0x00001010, CX
+	BEXTRQ CX, R8, R8
+	LEAQ (DX)(R13*1), CX
+	MOVQ AX, R14
+	MOVQ CX, DX
+	ROLQ CL, R14
+	BZHIQ R13, R14, R14
+	ADDQ R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+	// Adjust offset
+	MOVQ s+0(FP), CX
+	MOVQ 8(SP), R13
+	CMPQ R12, $0x01
+	JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+	MOVQ 144(CX), R13
+	JMP sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+	MOVQ R13, R12
+	XORQ R14, R14
+	MOVQ $-1, R15
+	CMPQ R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ 144(CX), R15
+	ADDQ (R15)(R12*8), R14
+	JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+	MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ sequenceDecs_decodeSync_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ 16(SP), CX
+	MOVQ 24(SP), R12
+	LEAQ (CX)(R12*1), R14
+	MOVQ s+0(FP), R15
+	ADDQ R14, 256(R15)
+	MOVQ ctx+16(FP), R14
+	SUBQ R12, 104(R14)
+	JS error_not_enough_literals
+	CMPQ CX, $0x00020002
+	JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ check_offset
+	XORQ R14, R14
+	TESTQ $0x00000001, CX
+	JZ copy_1_word
+	MOVB (R10)(R14*1), R15
+	MOVB R15, (R9)(R14*1)
+	ADDQ $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ copy_1_dword
+	MOVW (R10)(R14*1), R15
+	MOVW R15, (R9)(R14*1)
+	ADDQ $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ copy_1_qword
+	MOVL (R10)(R14*1), R15
+	MOVL R15, (R9)(R14*1)
+	ADDQ $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ copy_1_test
+	MOVQ (R10)(R14*1), R15
+	MOVQ R15, (R9)(R14*1)
+	ADDQ $0x08, R14
+	JMP copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R11, CX
+	ADDQ 32(SP), CX
+	CMPQ R12, CX
+	JG error_match_off_too_big
+	CMPQ R12, 48(SP)
+	JG error_match_off_too_big
+
+	// Copy match from history
+	MOVQ R12, CX
+	SUBQ R11, CX
+	JLS copy_match
+	MOVQ 40(SP), R14
+	SUBQ CX, R14
+	CMPQ R13, CX
+	JGE copy_all_from_history
+	XORQ CX, CX
+	TESTQ $0x00000001, R13
+	JZ copy_4_word
+	MOVB (R14)(CX*1), R12
+	MOVB R12, (R9)(CX*1)
+	ADDQ $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ copy_4_dword
+	MOVW (R14)(CX*1), R12
+	MOVW R12, (R9)(CX*1)
+	ADDQ $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ copy_4_qword
+	MOVL (R14)(CX*1), R12
+	MOVL R12, (R9)(CX*1)
+	ADDQ $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ copy_4_test
+	MOVQ (R14)(CX*1), R12
+	MOVQ R12, (R9)(CX*1)
+	ADDQ $0x08, CX
+	JMP copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ R15, R15
+	TESTQ $0x00000001, CX
+	JZ copy_5_word
+	MOVB (R14)(R15*1), BP
+	MOVB BP, (R9)(R15*1)
+	ADDQ $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ copy_5_dword
+	MOVW (R14)(R15*1), BP
+	MOVW BP, (R9)(R15*1)
+	ADDQ $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ copy_5_qword
+	MOVL (R14)(R15*1), BP
+	MOVL BP, (R9)(R15*1)
+	ADDQ $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ copy_5_test
+	MOVQ (R14)(R15*1), BP
+	MOVQ BP, (R9)(R15*1)
+	ADDQ $0x08, R15
+	JMP copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ handle_loop
+	MOVQ R9, CX
+	SUBQ R12, CX
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA copy_overlapping_match
+
+	// Copy non-overlapping match
+	XORQ R12, R12
+	TESTQ $0x00000001, R13
+	JZ copy_2_word
+	MOVB (CX)(R12*1), R14
+	MOVB R14, (R9)(R12*1)
+	ADDQ $0x01, R12
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ copy_2_dword
+	MOVW (CX)(R12*1), R14
+	MOVW R14, (R9)(R12*1)
+	ADDQ $0x02, R12
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ copy_2_qword
+	MOVL (CX)(R12*1), R14
+	MOVL R14, (R9)(R12*1)
+	ADDQ $0x04, R12
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ copy_2_test
+	MOVQ (CX)(R12*1), R14
+	MOVQ R14, (R9)(R12*1)
+	ADDQ $0x08, R12
+	JMP copy_2_test
+
+copy_2:
+	MOVUPS (CX)(R12*1), X0
+	MOVUPS X0, (R9)(R12*1)
+	ADDQ $0x10, R12
+
+copy_2_test:
+	CMPQ R12, R13
+	JB copy_2
+	ADDQ R13, R9
+	ADDQ R13, R11
+	JMP handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	XORQ R12, R12
+
+copy_slow_3:
+	MOVB (CX)(R12*1), R14
+	MOVB R14, (R9)(R12*1)
+	INCQ R12
+	CMPQ R12, R13
+	JB copy_slow_3
+	ADDQ R13, R9
+	ADDQ R13, R11
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
diff --git a/zstd/seqdec_generic.go b/zstd/seqdec_generic.go
index 3775ed5f93..c3452bc3a9 100644
--- a/zstd/seqdec_generic.go
+++ b/zstd/seqdec_generic.go
@@ -8,6 +8,11 @@ import (
 	"io"
 )
 
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	return false, nil
+}
+
 // decode sequences from the stream without the provided history.
 func (s *sequenceDecs) decode(seqs []seqVals) error {
 	br := s.br