Skip to content

Commit

Permalink
zstd: Improve decoder memcopy (#637)
Browse files Browse the repository at this point in the history
Up to 25% faster decodes, depending on contents.

Use s2 memcopier and eliminate a zero check.


```
benchmark                                                                                       old MB/s     new MB/s     speedup
Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32        1284.77      1493.64      1.16x
Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32       1107.87      1580.86      1.43x
Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32               3947.25      4163.99      1.05x
Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32        10281.12     10375.47     1.01x
Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32        8115.99      8862.70      1.09x
Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32         1578.08      2306.80      1.46x
Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32                  17079.65     15875.41     0.93x
Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32     2020.09      2077.16      1.03x
Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32                   35781.31     35736.03     1.00x
Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32                    33125.43     32874.37     0.99x
Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32                19394.38     19785.45     1.02x
Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32                   10494.30     10229.09     0.97x
Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32              7425.77      8034.31      1.08x
Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32                    2855.17      3336.71      1.17x
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-32                                                    537.74       653.10        1.21x
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-32                                                1500.59      1610.70       1.07x
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-32                                                 410.13       508.09        1.24x
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-32                                                   467.83       602.22        1.29x
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-32                                                 434.53       528.57        1.22x
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-32                                                  433.95       544.60        1.25x
BenchmarkDecoder_DecoderSmall/html_x_4.zst-32                                                     2860.31      3199.64       1.12x
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-32                                               5336.43      5422.59       1.02x
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-32                                               12327.10     12324.96      1.00x
BenchmarkDecoder_DecoderSmall/urls.10K.zst-32                                                     660.52       769.09        1.16x
BenchmarkDecoder_DecoderSmall/html.zst-32                                                         1076.67      1286.06       1.19x
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-32                                                569.30       574.46        1.01x
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32                                                       812.16       822.43        1.01x
BenchmarkDecoder_DecodeAll/geo.protodata.zst-32                                                   1943.14      1906.88       0.98x
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32                                                    712.27       723.91        1.02x
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32                                                      688.23       781.85        1.14x
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32                                                    702.87       714.37        1.02x
BenchmarkDecoder_DecodeAll/alice29.txt.zst-32                                                     717.44       738.78        1.03x
BenchmarkDecoder_DecodeAll/html_x_4.zst-32                                                        1960.55      1975.63       1.01x
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32                                                  5981.50      6118.97       1.02x
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32                                                  13140.18     13126.95      1.00x
BenchmarkDecoder_DecodeAll/urls.10K.zst-32                                                        983.71       979.34        1.00x
BenchmarkDecoder_DecodeAll/html.zst-32                                                            1624.80      1585.31       0.98x
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32                                                   569.84       572.56        1.00x
BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/fastest-32                                  504.31       623.48        1.24x
BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/default-32                                  564.68       723.22        1.28x
BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/better-32                                   615.18       781.33        1.27x
BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/best-32                                     786.17       862.88        1.10x
BenchmarkDecoder_DecodeAllFiles/.tracker.bin/fastest-32                                           12860.99     12908.39      1.00x
BenchmarkDecoder_DecodeAllFiles/.tracker.bin/default-32                                           619.06       626.95        1.01x
BenchmarkDecoder_DecodeAllFiles/.tracker.bin/better-32                                            630.33       628.85        1.00x
BenchmarkDecoder_DecodeAllFiles/.tracker.bin/best-32                                              609.12       616.50        1.01x
BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-32                              658.22       669.16        1.02x
BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-32                              723.60       741.86        1.03x
BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-32                               735.73       750.40        1.02x
BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-32                                 745.43       764.97        1.03x
BenchmarkDecoder_DecodeAllFiles/e.txt/fastest-32                                                  12801.86     13043.13      1.02x
BenchmarkDecoder_DecodeAllFiles/e.txt/default-32                                                  680.29       683.65        1.00x
BenchmarkDecoder_DecodeAllFiles/e.txt/better-32                                                   739.23       748.08        1.01x
BenchmarkDecoder_DecodeAllFiles/e.txt/best-32                                                     820.16       828.45        1.01x
BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/fastest-32                                      1186.63      1177.03       0.99x
BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/default-32                                      1384.74      1383.55       1.00x
BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/better-32                                       1104.17      1114.92       1.01x
BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/best-32                                         409.59       409.66        1.00x
BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/fastest-32                                         392.32       390.94        1.00x
BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/default-32                                         296.47       295.87        1.00x
BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/better-32                                          296.52       296.60        1.00x
BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/best-32                                            299.85       298.91        1.00x
BenchmarkDecoder_DecodeAllFiles/html.txt/fastest-32                                               988.75       999.28        1.01x
BenchmarkDecoder_DecodeAllFiles/html.txt/default-32                                               987.11       1018.97       1.03x
BenchmarkDecoder_DecodeAllFiles/html.txt/better-32                                                1027.64      1030.76       1.00x
BenchmarkDecoder_DecodeAllFiles/html.txt/best-32                                                  973.41       989.37        1.02x
BenchmarkDecoder_DecodeAllFiles/pi.txt/fastest-32                                                 12976.96     12976.25      1.00x
BenchmarkDecoder_DecodeAllFiles/pi.txt/default-32                                                 678.88       680.77        1.00x
BenchmarkDecoder_DecodeAllFiles/pi.txt/better-32                                                  746.38       751.28        1.01x
BenchmarkDecoder_DecodeAllFiles/pi.txt/best-32                                                    823.52       833.27        1.01x
BenchmarkDecoder_DecodeAllFiles/pngdata.bin/fastest-32                                            2115.58      2106.14       1.00x
BenchmarkDecoder_DecodeAllFiles/pngdata.bin/default-32                                            1767.98      1767.57       1.00x
BenchmarkDecoder_DecodeAllFiles/pngdata.bin/better-32                                             2306.86      2288.16       0.99x
BenchmarkDecoder_DecodeAllFiles/pngdata.bin/best-32                                               1660.52      1667.53       1.00x
BenchmarkDecoder_DecodeAllFiles/sharnd.out/fastest-32                                             13027.08     13044.50      1.00x
BenchmarkDecoder_DecodeAllFiles/sharnd.out/default-32                                             13054.18     13081.06      1.00x
BenchmarkDecoder_DecodeAllFiles/sharnd.out/better-32                                              13067.23     13066.65      1.00x
BenchmarkDecoder_DecodeAllFiles/sharnd.out/best-32                                                13079.77     13061.36      1.00x
BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/fastest-32                                 10354.84     11876.83      1.15x
BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/default-32                                 11557.12     13415.35      1.16x
BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/better-32                                  12644.67     14515.52      1.15x
BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/best-32                                    15934.00     17307.06      1.09x
BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/fastest-32                                          35354.57     35307.64      1.00x
BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/default-32                                          11392.27     11353.17      1.00x
BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/better-32                                           11793.77     11733.41      0.99x
BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/best-32                                             11203.91     11174.37      1.00x
BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-32                             12089.54     12097.65      1.00x
BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-32                             12604.67     12647.83      1.00x
BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-32                              13265.79     13275.92      1.00x
BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-32                                13078.85     13130.26      1.00x
BenchmarkDecoder_DecodeAllFilesP/e.txt/fastest-32                                                 52477.17     51848.17      0.99x
BenchmarkDecoder_DecodeAllFilesP/e.txt/default-32                                                 11947.06     11922.24      1.00x
BenchmarkDecoder_DecodeAllFilesP/e.txt/better-32                                                  13184.17     13223.10      1.00x
BenchmarkDecoder_DecodeAllFilesP/e.txt/best-32                                                    14630.26     14702.42      1.00x
BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/fastest-32                                     3013.25      3025.30       1.00x
BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/default-32                                     3125.61      2976.92       0.95x
BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/better-32                                      3181.68      3162.28       0.99x
BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/best-32                                        3351.22      3372.69       1.01x
BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/fastest-32                                        1188.15      1147.96       0.97x
BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/default-32                                        1215.39      1156.01       0.95x
BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/better-32                                         1219.20      1177.16       0.97x
BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/best-32                                           1216.72      1170.21       0.96x
BenchmarkDecoder_DecodeAllFilesP/html.txt/fastest-32                                              16901.32     17180.70      1.02x
BenchmarkDecoder_DecodeAllFilesP/html.txt/default-32                                              16819.66     16997.40      1.01x
BenchmarkDecoder_DecodeAllFilesP/html.txt/better-32                                               17805.12     17946.54      1.01x
BenchmarkDecoder_DecodeAllFilesP/html.txt/best-32                                                 16916.87     17294.25      1.02x
BenchmarkDecoder_DecodeAllFilesP/pi.txt/fastest-32                                                52314.15     52657.88      1.01x
BenchmarkDecoder_DecodeAllFilesP/pi.txt/default-32                                                11878.94     11796.12      0.99x
BenchmarkDecoder_DecodeAllFilesP/pi.txt/better-32                                                 13303.16     13216.13      0.99x
BenchmarkDecoder_DecodeAllFilesP/pi.txt/best-32                                                   14622.76     14697.47      1.01x
BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/fastest-32                                           34134.48     36542.10      1.07x
BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/default-32                                           33589.32     34982.31      1.04x
BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/better-32                                            43754.89     44323.18      1.01x
BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/best-32                                              32422.22     33882.10      1.05x
BenchmarkDecoder_DecodeAllFilesP/sharnd.out/fastest-32                                            52706.00     52863.28      1.00x
BenchmarkDecoder_DecodeAllFilesP/sharnd.out/default-32                                            52527.76     52319.50      1.00x
BenchmarkDecoder_DecodeAllFilesP/sharnd.out/better-32                                             52177.25     52506.60      1.01x
BenchmarkDecoder_DecodeAllFilesP/sharnd.out/best-32                                               52443.28     52402.30      1.00x
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32                                               13992.47     14134.26      1.01x
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32                                           34107.95     33812.99      0.99x
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32                                            12012.34     12123.74      1.01x
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32                                              12630.22     13586.02      1.08x
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32                                            12327.02     12374.31      1.00x
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32                                             11932.73     12059.89      1.01x
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32                                                31233.38     36076.61      1.16x
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32                                          97435.31     100702.06     1.03x
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32                                          62247.22     61824.88      0.99x
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32                                                18659.58     18502.10      0.99x
BenchmarkDecoder_DecodeAllParallel/html.zst-32                                                    28464.78     28500.16      1.00x
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32                                           3114.03      3132.86       1.01x
BenchmarkDecoderSilesia/multithreaded-writer-32                                                   1099.69      1059.67       0.96x
BenchmarkDecoderSilesia/multithreaded-writer-himem-32                                             1093.10      1054.67       0.96x
BenchmarkDecoderSilesia/singlethreaded-writer-32                                                  803.85       819.16        1.02x
BenchmarkDecoderSilesia/singlethreaded-writerto-32                                                812.83       828.44        1.02x
BenchmarkDecoderSilesia/singlethreaded-himem-32                                                   813.14       824.41        1.01x
BenchmarkDecoderEnwik9/multithreaded-writer-32                                                    877.55       981.68        1.12x
BenchmarkDecoderEnwik9/multithreaded-writer-himem-32                                              961.20       1013.19       1.05x
BenchmarkDecoderEnwik9/singlethreaded-writer-32                                                   632.07       629.32        1.00x
BenchmarkDecoderEnwik9/singlethreaded-writerto-32                                                 634.62       635.76        1.00x
BenchmarkDecoderEnwik9/singlethreaded-himem-32                                                    763.68       755.70        0.99x
BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/multithreaded-writer-32           1626.86      1658.42       1.02x
BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/multithreaded-writer-himem-32     2299.80      2305.08       1.00x
BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-writer-32          1221.34      1207.19       0.99x
BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-writerto-32        1236.18      1224.88       0.99x
BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-himem-32           1749.21      1729.03       0.99x
BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/multithreaded-writer-32               839.51       922.30        1.10x
BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/multithreaded-writer-himem-32         1055.54      1093.19       1.04x
BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-writer-32              574.91       614.02        1.07x
BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-writerto-32            579.19       618.97        1.07x
BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-himem-32               780.67       863.05        1.11x 
```
  • Loading branch information
klauspost committed Jul 4, 2022
1 parent bf3f0fd commit b16a9af
Show file tree
Hide file tree
Showing 2 changed files with 728 additions and 608 deletions.
124 changes: 68 additions & 56 deletions zstd/_generate/gen.go
Expand Up @@ -1132,7 +1132,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
JZ(LabelRef("check_offset"))
// TODO: Investigate if it is possible to consistently overallocate literals.
if e.safeMem {
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
e.copyMemoryPrecise("1", c.literals, c.outBase, ll, 1)
} else {
e.copyMemoryND("1", c.literals, c.outBase, ll)
ADDQ(ll, c.literals)
Expand Down Expand Up @@ -1194,14 +1194,15 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
}
SUBQ(v, ptr) // ptr := &hist[len(hist) - v]
CMPQ(ml, v)
JGE(LabelRef("copy_all_from_history"))
JG(LabelRef("copy_all_from_history"))
/* if ml <= v {
copy(out[outPosition:], hist[start:start+seq.ml])
t += seq.ml
continue
}
*/
e.copyMemoryPrecise("4", ptr, c.outBase, ml)
// We know ml will be at least 3, since we didn't copy anything yet.
e.copyMemoryPrecise("4", ptr, c.outBase, ml, 3)
ADDQ(ml, c.outPosition)
// Note: for the current Go tests this branch is taken in 99.53% of cases,
// this is why we repeat a little code here.
Expand All @@ -1217,18 +1218,16 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
seq.ml -= v
}
*/
e.copyMemoryPrecise("5", ptr, c.outBase, v)
e.copyMemoryPrecise("5", ptr, c.outBase, v, 1)
ADDQ(v, c.outPosition)
SUBQ(v, ml)
// fallback to the next block
// ml cannot be 0, since we only jump here if ml > v.
// Copy rest from current block.
}

Comment("Copy match from the current buffer")
Label("copy_match")
{
TESTQ(ml, ml)
JZ(LabelRef("handle_loop"))

src := GP64()
MOVQ(c.outBase, src)
SUBQ(mo, src) // src = &s.out[t - mo]
Expand All @@ -1251,7 +1250,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
{
ADDQ(ml, c.outPosition)
if e.safeMem {
e.copyMemoryPrecise("2", src, c.outBase, ml)
e.copyMemoryPrecise("2", src, c.outBase, ml, 1)
} else {
dst := GP64()
MOVQ(c.outBase, dst)
Expand Down Expand Up @@ -1311,7 +1310,16 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
// copyMemoryPrecise will copy memory in blocks of 16 bytes,
// without overreading. It adds length to src and dst,
// preserving length.
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual, minLength int) {
assert(func(ok LabelRef) {
// if length >= minLength, ok
CMPQ(length, U8(minLength))
JAE(ok)
})
if minLength == 0 {
TESTQ(length, length)
JZ(LabelRef("copy_" + suffix + "_end"))
}
n := GP64()
MOVQ(length, n)
SUBQ(U8(16), n)
Expand Down Expand Up @@ -1346,53 +1354,57 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV
}

Label("copy_" + suffix + "_small")
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}

tmp := GP64()
XORQ(ofs, ofs)

Label("copy_" + suffix + "_byte")
TESTQ(U32(0x1), length)
JZ(LabelRef("copy_" + suffix + "_word"))

// copy one byte if length & 0x01 != 0
MOVB(s, tmp.As8())
MOVB(tmp.As8(), d)
ADDQ(U8(1), ofs)

Label("copy_" + suffix + "_word")
TESTQ(U32(0x2), length)
JZ(LabelRef("copy_" + suffix + "_dword"))

// copy two bytes if length & 0x02 != 0
MOVW(s, tmp.As16())
MOVW(tmp.As16(), d)
ADDQ(U8(2), ofs)

Label("copy_" + suffix + "_dword")
TESTQ(U32(0x4), length)
JZ(LabelRef("copy_" + suffix + "_qword"))

// copy four bytes if length & 0x04 != 0
MOVL(s, tmp.As32())
MOVL(tmp.As32(), d)
ADDQ(U8(4), ofs)

Label("copy_" + suffix + "_qword")
TESTQ(U32(0x8), length)
JZ(LabelRef("copy_" + suffix + "_add"))

// copy eight bytes if length & 0x08 != 0
MOVQ(s, tmp)
MOVQ(tmp, d)
ADDQ(U8(8), ofs)

Label("copy_" + suffix + "_add")
ADDQ(length, dst)
ADDQ(length, src)
{
name := "copy_" + suffix + "_"
end := LabelRef("copy_" + suffix + "_end")
CMPQ(length, U8(3))
JE(LabelRef(name + "move_3"))
if minLength < 3 {
JB(LabelRef(name + "move_1or2"))
}
CMPQ(length, U8(8))
JB(LabelRef(name + "move_4through7"))
JMP(LabelRef(name + "move_8through16"))
AX, CX := GP64(), GP64()

if minLength < 3 {
Label(name + "move_1or2")
MOVB(Mem{Base: src}, AX.As8())
MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
MOVB(AX.As8(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)
}

Label(name + "move_3")
MOVW(Mem{Base: src}, AX.As16())
MOVB(Mem{Base: src, Disp: 2}, CX.As8())
MOVW(AX.As16(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)

Label(name + "move_4through7")
MOVL(Mem{Base: src}, AX.As32())
MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
MOVL(AX.As32(), Mem{Base: dst})
MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)

Label(name + "move_8through16")
MOVQ(Mem{Base: src}, AX)
MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
MOVQ(AX, Mem{Base: dst})
MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)
}
Label("copy_" + suffix + "_end")
}

Expand Down

0 comments on commit b16a9af

Please sign in to comment.