Skip to content

Commit

Permalink
EncodeIdx: Use advanced AVX2/GFNI (#248)
Browse files Browse the repository at this point in the history
* EncodeIdx: Use advanced AVX2/GFNI

Speeds up EncodeIdx when encoding to many outputs.

The example in #247 runs twice as fast.

Fixes #247, #245
  • Loading branch information
klauspost committed Apr 6, 2023
1 parent fd3e691 commit c28d6bb
Showing 1 changed file with 40 additions and 18 deletions.
58 changes: 40 additions & 18 deletions reedsolomon.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,11 +477,17 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {

// Calculate what we want per round
r.o.perRound = cpuid.CPU.Cache.L2
if r.o.perRound < 128<<10 {
r.o.perRound = 128 << 10
}

divide := parityShards + 1
if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
// Base on L1 cache if we have many inputs.
r.o.perRound = cpuid.CPU.Cache.L1D
if r.o.perRound < 32<<10 {
r.o.perRound = 32 << 10
}
divide = 0
if dataShards > maxAvx2Inputs {
divide += maxAvx2Inputs
Expand All @@ -495,11 +501,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
}
}

if r.o.perRound <= 0 {
// Set to 128K if undetectable.
r.o.perRound = 128 << 10
}

if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores {
// If multiple threads per core, make sure they don't contend for cache.
r.o.perRound /= cpuid.CPU.ThreadsPerCore
Expand All @@ -510,6 +511,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
// Align to 64 bytes.
r.o.perRound = ((r.o.perRound + 63) / 64) * 64

// Final sanity check...
if r.o.perRound < 1<<10 {
r.o.perRound = 1 << 10
}

if r.o.minSplitSize <= 0 {
// Set minsplit as high as we can, but still have parity in L1.
cacheSize := cpuid.CPU.Cache.L1D
Expand Down Expand Up @@ -646,6 +652,19 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
return ErrShardSize
}

if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) {
m := make([][]byte, r.parityShards)
for iRow := range m {
m[iRow] = r.parity[iRow][idx : idx+1]
}
if r.o.useGFNI {
r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
} else {
r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
}
return nil
}

// Process using no goroutines for now.
start, end := 0, r.o.perRound
if end > len(dataShard) {
Expand Down Expand Up @@ -909,16 +928,16 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
defer r.putTmpSlice(avx2Matrix)
} else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
// It appears there is a switchover point at around 10MB where
// Regular processing is faster...
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
return
} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
// It appears there is a switchover point at around 10MB where
// Regular processing is faster...
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true)
return
}

Expand Down Expand Up @@ -982,7 +1001,8 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte

// Perform the same as codeSomeShards, but split the workload into
// several goroutines.
func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) {
// If clear is set, the first write will overwrite the output.
func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
var wg sync.WaitGroup
gor := r.o.maxGoroutines

Expand Down Expand Up @@ -1022,7 +1042,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
first: inIdx == 0 && clear,
})
outIdx += len(outPer)
outs = outs[len(outPer):]
Expand Down Expand Up @@ -1054,7 +1074,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
first: inIdx == 0 && clear,
})
inIdx += len(inPer)
ins = ins[len(inPer):]
Expand All @@ -1070,6 +1090,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
}

exec := func(start, stop int) {
defer wg.Done()
lstart, lstop := start, start+r.o.perRound
if lstop > stop {
lstop = stop
Expand Down Expand Up @@ -1097,7 +1118,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
for c := range inputs {
in := inputs[c][lstart:lstop]
for iRow := 0; iRow < len(outputs); iRow++ {
if c == 0 {
if c == 0 && clear {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
Expand All @@ -1110,7 +1131,6 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
lstop = stop
}
}
wg.Done()
}
if gor == 1 {
wg.Add(1)
Expand All @@ -1135,7 +1155,8 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b

// Perform the same as codeSomeShards, but split the workload into
// several goroutines.
func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int) {
// If clear is set, the first write will overwrite the output.
func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
var wg sync.WaitGroup
gor := r.o.maxGoroutines

Expand Down Expand Up @@ -1171,7 +1192,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
first: inIdx == 0 && clear,
})
outIdx += len(outPer)
outs = outs[len(outPer):]
Expand Down Expand Up @@ -1202,7 +1223,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
first: inIdx == 0 && clear,
})
inIdx += len(inPer)
ins = ins[len(inPer):]
Expand All @@ -1218,6 +1239,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
}

exec := func(start, stop int) {
defer wg.Done()
lstart, lstop := start, start+r.o.perRound
if lstop > stop {
lstop = stop
Expand Down Expand Up @@ -1245,7 +1267,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
for c := range inputs {
in := inputs[c][lstart:lstop]
for iRow := 0; iRow < len(outputs); iRow++ {
if c == 0 {
if c == 0 && clear {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
Expand All @@ -1258,8 +1280,8 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
lstop = stop
}
}
wg.Done()
}

if gor == 1 {
wg.Add(1)
exec(0, byteCount)
Expand Down

0 comments on commit c28d6bb

Please sign in to comment.