From a00e9db6ca6bd1bdaa0a5ca98382bbc5e3a866a7 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Tue, 14 Oct 2025 18:18:04 +0000 Subject: [PATCH 1/2] aarch64/erasure_code: SVE intrinsics implementation Replace the hand-written SVE assembly implementations of the n-vector dot product functions with optimized SVE intrinsics that provide better performance through 4x loop unrolling. Key improvements over the original SVE assembly: - 4x unrolled loops processing 64 bytes per iteration (vs a single vector) - Unified implementation supports 1-7 vector operations by using the compiler to generate each version. - The compiler also generates SVE2 versions of the same functions which make use of the EOR3 instruction. The implementation maintains the existing nibble-based Galois Field multiplication with 32-byte lookup tables while adding significant performance optimizations. reverts: aedcd375bad640fe91781df1a751ebca26c5f5f0 This change also reverts the above commit, which configured systems with an SVE width of 128 bits to use the NEON path instead. NEON was faster since it had more unrolling, but now the SVE implementation has the same level of unrolling, and the availability of SVE2 makes that path faster still for systems which support it. Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/Makefile.am | 12 +- erasure_code/aarch64/ec_aarch64_dispatcher.c | 52 +-- .../aarch64/ec_aarch64_highlevel_func.c | 97 ++++- erasure_code/aarch64/gf_2vect_dot_prod_sve.S | 168 --------- erasure_code/aarch64/gf_3vect_dot_prod_sve.S | 189 ---------- erasure_code/aarch64/gf_4vect_dot_prod_sve.S | 208 ---------- erasure_code/aarch64/gf_5vect_dot_prod_sve.S | 237 ------------ erasure_code/aarch64/gf_6vect_dot_prod_sve.S | 258 ------------- erasure_code/aarch64/gf_7vect_dot_prod_sve.S | 281 -------------- erasure_code/aarch64/gf_8vect_dot_prod_sve.S | 307 --------------- erasure_code/aarch64/gf_nvect_dot_prod_sve.c | 356 ++++++++++++++++++ erasure_code/aarch64/gf_vect_dot_prod_sve.S | 132 ------- 12 files changed, 469 insertions(+), 1828 deletions(-) delete mode 100644 erasure_code/aarch64/gf_2vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_3vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_4vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_5vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_6vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_7vect_dot_prod_sve.S delete mode 100644 erasure_code/aarch64/gf_8vect_dot_prod_sve.S create mode 100644 erasure_code/aarch64/gf_nvect_dot_prod_sve.c delete mode 100644 erasure_code/aarch64/gf_vect_dot_prod_sve.S diff --git a/erasure_code/aarch64/Makefile.am b/erasure_code/aarch64/Makefile.am index 47bbf12d..3d9a3644 100644 --- a/erasure_code/aarch64/Makefile.am +++ b/erasure_code/aarch64/Makefile.am @@ -48,13 +48,9 @@ lsrc_aarch64 += \ erasure_code/aarch64/gf_4vect_mad_sve.S \ erasure_code/aarch64/gf_5vect_mad_sve.S \ erasure_code/aarch64/gf_6vect_mad_sve.S \ - erasure_code/aarch64/gf_vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_2vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_3vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_4vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_5vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_6vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_7vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_8vect_dot_prod_sve.S \ erasure_code/aarch64/gf_vect_mul_sve.S \ + erasure_code/aarch64/gf_nvect_dot_prod_sve.c \ erasure_code/aarch64/ec_multibinary_arm.S + +# Ensure SVE intrinsics are compiled with maximum
optimization +erasure_code/aarch64/gf_nvect_dot_prod_sve.lo: CFLAGS += -O3 diff --git a/erasure_code/aarch64/ec_aarch64_dispatcher.c b/erasure_code/aarch64/ec_aarch64_dispatcher.c index 93896e66..f15f65bb 100644 --- a/erasure_code/aarch64/ec_aarch64_dispatcher.c +++ b/erasure_code/aarch64/ec_aarch64_dispatcher.c @@ -30,43 +30,18 @@ #include "erasure_code.h" #include "gf_vect_mul.h" -#ifdef __ARM_FEATURE_SVE -// If the compiler defines SVE intrinsics, include that header -#include - -#elif defined(__linux__) -// Otherwise include these headers and define these constants as a fallback for Linux only -#include +#ifdef __linux__ #include -#include -#ifndef PR_SVE_GET_VL -#define PR_SVE_GET_VL 51 -#endif -#ifndef PR_SVE_VL_LEN_MASK -#define PR_SVE_VL_LEN_MASK 0xffff -#endif - +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) #endif - -static inline size_t -get_sve_vector_length_bytes(void) -{ -#ifdef __ARM_FEATURE_SVE - // Use intrinsic if available at compile time - return svcntb(); -#elif defined(__linux__) - // Fall back to prctl on Linux - long sve_vl = prctl(PR_SVE_GET_VL); - if (sve_vl != -1) { - return sve_vl & PR_SVE_VL_LEN_MASK; - } #endif - return 0; // Unknown or unavailable -} extern void gf_vect_dot_prod_sve(int, int, unsigned char *, unsigned char **, unsigned char *); extern void +gf_vect_dot_prod_sve2(int, int, unsigned char *, unsigned char **, unsigned char *); +extern void gf_vect_dot_prod_neon(int, int, unsigned char *, unsigned char **, unsigned char *); extern void @@ -75,7 +50,9 @@ extern void gf_vect_mad_neon(int, int, int, unsigned char *, unsigned char *, unsigned char *); extern void -ec_encode_data_sve(int, int, int, unsigned char *, unsigned char **, unsigned char **coding); +ec_encode_data_sve(int, int, int, unsigned char *, unsigned char **, unsigned char **); +extern void +ec_encode_data_sve2(int, int, int, unsigned char *, unsigned char **, unsigned char **); extern void ec_encode_data_neon(int, int, int, unsigned char *, unsigned char **, unsigned char **); @@ -93,7 +70,10 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod) { #if defined(__linux__) unsigned long auxval = getauxval(AT_HWCAP); + unsigned long auxval2 = getauxval(AT_HWCAP2); + if ((auxval & HWCAP_SVE) && (auxval2 & HWCAP2_SVE2)) + return gf_vect_dot_prod_sve2; if (auxval & HWCAP_SVE) return gf_vect_dot_prod_sve; if (auxval & HWCAP_ASIMD) @@ -127,15 +107,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data) { #if defined(__linux__) unsigned long auxval = getauxval(AT_HWCAP); + unsigned long auxval2 = getauxval(AT_HWCAP2); + if ((auxval & HWCAP_SVE) && (auxval2 & HWCAP2_SVE2)) + return ec_encode_data_sve2; if (auxval & HWCAP_SVE) { - size_t vector_length = get_sve_vector_length_bytes(); - - // If 128-bit SVE (16 bytes), use NEON instead - if (vector_length == 16 && (auxval & HWCAP_ASIMD)) { - return ec_encode_data_neon; - } - return ec_encode_data_sve; } if (auxval & HWCAP_ASIMD) diff --git a/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/erasure_code/aarch64/ec_aarch64_highlevel_func.c index 51f8ff02..1a100c8e 100644 --- a/erasure_code/aarch64/ec_aarch64_highlevel_func.c +++ b/erasure_code/aarch64/ec_aarch64_highlevel_func.c @@ -161,12 +161,34 @@ gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char ** extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, unsigned char **dest); + +/* SVE2 */ extern void -gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, - unsigned char **dest); 
+gf_vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest); +extern void +gf_2vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_3vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_4vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_5vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_6vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_7vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); + extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char *dest); + extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char **dest); @@ -254,6 +276,77 @@ ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned cha } } +void +ec_encode_data_sve2(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 11) { + gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + + switch (rows) { + case 11: + /* 7 + 4 */ + gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 7 * k * 32; + coding += 7; + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 10: + /* 6 + 4 */ + gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 9: + /* 5 + 4 */ + gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 5 * k * 32; + coding += 5; + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 8: + /* 4 + 4 */ + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 7: + gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 6: + gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 5: + gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_sve2(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_sve2(len, k, g_tbls, data, *coding); + break; + default: + break; + } +} + void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls, unsigned char *data, unsigned char **coding) diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S deleted file mode 100644 index 99b5f15c..00000000 --- a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S +++ /dev/null @@ -1,168 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_2vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_2vect_dot_prod_sve, %function -#endif -/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_dest1 .req x10 -x_dest2 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_dest2 .req z27 - -cdecl(gf_2vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - -/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S deleted file mode 100644 index 8f6414ee..00000000 --- a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S +++ /dev/null @@ -1,189 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_3vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_3vect_dot_prod_sve, %function -#endif -/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_dest1 .req x11 -x_dest2 .req x12 -x_dest3 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_dest2 .req z27 -z_dest3 .req z28 - -cdecl(gf_3vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldr x_dest3, [x_dest, #8*2] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - prfb pldl2keep, p0, [x_tbl3] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S deleted file mode 100644 index eb354279..00000000 --- a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S +++ /dev/null @@ -1,208 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_4vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_4vect_dot_prod_sve, %function -#endif -/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_dest1 .req x12 -x_dest2 .req x13 -x_dest3 .req x14 -x_dest4 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 - -cdecl(gf_4vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S deleted file mode 100644 index bb7cd018..00000000 --- a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S +++ /dev/null @@ -1,237 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_5vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_5vect_dot_prod_sve, %function -#endif -/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_dest1 .req x13 -x_dest2 .req x14 -x_dest4 .req x15 -x_dest5 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest3 .req x19 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 - -cdecl(gf_5vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #16 /* alignment */ - str x19, [sp] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldr x_dest5, [x_dest, #8*4] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - prfb pldl2keep, p0, [x_tbl5] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_dest5.d, z_gft5_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x19, [sp] - add sp, sp, #16 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S deleted file mode 100644 index acc98953..00000000 --- a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S +++ /dev/null @@ -1,258 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_6vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_6vect_dot_prod_sve, %function -#endif -/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_dest1 .req x14 -x_dest2 .req x15 -x_dest6 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest3 .req x19 -x_dest4 .req x20 -x_dest5 .req x21 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_6vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #32 /* alignment */ - stp x19, x20, [sp] - str x21, [sp, #16] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */ - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - -/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_dest5.d, z_gft5_hi.d - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_dest6.d, z_gft6_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x21, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #32 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S deleted file mode 100644 index 0f74873d..00000000 --- a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S +++ /dev/null @@ -1,281 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_7vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_7vect_dot_prod_sve, %function -#endif -/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_tbl7 .req x14 - -x_dest1 .req x15 - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest2 .req x19 -x_dest3 .req x20 -x_dest4 .req x21 -x_dest5 .req x22 -x_dest6 .req x23 -x_dest7 .req x_dest /* reused */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -z_gft7_lo .req z6 -z_gft7_hi .req z7 -q_gft7_lo .req q6 -q_gft7_hi .req q7 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_dest7 .req z16 - -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_7vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #48 /* alignment */ - stp x19, x20, [sp] - stp x21, x22, [sp, #16] - str x23, [sp, #32] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] - ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */ - -/* 
Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - mov z_dest7.b, #0 /* clear z_dest7 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_gft1_hi.d, z_dest1.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_gft2_hi.d, z_dest2.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_gft3_hi.d, z_dest3.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_gft4_hi.d, z_dest4.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_gft5_hi.d, z_dest5.d - - ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 - prfb pldl2keep, p0, [x_tbl7] - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_gft6_hi.d, z_dest6.d - - /* dest 7 */ - tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b - tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b - eor z_dest7.d, z_gft7_lo.d, z_dest7.d - eor z_dest7.d, z_gft7_hi.d, z_dest7.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - st1b z_dest7.b, p0, [x_dest7, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x23, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #48 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S deleted file mode 100644 index 20768f48..00000000 --- a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S +++ /dev/null @@ -1,307 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_8vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_8vect_dot_prod_sve, %function -#endif -/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_tbl7 .req x14 - -x_dest1 .req x15 - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest2 .req x19 -x_dest3 .req x20 -x_dest4 .req x21 -x_dest5 .req x22 -x_dest6 .req x23 -x_dest7 .req x24 -x_dest8 .req x_dest /* reused */ -x_tbl8 .req x25 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -z_gft7_lo .req z6 -z_gft7_hi .req z7 -q_gft7_lo .req q6 -q_gft7_hi .req q7 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_dest7 .req z8 - -z_gft8_lo .req z9 -z_gft8_hi .req z10 -q_gft8_lo .req q9 -q_gft8_hi .req q10 - -z_dest8 .req z16 - -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_8vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #80 /* alignment */ - stp x19, x20, [sp] - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - str d10, [sp, #56] - str x25, [sp, #64] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] - ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */ - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. 
*/ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - mov z_dest7.b, #0 /* clear z_dest7 */ - mov z_dest8.b, #0 /* clear z_dest8 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ - add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_gft1_hi.d, z_dest1.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_gft2_hi.d, z_dest2.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_gft3_hi.d, z_dest3.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_gft4_hi.d, z_dest4.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_gft5_hi.d, z_dest5.d - - ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 - ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32 - prfb pldl2keep, p0, [x_tbl7] - prfb pldl2keep, p0, [x_tbl8] - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_gft6_hi.d, z_dest6.d - - /* dest 7 */ - tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b - tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b - eor z_dest7.d, z_gft7_lo.d, z_dest7.d - eor z_dest7.d, z_gft7_hi.d, z_dest7.d - - /* dest 8 */ - tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b - tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b - eor z_dest8.d, z_gft8_lo.d, 
z_dest8.d - eor z_dest8.d, z_gft8_hi.d, z_dest8.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - st1b z_dest7.b, p0, [x_dest7, x_pos] - st1b z_dest8.b, p0, [x_dest8, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x25, [sp, #64] - ldr d10, [sp, #56] - ldp d8, d9, [sp, #48] - ldp x23, x24, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #80 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_nvect_dot_prod_sve.c b/erasure_code/aarch64/gf_nvect_dot_prod_sve.c new file mode 100644 index 00000000..ebf31399 --- /dev/null +++ b/erasure_code/aarch64/gf_nvect_dot_prod_sve.c @@ -0,0 +1,356 @@ +/************************************************************** + Copyright 2025 Amazon.com, Inc. or its affiliates. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Amazon.com, Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include + +// This implementation of the n-vector dot product functions uses several optimization techniques: +// +// 1. Instead of a separate assembly implementation for each n-vect function, a single +// implementation in C can be optimized by the compiler to produce all of the versions. +// This is accomplished with a static inline function holding the main implementation and +// non-static (i.e. exported) wrapper functions that hard-code the nvect argument. The compiler +// will inline the main function into each exported function and discard unused portions +// of the code. +// +// 2. SVE data types cannot be used in arrays since their sizes are not known at compile time. Instead +// split them out into separate variables and use switch-case blocks to do what we +// would normally do with a simple loop over an array. 
This also ensures that the +// compiler does not use loops in the output. +// +// 3. Additional loop unrolling: in addition to unrolling to the vector width, we also +// unroll 4x more and process 4x the vector width in each iteration of the loop. +// +// 4. A second version of each function is built with +sve2. SVE2 introduces the EOR3 +// instruction which allows consolidation of some of the XOR operations. The compiler +// can do this automatically in optimization so a separate implementation isn't required. +// We simply allow the compiler to generate SVE2 versions as well. + +__attribute__((target("+sve"), always_inline)) static inline void +gf_nvect_dot_prod_sve_unrolled(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest, int nvect) +{ + if (len < 16) + return; + + const svuint8_t mask0f = svdup_u8(0x0f); + const svbool_t predicate_true = svptrue_b8(); + int sve_len = svcntb(); + int pos = 0; + + // 4x unrolled main loop - SVE predicates handle ALL remaining data automatically + while (pos < len) { + // Create predicates for 4 batches - SVE masks beyond array bounds + svbool_t predicate_0 = svwhilelt_b8_s32(pos + sve_len * 0, len); + svbool_t predicate_1 = svwhilelt_b8_s32(pos + sve_len * 1, len); + svbool_t predicate_2 = svwhilelt_b8_s32(pos + sve_len * 2, len); + svbool_t predicate_3 = svwhilelt_b8_s32(pos + sve_len * 3, len); + + // Exit if no active lanes in first predicate + if (!svptest_any(predicate_true, predicate_0)) + break; + + // Initialize destination accumulators - use individual variables + svuint8_t dest_acc0_0, dest_acc0_1, dest_acc0_2, dest_acc0_3, dest_acc0_4, + dest_acc0_5, dest_acc0_6; + svuint8_t dest_acc1_0, dest_acc1_1, dest_acc1_2, dest_acc1_3, dest_acc1_4, + dest_acc1_5, dest_acc1_6; + svuint8_t dest_acc2_0, dest_acc2_1, dest_acc2_2, dest_acc2_3, dest_acc2_4, + dest_acc2_5, dest_acc2_6; + svuint8_t dest_acc3_0, dest_acc3_1, dest_acc3_2, dest_acc3_3, dest_acc3_4, + dest_acc3_5, dest_acc3_6; + + // Initialize based on nvect + switch (nvect) { + case 7: + dest_acc0_6 = dest_acc1_6 = dest_acc2_6 = dest_acc3_6 = + svdup_u8(0); // fallthrough + case 6: + dest_acc0_5 = dest_acc1_5 = dest_acc2_5 = dest_acc3_5 = + svdup_u8(0); // fallthrough + case 5: + dest_acc0_4 = dest_acc1_4 = dest_acc2_4 = dest_acc3_4 = + svdup_u8(0); // fallthrough + case 4: + dest_acc0_3 = dest_acc1_3 = dest_acc2_3 = dest_acc3_3 = + svdup_u8(0); // fallthrough + case 3: + dest_acc0_2 = dest_acc1_2 = dest_acc2_2 = dest_acc3_2 = + svdup_u8(0); // fallthrough + case 2: + dest_acc0_1 = dest_acc1_1 = dest_acc2_1 = dest_acc3_1 = + svdup_u8(0); // fallthrough + case 1: + dest_acc0_0 = dest_acc1_0 = dest_acc2_0 = dest_acc3_0 = svdup_u8(0); + break; + } + + // Process all source vectors + for (int v = 0; v < vlen; v++) { + // Load 4 batches of source data + svuint8_t src_data0 = svld1_u8(predicate_0, &src[v][pos + sve_len * 0]); + svuint8_t src_data1 = svld1_u8(predicate_1, &src[v][pos + sve_len * 1]); + svuint8_t src_data2 = svld1_u8(predicate_2, &src[v][pos + sve_len * 2]); + svuint8_t src_data3 = svld1_u8(predicate_3, &src[v][pos + sve_len * 3]); + + // Extract nibbles for all batches + svuint8_t src_lo0 = svand_x(predicate_0, src_data0, mask0f); + svuint8_t src_hi0 = svlsr_x(predicate_0, src_data0, 4); + svuint8_t src_lo1 = svand_x(predicate_1, src_data1, mask0f); + svuint8_t src_hi1 = svlsr_x(predicate_1, src_data1, 4); + svuint8_t src_lo2 = svand_x(predicate_2, src_data2, mask0f); + svuint8_t src_hi2 = svlsr_x(predicate_2, src_data2, 4); + svuint8_t src_lo3 = 
svand_x(predicate_3, src_data3, mask0f); + svuint8_t src_hi3 = svlsr_x(predicate_3, src_data3, 4); + + // Process each destination with unrolled batches + for (int d = 0; d < nvect; d++) { + unsigned char *tbl_base = &gftbls[d * vlen * 32 + v * 32]; + svuint8_t tbl_lo = svld1_u8(predicate_true, tbl_base); + svuint8_t tbl_hi = svld1_u8(predicate_true, tbl_base + 16); + + // Batch 0 + svuint8_t gf_lo0 = svtbl_u8(tbl_lo, src_lo0); + svuint8_t gf_hi0 = svtbl_u8(tbl_hi, src_hi0); + + // Batch 1 + svuint8_t gf_lo1 = svtbl_u8(tbl_lo, src_lo1); + svuint8_t gf_hi1 = svtbl_u8(tbl_hi, src_hi1); + + // Batch 2 + svuint8_t gf_lo2 = svtbl_u8(tbl_lo, src_lo2); + svuint8_t gf_hi2 = svtbl_u8(tbl_hi, src_hi2); + + // Batch 3 + svuint8_t gf_lo3 = svtbl_u8(tbl_lo, src_lo3); + svuint8_t gf_hi3 = svtbl_u8(tbl_hi, src_hi3); + + svuint8_t gf_result0 = sveor_x(predicate_0, gf_lo0, gf_hi0); + svuint8_t gf_result1 = sveor_x(predicate_1, gf_lo1, gf_hi1); + svuint8_t gf_result2 = sveor_x(predicate_2, gf_lo2, gf_hi2); + svuint8_t gf_result3 = sveor_x(predicate_3, gf_lo3, gf_hi3); + + // Accumulate results + switch (d) { + case 0: + dest_acc0_0 = sveor_x(predicate_0, dest_acc0_0, gf_result0); + dest_acc1_0 = sveor_x(predicate_1, dest_acc1_0, gf_result1); + dest_acc2_0 = sveor_x(predicate_2, dest_acc2_0, gf_result2); + dest_acc3_0 = sveor_x(predicate_3, dest_acc3_0, gf_result3); + break; + case 1: + dest_acc0_1 = sveor_x(predicate_0, dest_acc0_1, gf_result0); + dest_acc1_1 = sveor_x(predicate_1, dest_acc1_1, gf_result1); + dest_acc2_1 = sveor_x(predicate_2, dest_acc2_1, gf_result2); + dest_acc3_1 = sveor_x(predicate_3, dest_acc3_1, gf_result3); + break; + case 2: + dest_acc0_2 = sveor_x(predicate_0, dest_acc0_2, gf_result0); + dest_acc1_2 = sveor_x(predicate_1, dest_acc1_2, gf_result1); + dest_acc2_2 = sveor_x(predicate_2, dest_acc2_2, gf_result2); + dest_acc3_2 = sveor_x(predicate_3, dest_acc3_2, gf_result3); + break; + case 3: + dest_acc0_3 = sveor_x(predicate_0, dest_acc0_3, gf_result0); + dest_acc1_3 = sveor_x(predicate_1, dest_acc1_3, gf_result1); + dest_acc2_3 = sveor_x(predicate_2, dest_acc2_3, gf_result2); + dest_acc3_3 = sveor_x(predicate_3, dest_acc3_3, gf_result3); + break; + case 4: + dest_acc0_4 = sveor_x(predicate_0, dest_acc0_4, gf_result0); + dest_acc1_4 = sveor_x(predicate_1, dest_acc1_4, gf_result1); + dest_acc2_4 = sveor_x(predicate_2, dest_acc2_4, gf_result2); + dest_acc3_4 = sveor_x(predicate_3, dest_acc3_4, gf_result3); + break; + case 5: + dest_acc0_5 = sveor_x(predicate_0, dest_acc0_5, gf_result0); + dest_acc1_5 = sveor_x(predicate_1, dest_acc1_5, gf_result1); + dest_acc2_5 = sveor_x(predicate_2, dest_acc2_5, gf_result2); + dest_acc3_5 = sveor_x(predicate_3, dest_acc3_5, gf_result3); + break; + case 6: + dest_acc0_6 = sveor_x(predicate_0, dest_acc0_6, gf_result0); + dest_acc1_6 = sveor_x(predicate_1, dest_acc1_6, gf_result1); + dest_acc2_6 = sveor_x(predicate_2, dest_acc2_6, gf_result2); + dest_acc3_6 = sveor_x(predicate_3, dest_acc3_6, gf_result3); + break; + } + } + } + + // Store results for all batches + switch (nvect) { + case 7: + svst1_u8(predicate_0, &dest[6][pos + sve_len * 0], dest_acc0_6); + svst1_u8(predicate_1, &dest[6][pos + sve_len * 1], dest_acc1_6); + svst1_u8(predicate_2, &dest[6][pos + sve_len * 2], dest_acc2_6); + svst1_u8(predicate_3, &dest[6][pos + sve_len * 3], dest_acc3_6); + // fallthrough + case 6: + svst1_u8(predicate_0, &dest[5][pos + sve_len * 0], dest_acc0_5); + svst1_u8(predicate_1, &dest[5][pos + sve_len * 1], dest_acc1_5); + svst1_u8(predicate_2, &dest[5][pos + 
sve_len * 2], dest_acc2_5); + svst1_u8(predicate_3, &dest[5][pos + sve_len * 3], dest_acc3_5); + // fallthrough + case 5: + svst1_u8(predicate_0, &dest[4][pos + sve_len * 0], dest_acc0_4); + svst1_u8(predicate_1, &dest[4][pos + sve_len * 1], dest_acc1_4); + svst1_u8(predicate_2, &dest[4][pos + sve_len * 2], dest_acc2_4); + svst1_u8(predicate_3, &dest[4][pos + sve_len * 3], dest_acc3_4); + // fallthrough + case 4: + svst1_u8(predicate_0, &dest[3][pos + sve_len * 0], dest_acc0_3); + svst1_u8(predicate_1, &dest[3][pos + sve_len * 1], dest_acc1_3); + svst1_u8(predicate_2, &dest[3][pos + sve_len * 2], dest_acc2_3); + svst1_u8(predicate_3, &dest[3][pos + sve_len * 3], dest_acc3_3); + // fallthrough + case 3: + svst1_u8(predicate_0, &dest[2][pos + sve_len * 0], dest_acc0_2); + svst1_u8(predicate_1, &dest[2][pos + sve_len * 1], dest_acc1_2); + svst1_u8(predicate_2, &dest[2][pos + sve_len * 2], dest_acc2_2); + svst1_u8(predicate_3, &dest[2][pos + sve_len * 3], dest_acc3_2); + // fallthrough + case 2: + svst1_u8(predicate_0, &dest[1][pos + sve_len * 0], dest_acc0_1); + svst1_u8(predicate_1, &dest[1][pos + sve_len * 1], dest_acc1_1); + svst1_u8(predicate_2, &dest[1][pos + sve_len * 2], dest_acc2_1); + svst1_u8(predicate_3, &dest[1][pos + sve_len * 3], dest_acc3_1); + // fallthrough + case 1: + svst1_u8(predicate_0, &dest[0][pos + sve_len * 0], dest_acc0_0); + svst1_u8(predicate_1, &dest[0][pos + sve_len * 1], dest_acc1_0); + svst1_u8(predicate_2, &dest[0][pos + sve_len * 2], dest_acc2_0); + svst1_u8(predicate_3, &dest[0][pos + sve_len * 3], dest_acc3_0); + break; + } + + pos += sve_len * 4; + } +} + +// Optimized wrapper functions +__attribute__((target("+sve"))) void +gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest) +{ + unsigned char *dest_array[1] = { dest }; + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest_array, 1); +} + +__attribute__((target("+sve"))) void +gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 2); +} + +__attribute__((target("+sve"))) void +gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 3); +} + +__attribute__((target("+sve"))) void +gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 4); +} + +__attribute__((target("+sve"))) void +gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 5); +} + +__attribute__((target("+sve"))) void +gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 6); +} + +__attribute__((target("+sve"))) void +gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 7); +} + +// SVE2 wrapper functions - compiler will optimize eor to eor3 automatically +__attribute__((target("+sve+sve2"))) void +gf_vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest) +{ + unsigned char *dest_array[1] = { dest }; + 
gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest_array, 1); +} + +__attribute__((target("+sve+sve2"))) void +gf_2vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 2); +} + +__attribute__((target("+sve+sve2"))) void +gf_3vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 3); +} + +__attribute__((target("+sve+sve2"))) void +gf_4vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 4); +} + +__attribute__((target("+sve+sve2"))) void +gf_5vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 5); +} + +__attribute__((target("+sve+sve2"))) void +gf_6vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 6); +} + +__attribute__((target("+sve+sve2"))) void +gf_7vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 7); +} diff --git a/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/erasure_code/aarch64/gf_vect_dot_prod_sve.S deleted file mode 100644 index 48ce151f..00000000 --- a/erasure_code/aarch64/gf_vect_dot_prod_sve.S +++ /dev/null @@ -1,132 +0,0 @@ -/************************************************************** - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_vect_dot_prod_sve, %function -#endif -/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char *dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest1 .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 -x_tbl1 .req x8 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -cdecl(gf_vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov z_dest.b, #0 /* clear z_dest */ - mov x_vec_i, #0 /* clear x_vec_i */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - - /* load gf_table */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32 - for each src vect */ - - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - - /* exclusive or, ie. gf(2^8) add */ - eor z_dest.d, z_gft1_lo.d, z_dest.d - eor z_dest.d, z_gft1_hi.d, z_dest.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects - - /* end of Loop 2 */ - /* store dest data, governed by p0 */ - st1b z_dest.b, p0, [x_dest1, x_pos] - /* increment one vector length */ - incb x_pos - - b .Lloopsve_vl - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret From b01e834f18a00e3b1f8f1cff8dbc45fa213c65ce Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Tue, 14 Oct 2025 18:18:13 +0000 Subject: [PATCH 2/2] aarch64: Optimize SVE encode functions to use peak-performance vector combinations Update both ec_encode_data_sve() and ec_encode_data_sve2() to use optimal 4 and 5 vector combinations based on benchmark results showing these achieve the highest performance. Key optimizations: - Loop over 4-vector operations when rows > 7 (peak performance) - Use 4+3 combination for 7 vectors instead of single 7-vector call - Use 4+2 combination for 6 vectors instead of single 6-vector call - Keep 5-vector for 5 vectors (second-best performance) - Applies to both SVE and SVE2 variants for consistent optimization This leverages the benchmark findings that 4 and 5 vector operations achieve 40+ GB/s performance, significantly better than 6-7 vector operations which drop to 30-36 GB/s. 
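To make the splitting strategy concrete, the control flow after this change reduces to the sketch below (illustration only, condensed from the ec_encode_data_sve() hunk that follows; the SVE2 variant is identical except that it calls the *_sve2 functions):

    while (rows > 7) {
            /* peel off 4-row blocks, the peak-performance case */
            gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
            g_tbls += 4 * k * 32;   /* each output row consumes k 32-byte GF tables */
            coding += 4;
            rows -= 4;
    }
    switch (rows) {
    case 7: /* 4 + 3 rather than one 7-vector call */
            gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
            g_tbls += 4 * k * 32;
            coding += 4;
            gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
            break;
    case 6: /* 4 + 2 rather than one 6-vector call */
            gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
            g_tbls += 4 * k * 32;
            coding += 4;
            gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
            break;
    default: /* 1-5 remaining rows map directly onto the matching n-vect call */
            break;
    }
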
Signed-off-by: Jonathan Swinney --- .../aarch64/ec_aarch64_highlevel_func.c | 92 ++++++------------- 1 file changed, 26 insertions(+), 66 deletions(-) diff --git a/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/erasure_code/aarch64/ec_aarch64_highlevel_func.c index 1a100c8e..882ed1f3 100644 --- a/erasure_code/aarch64/ec_aarch64_highlevel_func.c +++ b/erasure_code/aarch64/ec_aarch64_highlevel_func.c @@ -214,47 +214,27 @@ ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned cha return; } - while (rows > 11) { - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; - rows -= 6; + while (rows > 7) { + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + rows -= 4; } switch (rows) { - case 11: - /* 7 + 4 */ - gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 7 * k * 32; - coding += 7; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); - break; - case 10: - /* 6 + 4 */ - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); - break; - case 9: - /* 5 + 4 */ - gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 5 * k * 32; - coding += 5; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); - break; - case 8: - /* 4 + 4 */ + case 7: + /* 4 + 3 */ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); g_tbls += 4 * k * 32; coding += 4; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); - break; - case 7: - gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); break; case 6: - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + /* 4 + 2 */ + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); break; case 5: gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); @@ -285,47 +265,27 @@ ec_encode_data_sve2(int len, int k, int rows, unsigned char *g_tbls, unsigned ch return; } - while (rows > 11) { - gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; - rows -= 6; + while (rows > 7) { + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + rows -= 4; } switch (rows) { - case 11: - /* 7 + 4 */ - gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding); - g_tbls += 7 * k * 32; - coding += 7; - gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); - break; - case 10: - /* 6 + 4 */ - gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; - gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); - break; - case 9: - /* 5 + 4 */ - gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding); - g_tbls += 5 * k * 32; - coding += 5; - gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); - break; - case 8: - /* 4 + 4 */ + case 7: + /* 4 + 3 */ gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); g_tbls += 4 * k * 32; coding += 4; - gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); - break; - case 7: - gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding); + gf_3vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 6: - gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding); + /* 4 + 2 */ + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_2vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 5: gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding);