diff --git a/erasure_code/aarch64/Makefile.am b/erasure_code/aarch64/Makefile.am index 47bbf12d..3d9a3644 100644 --- a/erasure_code/aarch64/Makefile.am +++ b/erasure_code/aarch64/Makefile.am @@ -48,13 +48,9 @@ lsrc_aarch64 += \ erasure_code/aarch64/gf_4vect_mad_sve.S \ erasure_code/aarch64/gf_5vect_mad_sve.S \ erasure_code/aarch64/gf_6vect_mad_sve.S \ - erasure_code/aarch64/gf_vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_2vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_3vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_4vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_5vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_6vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_7vect_dot_prod_sve.S \ - erasure_code/aarch64/gf_8vect_dot_prod_sve.S \ erasure_code/aarch64/gf_vect_mul_sve.S \ + erasure_code/aarch64/gf_nvect_dot_prod_sve.c \ erasure_code/aarch64/ec_multibinary_arm.S + +# Ensure SVE intrinsics are compiled with maximum optimization +erasure_code/aarch64/gf_nvect_dot_prod_sve.lo: CFLAGS += -O3 diff --git a/erasure_code/aarch64/ec_aarch64_dispatcher.c b/erasure_code/aarch64/ec_aarch64_dispatcher.c index 93896e66..f15f65bb 100644 --- a/erasure_code/aarch64/ec_aarch64_dispatcher.c +++ b/erasure_code/aarch64/ec_aarch64_dispatcher.c @@ -30,43 +30,18 @@ #include "erasure_code.h" #include "gf_vect_mul.h" -#ifdef __ARM_FEATURE_SVE -// If the compiler defines SVE intrinsics, include that header -#include <arm_sve.h> - -#elif defined(__linux__) -// Otherwise include these headers and define these constants as a fallback for Linux only -#include +#ifdef __linux__ #include <sys/auxv.h> -#include -#ifndef PR_SVE_GET_VL -#define PR_SVE_GET_VL 51 -#endif -#ifndef PR_SVE_VL_LEN_MASK -#define PR_SVE_VL_LEN_MASK 0xffff -#endif - +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) #endif - -static inline size_t -get_sve_vector_length_bytes(void) -{ -#ifdef __ARM_FEATURE_SVE - // Use intrinsic if available at compile time - return svcntb(); -#elif defined(__linux__) - // Fall back to prctl on Linux - long sve_vl = prctl(PR_SVE_GET_VL); - if (sve_vl != -1) { - return sve_vl & PR_SVE_VL_LEN_MASK; - } #endif - return 0; // Unknown or unavailable -} extern void gf_vect_dot_prod_sve(int, int, unsigned char *, unsigned char **, unsigned char *); extern void +gf_vect_dot_prod_sve2(int, int, unsigned char *, unsigned char **, unsigned char *); +extern void gf_vect_dot_prod_neon(int, int, unsigned char *, unsigned char **, unsigned char *); extern void @@ -75,7 +50,9 @@ extern void gf_vect_mad_neon(int, int, int, unsigned char *, unsigned char *, unsigned char *); extern void -ec_encode_data_sve(int, int, int, unsigned char *, unsigned char **, unsigned char **coding); +ec_encode_data_sve(int, int, int, unsigned char *, unsigned char **, unsigned char **); +extern void +ec_encode_data_sve2(int, int, int, unsigned char *, unsigned char **, unsigned char **); extern void ec_encode_data_neon(int, int, int, unsigned char *, unsigned char **, unsigned char **); @@ -93,7 +70,10 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod) { #if defined(__linux__) unsigned long auxval = getauxval(AT_HWCAP); + unsigned long auxval2 = getauxval(AT_HWCAP2); + if ((auxval & HWCAP_SVE) && (auxval2 & HWCAP2_SVE2)) + return gf_vect_dot_prod_sve2; if (auxval & HWCAP_SVE) return gf_vect_dot_prod_sve; if (auxval & HWCAP_ASIMD) @@ -127,15 +107,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data) { #if defined(__linux__) unsigned long auxval = getauxval(AT_HWCAP); + unsigned long auxval2 = getauxval(AT_HWCAP2); + if ((auxval & HWCAP_SVE) && (auxval2 & HWCAP2_SVE2)) + return 
ec_encode_data_sve2; if (auxval & HWCAP_SVE) { - size_t vector_length = get_sve_vector_length_bytes(); - - // If 128-bit SVE (16 bytes), use NEON instead - if (vector_length == 16 && (auxval & HWCAP_ASIMD)) { - return ec_encode_data_neon; - } - return ec_encode_data_sve; } if (auxval & HWCAP_ASIMD) diff --git a/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/erasure_code/aarch64/ec_aarch64_highlevel_func.c index 51f8ff02..882ed1f3 100644 --- a/erasure_code/aarch64/ec_aarch64_highlevel_func.c +++ b/erasure_code/aarch64/ec_aarch64_highlevel_func.c @@ -161,12 +161,34 @@ gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char ** extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, unsigned char **dest); + +/* SVE2 */ extern void -gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, - unsigned char **dest); +gf_vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest); +extern void +gf_2vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_3vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_4vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_5vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_6vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); +extern void +gf_7vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest); + extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char *dest); + extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, unsigned char **dest); @@ -192,62 +214,93 @@ ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned cha return; } - while (rows > 11) { - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; - rows -= 6; + while (rows > 7) { + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + rows -= 4; } switch (rows) { - case 11: - /* 7 + 4 */ - gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 7 * k * 32; - coding += 7; + case 7: + /* 4 + 3 */ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); break; - case 10: - /* 6 + 4 */ - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 6 * k * 32; - coding += 6; + case 6: + /* 4 + 2 */ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); break; - case 9: - /* 5 + 4 */ + case 5: gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); - g_tbls += 5 * k * 32; - coding += 5; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); break; - case 8: - /* 4 + 4 */ + case 4: gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding); + break; + default: + break; + } +} + +void 
+ec_encode_data_sve2(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 7) { + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); g_tbls += 4 * k * 32; coding += 4; - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); - break; + rows -= 4; + } + + switch (rows) { case 7: - gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + /* 4 + 3 */ + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_3vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 6: - gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + /* 4 + 2 */ + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_2vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 5: - gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); + gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 4: - gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 3: - gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); + gf_3vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 2: - gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); + gf_2vect_dot_prod_sve2(len, k, g_tbls, data, coding); break; case 1: - gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding); + gf_vect_dot_prod_sve2(len, k, g_tbls, data, *coding); break; default: break; diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S deleted file mode 100644 index 99b5f15c..00000000 --- a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S +++ /dev/null @@ -1,168 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_2vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_2vect_dot_prod_sve, %function -#endif -/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_dest1 .req x10 -x_dest2 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_dest2 .req z27 - -cdecl(gf_2vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S deleted file mode 100644 index 8f6414ee..00000000 --- a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S +++ /dev/null @@ -1,189 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_3vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_3vect_dot_prod_sve, %function -#endif -/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_dest1 .req x11 -x_dest2 .req x12 -x_dest3 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_dest2 .req z27 -z_dest3 .req z28 - -cdecl(gf_3vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldr x_dest3, [x_dest, #8*2] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - prfb pldl2keep, p0, [x_tbl3] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S deleted file mode 100644 index eb354279..00000000 --- a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S +++ /dev/null @@ -1,208 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_4vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_4vect_dot_prod_sve, %function -#endif -/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_dest1 .req x12 -x_dest2 .req x13 -x_dest3 .req x14 -x_dest4 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 - -cdecl(gf_4vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S deleted file mode 100644 index bb7cd018..00000000 --- a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S +++ /dev/null @@ -1,237 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_5vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_5vect_dot_prod_sve, %function -#endif -/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_dest1 .req x13 -x_dest2 .req x14 -x_dest4 .req x15 -x_dest5 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest3 .req x19 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 - -cdecl(gf_5vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #16 /* alignment */ - str x19, [sp] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldr x_dest5, [x_dest, #8*4] - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - prfb pldl2keep, p0, [x_tbl5] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_dest5.d, z_gft5_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x19, [sp] - add sp, sp, #16 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S deleted file mode 100644 index acc98953..00000000 --- a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S +++ /dev/null @@ -1,258 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_6vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_6vect_dot_prod_sve, %function -#endif -/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_dest1 .req x14 -x_dest2 .req x15 -x_dest6 .req x_dest /* reused */ - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest3 .req x19 -x_dest4 .req x20 -x_dest5 .req x21 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_6vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #32 /* alignment */ - stp x19, x20, [sp] - str x21, [sp, #16] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */ - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - -/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_dest1.d, z_gft1_hi.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_dest2.d, z_gft2_hi.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_dest3.d, z_gft3_hi.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_dest4.d, z_gft4_hi.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_dest5.d, z_gft5_hi.d - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_dest6.d, z_gft6_hi.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x21, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #32 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S deleted file mode 100644 index 0f74873d..00000000 --- a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S +++ /dev/null @@ -1,281 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_7vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_7vect_dot_prod_sve, %function -#endif -/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_tbl7 .req x14 - -x_dest1 .req x15 - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest2 .req x19 -x_dest3 .req x20 -x_dest4 .req x21 -x_dest5 .req x22 -x_dest6 .req x23 -x_dest7 .req x_dest /* reused */ - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -z_gft7_lo .req z6 -z_gft7_hi .req z7 -q_gft7_lo .req q6 -q_gft7_hi .req q7 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_dest7 .req z16 - -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_7vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #48 /* alignment */ - stp x19, x20, [sp] - stp x21, x22, [sp, #16] - str x23, [sp, #32] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] - ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */ - -/* 
Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - mov z_dest7.b, #0 /* clear z_dest7 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. 
gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_gft1_hi.d, z_dest1.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_gft2_hi.d, z_dest2.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_gft3_hi.d, z_dest3.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_gft4_hi.d, z_dest4.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_gft5_hi.d, z_dest5.d - - ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 - prfb pldl2keep, p0, [x_tbl7] - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_gft6_hi.d, z_dest6.d - - /* dest 7 */ - tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b - tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b - eor z_dest7.d, z_gft7_lo.d, z_dest7.d - eor z_dest7.d, z_gft7_hi.d, z_dest7.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - st1b z_dest7.b, p0, [x_dest7, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x23, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #48 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S deleted file mode 100644 index 20768f48..00000000 --- a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S +++ /dev/null @@ -1,307 +0,0 @@ -/************************************************************* - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_8vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_8vect_dot_prod_sve, %function -#endif -/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char **dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 - -x_tbl1 .req x8 -x_tbl2 .req x9 -x_tbl3 .req x10 -x_tbl4 .req x11 -x_tbl5 .req x12 -x_tbl6 .req x13 -x_tbl7 .req x14 - -x_dest1 .req x15 - -/* r16,r17,r18,r29,r30: special role registers, avoided */ -/* r19..r29 and SP must be preserved */ -x_dest2 .req x19 -x_dest3 .req x20 -x_dest4 .req x21 -x_dest5 .req x22 -x_dest6 .req x23 -x_dest7 .req x24 -x_dest8 .req x_dest /* reused */ -x_tbl8 .req x25 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest1 .req z3 -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -z_gft7_lo .req z6 -z_gft7_hi .req z7 -q_gft7_lo .req q6 -q_gft7_hi .req q7 - -/* bottom 64-bit of v8..v15 must be preserved if used */ -z_dest7 .req z8 - -z_gft8_lo .req z9 -z_gft8_hi .req z10 -q_gft8_lo .req q9 -q_gft8_hi .req q10 - -z_dest8 .req z16 - -z_gft2_lo .req z17 -z_gft2_hi .req z18 -q_gft2_lo .req q17 -q_gft2_hi .req q18 - -z_gft3_lo .req z19 -z_gft3_hi .req z20 -q_gft3_lo .req q19 -q_gft3_hi .req q20 - -z_gft4_lo .req z21 -z_gft4_hi .req z22 -q_gft4_lo .req q21 -q_gft4_hi .req q22 - -z_gft5_lo .req z23 -z_gft5_hi .req z24 -q_gft5_lo .req q23 -q_gft5_hi .req q24 - -z_gft6_lo .req z25 -z_gft6_hi .req z26 -q_gft6_lo .req q25 -q_gft6_hi .req q26 - -z_dest2 .req z27 -z_dest3 .req z28 -z_dest4 .req z29 -z_dest5 .req z30 -z_dest6 .req z31 - -cdecl(gf_8vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - /* save r19..r29 */ - sub sp, sp, #80 /* alignment */ - stp x19, x20, [sp] - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - str d10, [sp, #56] - str x25, [sp, #64] - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - ldp x_dest1, x_dest2, [x_dest, #8*0] - ldp x_dest3, x_dest4, [x_dest, #8*2] - ldp x_dest5, x_dest6, [x_dest, #8*4] - ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */ - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov x_vec_i, #0 /* clear x_vec_i */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. 
*/ - - mov z_dest1.b, #0 /* clear z_dest1 */ - mov z_dest2.b, #0 /* clear z_dest2 */ - mov z_dest3.b, #0 /* clear z_dest3 */ - mov z_dest4.b, #0 /* clear z_dest4 */ - mov z_dest5.b, #0 /* clear z_dest5 */ - mov z_dest6.b, #0 /* clear z_dest6 */ - mov z_dest7.b, #0 /* clear z_dest7 */ - mov z_dest8.b, #0 /* clear z_dest8 */ - - /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ - add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ - add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ - add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ - add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ - add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ - add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ - /* load gf_table's */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - - /* prefetch */ - prfb pldl2keep, p0, [x_tbl1] - prfb pldl2keep, p0, [x_tbl2] - - /* calc for next and prefetch */ - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - - /* dest 1 */ - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - /* exclusive or, ie. gf(2^8) add */ - eor z_dest1.d, z_gft1_lo.d, z_dest1.d - eor z_dest1.d, z_gft1_hi.d, z_dest1.d - - ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - prfb pldl2keep, p0, [x_tbl3] - prfb pldl2keep, p0, [x_tbl4] - - /* dest 2 */ - tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b - tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b - eor z_dest2.d, z_gft2_lo.d, z_dest2.d - eor z_dest2.d, z_gft2_hi.d, z_dest2.d - - /* dest 3 */ - tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b - tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b - eor z_dest3.d, z_gft3_lo.d, z_dest3.d - eor z_dest3.d, z_gft3_hi.d, z_dest3.d - - ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 - ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 - prfb pldl2keep, p0, [x_tbl5] - prfb pldl2keep, p0, [x_tbl6] - - /* dest 4 */ - tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b - tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b - eor z_dest4.d, z_gft4_lo.d, z_dest4.d - eor z_dest4.d, z_gft4_hi.d, z_dest4.d - - /* dest 5 */ - tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b - tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b - eor z_dest5.d, z_gft5_lo.d, z_dest5.d - eor z_dest5.d, z_gft5_hi.d, z_dest5.d - - ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 - ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32 - prfb pldl2keep, p0, [x_tbl7] - prfb pldl2keep, p0, [x_tbl8] - - /* dest 6 */ - tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b - tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b - eor z_dest6.d, z_gft6_lo.d, z_dest6.d - eor z_dest6.d, z_gft6_hi.d, z_dest6.d - - /* dest 7 */ - tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b - tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b - eor z_dest7.d, z_gft7_lo.d, z_dest7.d - eor z_dest7.d, z_gft7_hi.d, z_dest7.d - - /* dest 8 */ - tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b - tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b - eor z_dest8.d, z_gft8_lo.d, 
z_dest8.d - eor z_dest8.d, z_gft8_hi.d, z_dest8.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects -/* end of Loop 2 */ - - /* store dest data, governed by p0 */ - st1b z_dest1.b, p0, [x_dest1, x_pos] - st1b z_dest2.b, p0, [x_dest2, x_pos] - st1b z_dest3.b, p0, [x_dest3, x_pos] - st1b z_dest4.b, p0, [x_dest4, x_pos] - st1b z_dest5.b, p0, [x_dest5, x_pos] - st1b z_dest6.b, p0, [x_dest6, x_pos] - st1b z_dest7.b, p0, [x_dest7, x_pos] - st1b z_dest8.b, p0, [x_dest8, x_pos] - - /* increment one vector length */ - incb x_pos - b .Lloopsve_vl -/* end of Loop 1 */ - -.return_pass: - /* restore r19..r29 */ - ldr x25, [sp, #64] - ldr d10, [sp, #56] - ldp d8, d9, [sp, #48] - ldp x23, x24, [sp, #32] - ldp x21, x22, [sp, #16] - ldp x19, x20, [sp] - add sp, sp, #80 - - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret diff --git a/erasure_code/aarch64/gf_nvect_dot_prod_sve.c b/erasure_code/aarch64/gf_nvect_dot_prod_sve.c new file mode 100644 index 00000000..ebf31399 --- /dev/null +++ b/erasure_code/aarch64/gf_nvect_dot_prod_sve.c @@ -0,0 +1,356 @@ +/************************************************************** + Copyright 2025 Amazon.com, Inc. or its affiliates. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Amazon.com, Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include +#include + +// This implementation of the nvect_dot_prod uses several techniques for optimization: +// +// 1. Instead of a separate assembly implementation for each n-vect function, a single +// implementation in C can be optimized by the compiler to produce all of the versions. +// This is accomplished with a static function with the main implementation and +// non-static (i.e. exported) functions with the nvect argument hard coded. The compiler +// will inline the main function into each exported function and discard unused portions +// of the code. +// +// 2. SVE data types cannot be used in arrays since their sizes are not known. Instead +// split them out into separate variables and use switch-case blocks to do what we +// would normally do with a simple loop over an array. 
This also ensures that the +// compiler does not use loops in the output. +// +// 3. Additional loop unrolling: in addition to unrolling to the vector width, we also +// unroll 4x more and process 4x the vector width in each iteration of the loop. +// +// 4. A second version of each function is built with +sve2. SVE2 introduces the EOR3 +// instruction which allows consolidation of some of the XOR operations. The compiler +// can do this automatically in optimization so a separate implementation isn't required. +// We simply allow the compiler to generate SVE2 versions as well. + +__attribute__((target("+sve"), always_inline)) static inline void +gf_nvect_dot_prod_sve_unrolled(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest, int nvect) +{ + if (len < 16) + return; + + const svuint8_t mask0f = svdup_u8(0x0f); + const svbool_t predicate_true = svptrue_b8(); + int sve_len = svcntb(); + int pos = 0; + + // 4x unrolled main loop - SVE predicates handle ALL remaining data automatically + while (pos < len) { + // Create predicates for 4 batches - SVE masks beyond array bounds + svbool_t predicate_0 = svwhilelt_b8_s32(pos + sve_len * 0, len); + svbool_t predicate_1 = svwhilelt_b8_s32(pos + sve_len * 1, len); + svbool_t predicate_2 = svwhilelt_b8_s32(pos + sve_len * 2, len); + svbool_t predicate_3 = svwhilelt_b8_s32(pos + sve_len * 3, len); + + // Exit if no active lanes in first predicate + if (!svptest_any(predicate_true, predicate_0)) + break; + + // Initialize destination accumulators - use individual variables + svuint8_t dest_acc0_0, dest_acc0_1, dest_acc0_2, dest_acc0_3, dest_acc0_4, + dest_acc0_5, dest_acc0_6; + svuint8_t dest_acc1_0, dest_acc1_1, dest_acc1_2, dest_acc1_3, dest_acc1_4, + dest_acc1_5, dest_acc1_6; + svuint8_t dest_acc2_0, dest_acc2_1, dest_acc2_2, dest_acc2_3, dest_acc2_4, + dest_acc2_5, dest_acc2_6; + svuint8_t dest_acc3_0, dest_acc3_1, dest_acc3_2, dest_acc3_3, dest_acc3_4, + dest_acc3_5, dest_acc3_6; + + // Initialize based on nvect + switch (nvect) { + case 7: + dest_acc0_6 = dest_acc1_6 = dest_acc2_6 = dest_acc3_6 = + svdup_u8(0); // fallthrough + case 6: + dest_acc0_5 = dest_acc1_5 = dest_acc2_5 = dest_acc3_5 = + svdup_u8(0); // fallthrough + case 5: + dest_acc0_4 = dest_acc1_4 = dest_acc2_4 = dest_acc3_4 = + svdup_u8(0); // fallthrough + case 4: + dest_acc0_3 = dest_acc1_3 = dest_acc2_3 = dest_acc3_3 = + svdup_u8(0); // fallthrough + case 3: + dest_acc0_2 = dest_acc1_2 = dest_acc2_2 = dest_acc3_2 = + svdup_u8(0); // fallthrough + case 2: + dest_acc0_1 = dest_acc1_1 = dest_acc2_1 = dest_acc3_1 = + svdup_u8(0); // fallthrough + case 1: + dest_acc0_0 = dest_acc1_0 = dest_acc2_0 = dest_acc3_0 = svdup_u8(0); + break; + } + + // Process all source vectors + for (int v = 0; v < vlen; v++) { + // Load 4 batches of source data + svuint8_t src_data0 = svld1_u8(predicate_0, &src[v][pos + sve_len * 0]); + svuint8_t src_data1 = svld1_u8(predicate_1, &src[v][pos + sve_len * 1]); + svuint8_t src_data2 = svld1_u8(predicate_2, &src[v][pos + sve_len * 2]); + svuint8_t src_data3 = svld1_u8(predicate_3, &src[v][pos + sve_len * 3]); + + // Extract nibbles for all batches + svuint8_t src_lo0 = svand_x(predicate_0, src_data0, mask0f); + svuint8_t src_hi0 = svlsr_x(predicate_0, src_data0, 4); + svuint8_t src_lo1 = svand_x(predicate_1, src_data1, mask0f); + svuint8_t src_hi1 = svlsr_x(predicate_1, src_data1, 4); + svuint8_t src_lo2 = svand_x(predicate_2, src_data2, mask0f); + svuint8_t src_hi2 = svlsr_x(predicate_2, src_data2, 4); + svuint8_t src_lo3 = 
svand_x(predicate_3, src_data3, mask0f); + svuint8_t src_hi3 = svlsr_x(predicate_3, src_data3, 4); + + // Process each destination with unrolled batches + for (int d = 0; d < nvect; d++) { + unsigned char *tbl_base = &gftbls[d * vlen * 32 + v * 32]; + svuint8_t tbl_lo = svld1_u8(predicate_true, tbl_base); + svuint8_t tbl_hi = svld1_u8(predicate_true, tbl_base + 16); + + // Batch 0 + svuint8_t gf_lo0 = svtbl_u8(tbl_lo, src_lo0); + svuint8_t gf_hi0 = svtbl_u8(tbl_hi, src_hi0); + + // Batch 1 + svuint8_t gf_lo1 = svtbl_u8(tbl_lo, src_lo1); + svuint8_t gf_hi1 = svtbl_u8(tbl_hi, src_hi1); + + // Batch 2 + svuint8_t gf_lo2 = svtbl_u8(tbl_lo, src_lo2); + svuint8_t gf_hi2 = svtbl_u8(tbl_hi, src_hi2); + + // Batch 3 + svuint8_t gf_lo3 = svtbl_u8(tbl_lo, src_lo3); + svuint8_t gf_hi3 = svtbl_u8(tbl_hi, src_hi3); + + svuint8_t gf_result0 = sveor_x(predicate_0, gf_lo0, gf_hi0); + svuint8_t gf_result1 = sveor_x(predicate_1, gf_lo1, gf_hi1); + svuint8_t gf_result2 = sveor_x(predicate_2, gf_lo2, gf_hi2); + svuint8_t gf_result3 = sveor_x(predicate_3, gf_lo3, gf_hi3); + + // Accumulate results + switch (d) { + case 0: + dest_acc0_0 = sveor_x(predicate_0, dest_acc0_0, gf_result0); + dest_acc1_0 = sveor_x(predicate_1, dest_acc1_0, gf_result1); + dest_acc2_0 = sveor_x(predicate_2, dest_acc2_0, gf_result2); + dest_acc3_0 = sveor_x(predicate_3, dest_acc3_0, gf_result3); + break; + case 1: + dest_acc0_1 = sveor_x(predicate_0, dest_acc0_1, gf_result0); + dest_acc1_1 = sveor_x(predicate_1, dest_acc1_1, gf_result1); + dest_acc2_1 = sveor_x(predicate_2, dest_acc2_1, gf_result2); + dest_acc3_1 = sveor_x(predicate_3, dest_acc3_1, gf_result3); + break; + case 2: + dest_acc0_2 = sveor_x(predicate_0, dest_acc0_2, gf_result0); + dest_acc1_2 = sveor_x(predicate_1, dest_acc1_2, gf_result1); + dest_acc2_2 = sveor_x(predicate_2, dest_acc2_2, gf_result2); + dest_acc3_2 = sveor_x(predicate_3, dest_acc3_2, gf_result3); + break; + case 3: + dest_acc0_3 = sveor_x(predicate_0, dest_acc0_3, gf_result0); + dest_acc1_3 = sveor_x(predicate_1, dest_acc1_3, gf_result1); + dest_acc2_3 = sveor_x(predicate_2, dest_acc2_3, gf_result2); + dest_acc3_3 = sveor_x(predicate_3, dest_acc3_3, gf_result3); + break; + case 4: + dest_acc0_4 = sveor_x(predicate_0, dest_acc0_4, gf_result0); + dest_acc1_4 = sveor_x(predicate_1, dest_acc1_4, gf_result1); + dest_acc2_4 = sveor_x(predicate_2, dest_acc2_4, gf_result2); + dest_acc3_4 = sveor_x(predicate_3, dest_acc3_4, gf_result3); + break; + case 5: + dest_acc0_5 = sveor_x(predicate_0, dest_acc0_5, gf_result0); + dest_acc1_5 = sveor_x(predicate_1, dest_acc1_5, gf_result1); + dest_acc2_5 = sveor_x(predicate_2, dest_acc2_5, gf_result2); + dest_acc3_5 = sveor_x(predicate_3, dest_acc3_5, gf_result3); + break; + case 6: + dest_acc0_6 = sveor_x(predicate_0, dest_acc0_6, gf_result0); + dest_acc1_6 = sveor_x(predicate_1, dest_acc1_6, gf_result1); + dest_acc2_6 = sveor_x(predicate_2, dest_acc2_6, gf_result2); + dest_acc3_6 = sveor_x(predicate_3, dest_acc3_6, gf_result3); + break; + } + } + } + + // Store results for all batches + switch (nvect) { + case 7: + svst1_u8(predicate_0, &dest[6][pos + sve_len * 0], dest_acc0_6); + svst1_u8(predicate_1, &dest[6][pos + sve_len * 1], dest_acc1_6); + svst1_u8(predicate_2, &dest[6][pos + sve_len * 2], dest_acc2_6); + svst1_u8(predicate_3, &dest[6][pos + sve_len * 3], dest_acc3_6); + // fallthrough + case 6: + svst1_u8(predicate_0, &dest[5][pos + sve_len * 0], dest_acc0_5); + svst1_u8(predicate_1, &dest[5][pos + sve_len * 1], dest_acc1_5); + svst1_u8(predicate_2, &dest[5][pos + 
sve_len * 2], dest_acc2_5); + svst1_u8(predicate_3, &dest[5][pos + sve_len * 3], dest_acc3_5); + // fallthrough + case 5: + svst1_u8(predicate_0, &dest[4][pos + sve_len * 0], dest_acc0_4); + svst1_u8(predicate_1, &dest[4][pos + sve_len * 1], dest_acc1_4); + svst1_u8(predicate_2, &dest[4][pos + sve_len * 2], dest_acc2_4); + svst1_u8(predicate_3, &dest[4][pos + sve_len * 3], dest_acc3_4); + // fallthrough + case 4: + svst1_u8(predicate_0, &dest[3][pos + sve_len * 0], dest_acc0_3); + svst1_u8(predicate_1, &dest[3][pos + sve_len * 1], dest_acc1_3); + svst1_u8(predicate_2, &dest[3][pos + sve_len * 2], dest_acc2_3); + svst1_u8(predicate_3, &dest[3][pos + sve_len * 3], dest_acc3_3); + // fallthrough + case 3: + svst1_u8(predicate_0, &dest[2][pos + sve_len * 0], dest_acc0_2); + svst1_u8(predicate_1, &dest[2][pos + sve_len * 1], dest_acc1_2); + svst1_u8(predicate_2, &dest[2][pos + sve_len * 2], dest_acc2_2); + svst1_u8(predicate_3, &dest[2][pos + sve_len * 3], dest_acc3_2); + // fallthrough + case 2: + svst1_u8(predicate_0, &dest[1][pos + sve_len * 0], dest_acc0_1); + svst1_u8(predicate_1, &dest[1][pos + sve_len * 1], dest_acc1_1); + svst1_u8(predicate_2, &dest[1][pos + sve_len * 2], dest_acc2_1); + svst1_u8(predicate_3, &dest[1][pos + sve_len * 3], dest_acc3_1); + // fallthrough + case 1: + svst1_u8(predicate_0, &dest[0][pos + sve_len * 0], dest_acc0_0); + svst1_u8(predicate_1, &dest[0][pos + sve_len * 1], dest_acc1_0); + svst1_u8(predicate_2, &dest[0][pos + sve_len * 2], dest_acc2_0); + svst1_u8(predicate_3, &dest[0][pos + sve_len * 3], dest_acc3_0); + break; + } + + pos += sve_len * 4; + } +} + +// Optimized wrapper functions +__attribute__((target("+sve"))) void +gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest) +{ + unsigned char *dest_array[1] = { dest }; + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest_array, 1); +} + +__attribute__((target("+sve"))) void +gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 2); +} + +__attribute__((target("+sve"))) void +gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 3); +} + +__attribute__((target("+sve"))) void +gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 4); +} + +__attribute__((target("+sve"))) void +gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 5); +} + +__attribute__((target("+sve"))) void +gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 6); +} + +__attribute__((target("+sve"))) void +gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 7); +} + +// SVE2 wrapper functions - compiler will optimize eor to eor3 automatically +__attribute__((target("+sve+sve2"))) void +gf_vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char *dest) +{ + unsigned char *dest_array[1] = { dest }; + 
gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest_array, 1); +} + +__attribute__((target("+sve+sve2"))) void +gf_2vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 2); +} + +__attribute__((target("+sve+sve2"))) void +gf_3vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 3); +} + +__attribute__((target("+sve+sve2"))) void +gf_4vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 4); +} + +__attribute__((target("+sve+sve2"))) void +gf_5vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 5); +} + +__attribute__((target("+sve+sve2"))) void +gf_6vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 6); +} + +__attribute__((target("+sve+sve2"))) void +gf_7vect_dot_prod_sve2(int len, int vlen, unsigned char *gftbls, unsigned char **src, + unsigned char **dest) +{ + gf_nvect_dot_prod_sve_unrolled(len, vlen, gftbls, src, dest, 7); +} diff --git a/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/erasure_code/aarch64/gf_vect_dot_prod_sve.S deleted file mode 100644 index 48ce151f..00000000 --- a/erasure_code/aarch64/gf_vect_dot_prod_sve.S +++ /dev/null @@ -1,132 +0,0 @@ -/************************************************************** - Copyright (c) 2021 Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Huawei Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************/ -.text -.align 6 -.arch armv8-a+sve - -#include "../include/aarch64_label.h" - -.global cdecl(gf_vect_dot_prod_sve) -#ifndef __APPLE__ -.type gf_vect_dot_prod_sve, %function -#endif -/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, - unsigned char **src, unsigned char *dest); - */ - -/* arguments */ -x_len .req x0 /* vector length */ -x_vec .req x1 /* number of source vectors (ie. data blocks) */ -x_tbl .req x2 -x_src .req x3 -x_dest1 .req x4 - -/* returns */ -w_ret .req w0 - -/* local variables */ -x_vec_i .req x5 -x_ptr .req x6 -x_pos .req x7 -x_tbl1 .req x8 - -/* vectors */ -z_mask0f .req z0 - -z_src .req z1 -z_src_lo .req z2 -z_src_hi .req z_src - -z_dest .req z3 - -z_gft1_lo .req z4 -z_gft1_hi .req z5 -q_gft1_lo .req q4 -q_gft1_hi .req q5 - -cdecl(gf_vect_dot_prod_sve): - /* less than 16 bytes, return_fail */ - cmp x_len, #16 - blt .return_fail - - mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ - mov x_pos, #0 - lsl x_vec, x_vec, #3 - -/* Loop 1: x_len, vector length */ -.Lloopsve_vl: - whilelo p0.b, x_pos, x_len - b.none .return_pass - - mov z_dest.b, #0 /* clear z_dest */ - mov x_vec_i, #0 /* clear x_vec_i */ - mov x_tbl1, x_tbl /* reset x_tbl1 */ - -/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ -.Lloopsve_vl_vects: - ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ - /* load src data, governed by p0 */ - ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ - - add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ - - /* load gf_table */ - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32 - for each src vect */ - - /* split 4-bit lo; 4-bit hi */ - and z_src_lo.d, z_src.d, z_mask0f.d - lsr z_src_hi.b, z_src.b, #4 - - /* table indexing, ie. gf(2^8) multiplication */ - tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b - tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b - - /* exclusive or, ie. gf(2^8) add */ - eor z_dest.d, z_gft1_lo.d, z_dest.d - eor z_dest.d, z_gft1_hi.d, z_dest.d - - cmp x_vec_i, x_vec - blt .Lloopsve_vl_vects - - /* end of Loop 2 */ - /* store dest data, governed by p0 */ - st1b z_dest.b, p0, [x_dest1, x_pos] - /* increment one vector length */ - incb x_pos - - b .Lloopsve_vl - -.return_pass: - mov w_ret, #0 - ret - -.return_fail: - mov w_ret, #1 - ret
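
The new gf_nvect_dot_prod_sve_unrolled() kernel, like the SVE assembly it replaces, computes a GF(2^8) dot product per output byte using two 16-entry nibble tables per (dest, src) pair: gftbls + d*vlen*32 + v*32 holds the low-nibble products in its first 16 bytes and the high-nibble products in the next 16, and GF addition is XOR. A minimal scalar sketch of that operation, for reference only (the helper name is made up and is not part of the patch):

    /* Illustrative scalar reference for the operation implemented above.
     * Assumes the table layout visible in gf_nvect_dot_prod_sve_unrolled():
     * 32 bytes per (dest d, src v) pair, low-nibble table then high-nibble table. */
    static void
    gf_nvect_dot_prod_ref(int len, int vlen, unsigned char *gftbls,
                          unsigned char **src, unsigned char **dest, int nvect)
    {
            for (int d = 0; d < nvect; d++) {
                    for (int i = 0; i < len; i++) {
                            unsigned char acc = 0;
                            for (int v = 0; v < vlen; v++) {
                                    unsigned char *tbl = &gftbls[d * vlen * 32 + v * 32];
                                    unsigned char s = src[v][i];
                                    /* GF(2^8) multiply via two 16-entry lookups; add via XOR */
                                    acc ^= tbl[s & 0x0f] ^ tbl[16 + (s >> 4)];
                            }
                            dest[d][i] = acc;
                    }
            }
    }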
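
Technique (1) in the comment block relies on the compiler constant-folding the hard-coded nvect argument through the always_inline core, so each exported wrapper keeps only its own switch arms and accumulators. A toy sketch of the same pattern, with hypothetical names and no SVE dependence, assuming any optimizing C compiler:

    /* One always_inline core, thin exported wrappers with the count hard-coded;
     * the compiler specializes the core per wrapper and drops unused paths. */
    static inline __attribute__((always_inline)) int
    sum_first_n(const int *v, int n)
    {
            int s = 0;
            for (int i = 0; i < n; i++)
                    s += v[i];
            return s;
    }

    int sum_first_2(const int *v) { return sum_first_n(v, 2); }
    int sum_first_3(const int *v) { return sum_first_n(v, 3); }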
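
The "SVE predicates handle ALL remaining data automatically" comment refers to the standard whilelt idiom: the predicate produced on the final pass deactivates the lanes past len, so partial vectors need no scalar tail loop. A minimal sketch of the idiom in isolation (copy_bytes_sve is a hypothetical helper, not part of the patch):

    #include <arm_sve.h>

    /* Sketch of the whilelt tail-handling pattern used in the kernel above. */
    __attribute__((target("+sve"))) static void
    copy_bytes_sve(unsigned char *dst, const unsigned char *src, int n)
    {
            int step = svcntb();                           /* bytes per SVE vector */
            for (int i = 0; i < n; i += step) {
                    svbool_t pg = svwhilelt_b8_s32(i, n);  /* partial on last pass */
                    svuint8_t v = svld1_u8(pg, src + i);   /* inactive lanes not loaded */
                    svst1_u8(pg, dst + i, v);              /* inactive lanes not stored */
            }
    }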
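
Technique (4) leaves the XOR fusion to the compiler: when the accumulation acc ^ gf_lo ^ gf_hi is compiled under target("+sve+sve2"), it is expected to collapse into a single EOR3. If one wanted to request that explicitly, the SVE2 ACLE exposes an sveor3 intrinsic; the sketch below is illustrative only and assumes a toolchain that accepts SVE2 intrinsics under a function target attribute, as the wrappers above already do for plain SVE:

    #include <arm_sve.h>

    /* What the SVE2 builds of the wrappers are expected to emit for the
     * accumulation step; the patch itself keeps the plain sveor_x() form. */
    __attribute__((target("+sve+sve2"))) static inline svuint8_t
    gf_accumulate_sve2(svuint8_t acc, svuint8_t gf_lo, svuint8_t gf_hi)
    {
            /* acc ^ gf_lo ^ gf_hi in one EOR3 instruction */
            return sveor3_u8(acc, gf_lo, gf_hi);
    }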