From 069eb2b1ec89750779d80000426f40cb82464953 Mon Sep 17 00:00:00 2001 From: Kseniya Zaytseva Date: Fri, 7 Apr 2023 11:13:23 +0300 Subject: [PATCH] Pull request #35: Fix utest DSDOT fails on C910V and C910V_MANGOPI configurations Merge in PL/openblas from k.zaytseva/fix_dsdot to dev-riscv --- kernel/riscv64/KERNEL.C910V | 1 + kernel/riscv64/dsdot_vector.c | 141 ++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 kernel/riscv64/dsdot_vector.c diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa359..2798a870ed 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c new file mode 100644 index 0000000000..399515f430 --- /dev/null +++ b/kernel/riscv64/dsdot_vector.c @@ -0,0 +1,141 @@ +/*************************************************************************** +Copyright (C), 2023, KNS Group LLC (YADRO). + +All Rights Reserved. + + +This software contains the intellectual property of YADRO or is licensed to YADRO + +from third parties. Use of this software and the intellectual property contained + +therein is expressly limited to the terms and conditions of the License Agreement + +under which it is provided by YADRO. + +*****************************************************************************/ + +#include "common.h" + +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0, j=0; + double dot = 0.0 ; + + if ( n < 1 ) return(dot); + vfloat64m4_t vr; + vfloat32m2_t vx, vy; + unsigned int gvl = 0; + vfloat64m1_t v_res, v_z0; + gvl = vsetvlmax_e64m1(); + v_res = vfmv_v_f_f64m1(0, gvl); + v_z0 = vfmv_v_f_f64m1(0, gvl); + + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle_v_f32m2(&x[j], gvl); + vy = vle_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle_v_f32m2(&x[j], gvl); + vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +}