Skip to content

Commit

Permalink
Pull request OpenMathLib#35: Fix utest DSDOT fails on C910V and C910V…
Browse files Browse the repository at this point in the history
…_MANGOPI configurations

Merge in PL/openblas from k.zaytseva/fix_dsdot to dev-riscv
  • Loading branch information
Kseniya Zaytseva authored and kseniyazaytseva committed Dec 19, 2023
1 parent 1332f8a commit 069eb2b
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 0 deletions.
1 change: 1 addition & 0 deletions kernel/riscv64/KERNEL.C910V
Expand Up @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c
DDOTKERNEL = dot_vector.c
CDOTKERNEL = zdot_vector.c
ZDOTKERNEL = zdot_vector.c
DSDOTKERNEL = dsdot_vector.c

SNRM2KERNEL = nrm2_vector.c
DNRM2KERNEL = nrm2_vector.c
Expand Down
141 changes: 141 additions & 0 deletions kernel/riscv64/dsdot_vector.c
@@ -0,0 +1,141 @@
/***************************************************************************
Copyright (C), 2023, KNS Group LLC (YADRO).
All Rights Reserved.
This software contains the intellectual property of YADRO or is licensed to YADRO
from third parties. Use of this software and the intellectual property contained
therein is expressly limited to the terms and conditions of the License Agreement
under which it is provided by YADRO.
*****************************************************************************/

#include "common.h"

double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0, j=0;
double dot = 0.0 ;

if ( n < 1 ) return(dot);
vfloat64m4_t vr;
vfloat32m2_t vx, vy;
unsigned int gvl = 0;
vfloat64m1_t v_res, v_z0;
gvl = vsetvlmax_e64m1();
v_res = vfmv_v_f_f64m1(0, gvl);
v_z0 = vfmv_v_f_f64m1(0, gvl);

if(inc_x == 1 && inc_y == 1){
gvl = vsetvl_e64m4(n);
vr = vfmv_v_f_f64m4(0, gvl);
for(i=0,j=0; i<n/gvl; i++){
vx = vle_v_f32m2(&x[j], gvl);
vy = vle_v_f32m2(&y[j], gvl);
vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
j += gvl;
}
if(j > 0){
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);
}
//tail
if(j < n){
gvl = vsetvl_e64m4(n-j);
vx = vle_v_f32m2(&x[j], gvl);
vy = vle_v_f32m2(&y[j], gvl);
vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
//vr = vfdot_vv_f32m2(vx, vy, gvl);
vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);
}
}else if(inc_y == 1){
gvl = vsetvl_e64m4(n);
vr = vfmv_v_f_f64m4(0, gvl);
int stride_x = inc_x * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl);
vy = vle_v_f32m2(&y[j], gvl);
vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
j += gvl;
}
if(j > 0){
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
//tail
if(j < n){
gvl = vsetvl_e64m4(n-j);
vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl);
vy = vle_v_f32m2(&y[j], gvl);
vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
//vr = vfdot_vv_f32m2(vx, vy, gvl);
vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
}else if(inc_x == 1){
gvl = vsetvl_e64m4(n);
vr = vfmv_v_f_f64m4(0, gvl);
int stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = vle_v_f32m2(&x[j], gvl);
vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl);
vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
j += gvl;
}
if(j > 0){
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
//tail
if(j < n){
gvl = vsetvl_e64m4(n-j);
vx = vle_v_f32m2(&x[j], gvl);
vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl);
vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
//vr = vfdot_vv_f32m2(vx, vy, gvl);
vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
}else{
gvl = vsetvl_e64m4(n);
vr = vfmv_v_f_f64m4(0, gvl);
int stride_x = inc_x * sizeof(FLOAT);
int stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl);
vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl);
vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
j += gvl;
}
if(j > 0){
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
//tail
if(j < n){
gvl = vsetvl_e64m4(n-j);
vx = vlse_v_f32m2(&x[j*inc_x], stride_x, gvl);
vy = vlse_v_f32m2(&y[j*inc_y], stride_y, gvl);
vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
//vr = vfdot_vv_f32m2(vx, vy, gvl);
vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
dot += (double)vfmv_f_s_f64m1_f64(v_res);

}
}
return(dot);
}

0 comments on commit 069eb2b

Please sign in to comment.