Skip to content

Commit

Permalink
Merge pull request opencv#24325 from hanliutong:rewrite
Browse files Browse the repository at this point in the history
Rewrite Universal Intrinsic code: float related part opencv#24325

The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.

The series of PRs is listed below:
opencv#23885 First patch, an example
opencv#23980 Core module
opencv#24058 ImgProc module, part 1
opencv#24132 ImgProc module, part 2
opencv#24166 ImgProc module, part 3
opencv#24301 Features2d and calib3d module
opencv#24324 Gapi module

This patch (hopefully) is the last one in the series. 

This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`, 
    so the `CV_SIMD` blocks that are not yet enabled for `CV_SIMD_SCALABLE` can be found by searching for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
    - Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
    - Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
      - ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
      - ./modules/imgproc/src/color_lab.cpp (Array of vector type)
      - ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
      - ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
      These algorithms will need to be redesigned to accommodate scalable backends.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
  • Loading branch information
hanliutong committed Oct 7, 2023
1 parent faf1ff7 commit 6d20cab
Show file tree
Hide file tree
Showing 12 changed files with 345 additions and 328 deletions.
74 changes: 37 additions & 37 deletions modules/calib3d/src/undistort.simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
s2(_s2),
s3(_s3),
s4(_s4) {
#if CV_SIMD_64F
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
{
s_x[i] = ir[0] * i;
s_y[i] = ir[3] * i;
Expand Down Expand Up @@ -123,26 +123,26 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
else
CV_Assert(m1 != NULL);

#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const v_float64 v_one = vx_setall_f64(1.0);
for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
{
v_float64 m_0, m_1, m_2, m_3;
m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);

v_float64 xd_0 = x_0 * x_0;
v_float64 yd_0 = y_0 * y_0;
v_float64 xd_1 = x_1 * x_1;
v_float64 yd_1 = y_1 * y_1;
v_float64 xd_0 = v_mul(x_0, x_0);
v_float64 yd_0 = v_mul(y_0, y_0);
v_float64 xd_1 = v_mul(x_1, x_1);
v_float64 yd_1 = v_mul(y_1, y_1);

v_float64 r2_0 = xd_0 + yd_0;
v_float64 r2_1 = xd_1 + yd_1;
v_float64 r2_0 = v_add(xd_0, yd_0);
v_float64 r2_1 = v_add(xd_1, yd_1);

m_1 = vx_setall_f64(k3);
m_2 = vx_setall_f64(k2);
Expand All @@ -151,18 +151,18 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
m_3 = vx_setall_f64(k6);
m_2 = vx_setall_f64(k5);
m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));

m_3 = vx_setall_f64(2.0);
xd_0 = v_muladd(m_3, xd_0, r2_0);
yd_0 = v_muladd(m_3, yd_0, r2_0);
xd_1 = v_muladd(m_3, xd_1, r2_1);
yd_1 = v_muladd(m_3, yd_1, r2_1);
m_2 = x_0 * y_0 * m_3;
m_3 = x_1 * y_1 * m_3;
m_2 = v_mul(v_mul(x_0, y_0), m_3);
m_3 = v_mul(v_mul(x_1, y_1), m_3);

x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);

m_0 = vx_setall_f64(p1);
m_1 = vx_setall_f64(p2);
Expand All @@ -176,8 +176,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
xd_1 = v_muladd(m_0, m_3, xd_1);
yd_1 = v_muladd(m_1, m_3, yd_1);

m_0 = r2_0 * r2_0;
m_1 = r2_1 * r2_1;
m_0 = v_mul(r2_0, r2_0);
m_1 = v_mul(r2_1, r2_1);
m_2 = vx_setall_f64(s2);
m_3 = vx_setall_f64(s1);
xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
Expand All @@ -203,17 +203,17 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
m_0 = vx_setzero_f64();
r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));

m_0 = vx_setall_f64(fx);
m_1 = vx_setall_f64(u0);
m_2 = vx_setall_f64(fy);
m_3 = vx_setall_f64(v0);
x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);

if (m1type == CV_32FC1)
{
Expand All @@ -225,20 +225,20 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
v_float32 mf0, mf1;
v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
v_store(&m1f[j * 2], mf0);
v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
}
else // m1type == CV_16SC2
{
m_0 = vx_setall_f64(INTER_TAB_SIZE);
x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);

v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
v_int32 iu = v_round(x_0, x_1);
v_int32 iv = v_round(y_0, y_1);

v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
v_int32 out0, out1;
v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
v_store(&m1[j * 2], v_pack(out0, out1));
}
}
Expand Down Expand Up @@ -302,10 +302,10 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
double s2;
double s3;
double s4;
#if CV_SIMD_64F
double s_x[2*v_float64::nlanes];
double s_y[2*v_float64::nlanes];
double s_w[2*v_float64::nlanes];
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
double s_x[2*VTraits<v_float64>::max_nlanes];
double s_y[2*VTraits<v_float64>::max_nlanes];
double s_w[2*VTraits<v_float64>::max_nlanes];
#endif
};
}
Expand Down
17 changes: 15 additions & 2 deletions modules/core/include/opencv2/core/hal/intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,15 @@ namespace CV__SIMD_NAMESPACE {
{ \
return a op b; \
}
// Wraps only the equality comparisons (v_eq / v_ne) for a vector type.
// Applied to the 64-bit integer vector types, which do not receive the full
// OPENCV_HAL_WRAP_CMP set (lt/gt/le/ge) — presumably because ordered 64-bit
// integer comparisons are not supported by every SIMD backend; confirm
// against the per-backend intrinsic implementations.
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}

#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
Expand All @@ -984,11 +993,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
Expand All @@ -997,9 +1006,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
Expand All @@ -1009,9 +1020,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
Expand Down

0 comments on commit 6d20cab

Please sign in to comment.