Skip to content

Commit

Permalink
Merge pull request opencv#24325 from hanliutong:rewrite
Browse files Browse the repository at this point in the history
Rewrite Universal Intrinsic code: float related part opencv#24325

The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.

The series of PRs is listed below:
opencv#23885 First patch, an example
opencv#23980 Core module
opencv#24058 ImgProc module, part 1
opencv#24132 ImgProc module, part 2
opencv#24166 ImgProc module, part 3
opencv#24301 Features2d and calib3d module
opencv#24324 Gapi module

This patch (hopefully) is the last one in the series. 

This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`, 
    so the `CV_SIMD` blocks that are not yet enabled for `CV_SIMD_SCALABLE` can be found by searching for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
    - Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
    - Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
      - ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
      - ./modules/imgproc/src/color_lab.cpp (Array of vector type)
      - ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
      - ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
      These algorithms will need to be redesigned to accommodate scalable backends.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
  • Loading branch information
hanliutong committed Oct 7, 2023
1 parent faf1ff7 commit 6d20cab
Show file tree
Hide file tree
Showing 12 changed files with 345 additions and 328 deletions.
74 changes: 37 additions & 37 deletions modules/calib3d/src/undistort.simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
s2(_s2),
s3(_s3),
s4(_s4) {
#if CV_SIMD_64F
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
{
s_x[i] = ir[0] * i;
s_y[i] = ir[3] * i;
Expand Down Expand Up @@ -123,26 +123,26 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
else
CV_Assert(m1 != NULL);

#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const v_float64 v_one = vx_setall_f64(1.0);
for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
{
v_float64 m_0, m_1, m_2, m_3;
m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);

v_float64 xd_0 = x_0 * x_0;
v_float64 yd_0 = y_0 * y_0;
v_float64 xd_1 = x_1 * x_1;
v_float64 yd_1 = y_1 * y_1;
v_float64 xd_0 = v_mul(x_0, x_0);
v_float64 yd_0 = v_mul(y_0, y_0);
v_float64 xd_1 = v_mul(x_1, x_1);
v_float64 yd_1 = v_mul(y_1, y_1);

v_float64 r2_0 = xd_0 + yd_0;
v_float64 r2_1 = xd_1 + yd_1;
v_float64 r2_0 = v_add(xd_0, yd_0);
v_float64 r2_1 = v_add(xd_1, yd_1);

m_1 = vx_setall_f64(k3);
m_2 = vx_setall_f64(k2);
Expand All @@ -151,18 +151,18 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
m_3 = vx_setall_f64(k6);
m_2 = vx_setall_f64(k5);
m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));

m_3 = vx_setall_f64(2.0);
xd_0 = v_muladd(m_3, xd_0, r2_0);
yd_0 = v_muladd(m_3, yd_0, r2_0);
xd_1 = v_muladd(m_3, xd_1, r2_1);
yd_1 = v_muladd(m_3, yd_1, r2_1);
m_2 = x_0 * y_0 * m_3;
m_3 = x_1 * y_1 * m_3;
m_2 = v_mul(v_mul(x_0, y_0), m_3);
m_3 = v_mul(v_mul(x_1, y_1), m_3);

x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);

m_0 = vx_setall_f64(p1);
m_1 = vx_setall_f64(p2);
Expand All @@ -176,8 +176,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
xd_1 = v_muladd(m_0, m_3, xd_1);
yd_1 = v_muladd(m_1, m_3, yd_1);

m_0 = r2_0 * r2_0;
m_1 = r2_1 * r2_1;
m_0 = v_mul(r2_0, r2_0);
m_1 = v_mul(r2_1, r2_1);
m_2 = vx_setall_f64(s2);
m_3 = vx_setall_f64(s1);
xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
Expand All @@ -203,17 +203,17 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
m_0 = vx_setzero_f64();
r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));

m_0 = vx_setall_f64(fx);
m_1 = vx_setall_f64(u0);
m_2 = vx_setall_f64(fy);
m_3 = vx_setall_f64(v0);
x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);

if (m1type == CV_32FC1)
{
Expand All @@ -225,20 +225,20 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
v_float32 mf0, mf1;
v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
v_store(&m1f[j * 2], mf0);
v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
}
else // m1type == CV_16SC2
{
m_0 = vx_setall_f64(INTER_TAB_SIZE);
x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);

v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
v_int32 iu = v_round(x_0, x_1);
v_int32 iv = v_round(y_0, y_1);

v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
v_int32 out0, out1;
v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
v_store(&m1[j * 2], v_pack(out0, out1));
}
}
Expand Down Expand Up @@ -302,10 +302,10 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
double s2;
double s3;
double s4;
#if CV_SIMD_64F
double s_x[2*v_float64::nlanes];
double s_y[2*v_float64::nlanes];
double s_w[2*v_float64::nlanes];
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
double s_x[2*VTraits<v_float64>::max_nlanes];
double s_y[2*VTraits<v_float64>::max_nlanes];
double s_w[2*VTraits<v_float64>::max_nlanes];
#endif
};
}
Expand Down
17 changes: 15 additions & 2 deletions modules/core/include/opencv2/core/hal/intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,15 @@ namespace CV__SIMD_NAMESPACE {
{ \
return a op b; \
}
// Wraps only the equality comparisons (v_eq / v_ne) for a vector type.
// Applied to the 64-bit integer vector types, which do not receive the full
// OPENCV_HAL_WRAP_CMP set (lt/gt/le/ge) — presumably because ordered 64-bit
// integer comparisons are not supported by every SIMD backend; confirm
// against the per-backend intrinsic implementations.
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}

#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
Expand All @@ -984,11 +993,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
Expand All @@ -997,9 +1006,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
Expand All @@ -1009,9 +1020,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
Expand Down

0 comments on commit 6d20cab

Please sign in to comment.