NEON culling: Use mla operations to shave off some more cycles. ARM32…

… compat.
hrydgard · Dec 9, 2023 · 99548be · 99548be
1 parent 6a7ef83
commit 99548be
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 22 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -737,6 +737,7 @@ add_library(Common STATIC
 	Common/Input/InputState.cpp
 	Common/Input/InputState.h
 	Common/Math/fast/fast_matrix.c
+	Common/Math/CrossSIMD.h
 	Common/Math/curves.cpp
 	Common/Math/curves.h
 	Common/Math/expression_parser.cpp

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
@@ -0,0 +1,54 @@
+// CrossSIMD
+//
+// Compatibility wrappers for SIMD dialects.
+//
+// In the long run, might do a more general single-source-SIMD wrapper here consisting
+// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
+// our various color conversion functions and so on in a pretty generic manner.
+
+#include "ppsspp_config.h"
+
+#include <cstdint>
+
+#ifdef _M_SSE
+#include <emmintrin.h>
+#endif
+
+#if PPSSPP_ARCH(ARM_NEON)
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+// Basic types
+
+#if PPSSPP_ARCH(ARM64_NEON)
+
+// No special ones here.
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+// Compatibility wrappers making ARM64 NEON code run on ARM32
+// With optimization on, these should compile down to the optimal code.
+
+inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
+	switch (lane & 3) {
+	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
+	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
+	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
+	case 3: return vmulq_lane_f32(a, vget_high_f32(b), 1);
+	}
+}
+
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
+	switch (lane & 3) {
+	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+	case 3: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+	}
+}
+
+#endif
diff --git a/Core/Config.cpp b/Core/Config.cpp
@@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {
 
 		// Parse the string.
 		PlayTime gameTime{};
-		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
+		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
 			tracker_[key] = gameTime;
 		}
 	}

diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
@@ -21,6 +21,7 @@
 #include "Common/Data/Convert/ColorConv.h"
 #include "Common/Profiler/Profiler.h"
 #include "Common/LogReporting.h"
+#include "Common/Math/CrossSIMD.h"
 #include "Common/Math/lin/matrix4x4.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
@@ -442,12 +443,12 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
 			__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
 			_mm_storeu_ps(verts + i * 3, pos);  // TODO: use stride 4 to avoid clashing writes?
 		}
-#elif 0 && PPSSPP_ARCH(ARM_NEON)
-		__m128 scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
+#elif PPSSPP_ARCH(ARM_NEON)
+		float32x4_t scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
 		for (int i = 0; i < vertexCount; i++) {
-			const int16_t *dataPtr = ((const int16_t *)((const s8 *)vdata + i * stride + offset));
+			const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
 			int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
-			float32x4_t pos = vmulq_f32(scaleFactor, vcvtq_s32_f32(data));  // This does the division by 32768.0f, effectively.
+			float32x4_t pos = vmulq_f32(vcvtq_f32_s32(data), scaleFactor);
 			vst1q_f32(verts + i * 3, pos);
 		}
 #else
@@ -470,7 +471,11 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
 	// We only check the 4 sides. Near/far won't likely make a huge difference.
 	// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
 	// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
+<<<<<<< HEAD
 #ifdef _M_SSE
+=======
+#if PPSSPP_ARCH(SSE2)
+>>>>>>> c5a94c3799 (Buildfix again)
 	const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
 	const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
 	const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
@@ -509,8 +514,9 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
 		);
 		inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
 	}
-	u8 mask = _mm_movemask_ps(inside);
-	return mask == 0xF;  // 0xF means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
+	// 0xF means that we found at least one vertex inside every one of the planes.
+	// We don't bother with counts, though it wouldn't be hard if we had a use for them.
+	return _mm_movemask_ps(inside) == 0xF;
 #elif PPSSPP_ARCH(ARM_NEON)
 	const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
 	const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
@@ -525,28 +531,20 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
 		const float *pos = verts + i * vertStride;
 		float32x4_t objpos = vld1q_f32(pos);
 		float32x4_t worldpos = vaddq_f32(
-			vaddq_f32(
+			vmlaq_laneq_f32(
 				vmulq_laneq_f32(worldX, objpos, 0),
-				vmulq_laneq_f32(worldY, objpos, 1)
-			),
-			vaddq_f32(
-				vmulq_laneq_f32(worldZ, objpos, 2),
-				worldW
-			)
+				worldY, objpos, 1),
+			vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
 		);
 		// OK, now we check it against the four planes.
 		// This is really curiously similar to a matrix multiplication (well, it is one).
 		float32x4_t planeDist = vaddq_f32(
-			vaddq_f32(
+			vmlaq_laneq_f32(
 				vmulq_laneq_f32(planeX, worldpos, 0),
-				vmulq_laneq_f32(planeY, worldpos, 1)
-			),
-			vaddq_f32(
-				vmulq_laneq_f32(planeZ, worldpos, 2),
-				planeW
-			)
+				planeY, worldpos, 1),
+			vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
 		);
-		inside = vorrq_u32(inside, vcgeq_f32(planeDist, vdupq_n_f32(0.0f)));
+		inside = vorrq_u32(inside, vcgezq_f32(planeDist));
 	}
 	uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
 	return ~insideBits == 0;  // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.

diff --git a/UWP/CommonUWP/CommonUWP.vcxproj b/UWP/CommonUWP/CommonUWP.vcxproj
@@ -105,6 +105,7 @@
     <ClInclude Include="..\..\Common\File\AndroidStorage.h" />
     <ClInclude Include="..\..\Common\GPU\GPUBackendCommon.h" />
     <ClInclude Include="..\..\Common\GPU\Vulkan\VulkanLoader.h" />
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
     <ClInclude Include="..\..\Common\Math\Statistics.h" />
     <ClInclude Include="..\..\Common\Net\HTTPNaettRequest.h" />
     <ClInclude Include="..\..\Common\Net\HTTPRequest.h" />

diff --git a/UWP/CommonUWP/CommonUWP.vcxproj.filters b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@@ -862,6 +862,9 @@
     <ClInclude Include="..\..\ext\naett\naett.h">
       <Filter>ext\naett</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h">
+      <Filter>Math</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="..\..\Common\Math\fast\fast_matrix_neon.S">