From f0d844a5a3599acb2d15377746e3484a2e23a6f0 Mon Sep 17 00:00:00 2001
From: fp64 <106717720+fp64@users.noreply.github.com>
Date: Wed, 14 Jun 2023 22:02:50 -0400
Subject: [PATCH] Convert Dot33 to SSE2

Simpler, lower requirements, and doesn't seem to hurt speed. See #17571.
---
 GPU/Software/Lighting.cpp | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)
diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 58b7e0bd3e24..fb8bdf91cdcb 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -255,23 +255,13 @@ static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
 #endif
 }
 
-#if defined(_M_SSE)
-#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
-[[gnu::target("sse4.1")]]
-#endif
-static inline __m128 Dot33SSE4(__m128 a, __m128 b) {
-	__m128 multiplied = _mm_insert_ps(_mm_mul_ps(a, b), _mm_setzero_ps(), 0x30);
-	__m128 lanes3311 = _mm_movehdup_ps(multiplied);
-	__m128 partial = _mm_add_ps(multiplied, lanes3311);
-	return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial));
-}
-#endif
-
-template <bool useSSE4>
 static inline float Dot33(const Vec3f &a, const Vec3f &b) {
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-	if (useSSE4)
-		return _mm_cvtss_f32(Dot33SSE4(a.vec, b.vec));
+#if defined(_M_SSE)
+	__m128 v = _mm_mul_ps(a.vec, b.vec); // [X, Y, Z, W]
+	__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)); // [Y, X, Z, W]
+	__m128 sums = _mm_add_ps(v, shuf); // [X + Y, X + Y, Z + Z, W + W]
+	shuf = _mm_movehl_ps(shuf, shuf); // [Z, W, Z, W]
+	return _mm_cvtss_f32(_mm_add_ss(sums, shuf)); // X + Y + Z
 #elif PPSSPP_ARCH(ARM64_NEON)
 	float32x4_t multipled = vsetq_lane_f32(0.0f, vmulq_f32(a.vec, b.vec), 3);
 	float32x2_t add1 = vget_low_f32(vpaddq_f32(multipled, multipled));
@@ -311,7 +301,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 			// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
 			float d = L.NormalizeOr001();
 
-			att = 1.0f / Dot33<useSSE4>(lstate.att, Vec3f(1.0f, d, d * d));
+			att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d));
 			if (!(att > 0.0f))
 				att = 0.0f;
 			else if (att > 1.0f)
@@ -320,7 +310,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 
 		float spot = 1.0f;
 		if (lstate.spot) {
-			float rawSpot = Dot33<useSSE4>(lstate.spotDir, L);
+			float rawSpot = Dot33(lstate.spotDir, L);
 			if (std::isnan(rawSpot))
 				rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;
 
@@ -345,7 +335,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 		// diffuse lighting
 		float diffuse_factor;
 		if (lstate.diffuse || lstate.specular) {
-			diffuse_factor = Dot33<useSSE4>(L, worldnormal);
+			diffuse_factor = Dot33(L, worldnormal);
 			if (lstate.poweredDiffuse) {
 				diffuse_factor = pspLightPow(diffuse_factor, state.specularExp);
 			}
@@ -363,7 +353,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 		if (lstate.specular && diffuse_factor >= 0.0f) {
 			Vec3<float> H = L + Vec3<float>(0.f, 0.f, 1.f);
 
-			float specular_factor = Dot33<useSSE4>(H.NormalizedOr001(useSSE4), worldnormal);
+			float specular_factor = Dot33(H.NormalizedOr001(useSSE4), worldnormal);
 			specular_factor = pspLightPow(specular_factor, state.specularExp);
 
 			if (specular_factor > 0.0f) {