From 7e85d3d10a83b3bd767f30f1bd974c90e086dce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 9 Dec 2023 14:50:14 +0100 Subject: [PATCH] Disable the new culling on RISC-V for now. --- Common/Math/CrossSIMD.h | 2 +- GPU/Common/DrawEngineCommon.cpp | 15 +++++---------- GPU/GPUCommonHW.cpp | 9 +++++++++ ppsspp_config.h | 2 ++ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index c2851b5775b3..3eb8e0e75e2f 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -10,7 +10,7 @@ #include -#ifdef _M_SSE +#if PPSSPP_ARCH(SSE2) #include #endif diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 71ead60a021b..96a81b36b934 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -433,12 +433,12 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u break; case GE_VTYPE_POS_16BIT: { -#if defined(_M_SSE) +#if PPSSPP_ARCH(SSE2) __m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f); for (int i = 0; i < vertexCount; i++) { const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset)); __m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data)); - // Sign extension. Ugly without SSE4. + // Sign extension. Hacky without SSE4. bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16); __m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor); _mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes? @@ -470,11 +470,7 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u // We only check the 4 sides. Near/far won't likely make a huge difference. // We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space // for testing, don't want to re-do that, so we have to use that "pivot" of the data. -<<<<<<< HEAD -#ifdef _M_SSE -======= #if PPSSPP_ARCH(SSE2) ->>>>>>> c5a94c3799 (Buildfix again) const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix); const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3); const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6); @@ -498,9 +494,9 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u ); // OK, now we check it against the four planes. // This is really curiously similar to a matrix multiplication (well, it is one). - __m128 posX = _mm_shuffle_ps(worldpos, worldpos, 0); - __m128 posY = _mm_shuffle_ps(worldpos, worldpos, 1 | (1 << 2) | (1 << 4) | (1 << 6)); - __m128 posZ = _mm_shuffle_ps(worldpos, worldpos, 2 | (2 << 2) | (2 << 4) | (2 << 6)); + __m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0)); + __m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1)); + __m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2)); __m128 planeDist = _mm_add_ps( _mm_add_ps( _mm_mul_ps(planeX, posX), @@ -566,7 +562,6 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u } } #endif - return true; } diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index 8af997b67b64..66906f4b1b47 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -992,8 +992,17 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { #define MAX_CULL_CHECK_COUNT 6 +// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V. +#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2) + #define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT) +#else + +#define PASSES_CULLING true + +#endif + // If certain conditions are true, do frustum culling. bool passCulling = PASSES_CULLING; if (!passCulling) { diff --git a/ppsspp_config.h b/ppsspp_config.h index 2861b621b333..71e7b9c9466d 100644 --- a/ppsspp_config.h +++ b/ppsspp_config.h @@ -11,6 +11,7 @@ #if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__) #define PPSSPP_ARCH_X86 1 #define PPSSPP_ARCH_32BIT 1 + #define PPSSPP_ARCH_SSE2 1 //TODO: Remove this compat define #ifndef _M_IX86 #define _M_IX86 600 @@ -19,6 +20,7 @@ #if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__) #define PPSSPP_ARCH_AMD64 1 + #define PPSSPP_ARCH_SSE2 1 #if defined(__ILP32__) #define PPSSPP_ARCH_32BIT 1 #else