Skip to content

Commit

Permalink
NEON culling: Use mla operations to shave off some more cycles. ARM32…
Browse files Browse the repository at this point in the history
… compat.
  • Loading branch information
hrydgard committed Dec 9, 2023
1 parent 6a7ef83 commit 99548be
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 22 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,7 @@ add_library(Common STATIC
Common/Input/InputState.cpp
Common/Input/InputState.h
Common/Math/fast/fast_matrix.c
Common/Math/CrossSIMD.h
Common/Math/curves.cpp
Common/Math/curves.h
Common/Math/expression_parser.cpp
Expand Down
54 changes: 54 additions & 0 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// CrossSIMD
//
// Compatibility wrappers for SIMD dialects.
//
// In the long run, might do a more general single-source-SIMD wrapper here consisting
// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
// our various color conversion functions and so on in a pretty generic manner.

#include "ppsspp_config.h"

#include <cstdint>

#ifdef _M_SSE
#include <emmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

// Basic types

#if PPSSPP_ARCH(ARM64_NEON)

// No special ones here.

#elif PPSSPP_ARCH(ARM_NEON)

// Compatibility wrappers making ARM64 NEON code run on ARM32
// With optimization on, these should compile down to the optimal code.

inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
switch (lane & 3) {
case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
case 3: return vmulq_lane_f32(a, vget_high_f32(b), 1);
}
}

inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
switch (lane & 3) {
case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
case 3: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
}
}

#endif
2 changes: 1 addition & 1 deletion Core/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {

// Parse the string.
PlayTime gameTime{};
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
tracker_[key] = gameTime;
}
}
Expand Down
40 changes: 19 additions & 21 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Profiler/Profiler.h"
#include "Common/LogReporting.h"
#include "Common/Math/CrossSIMD.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
Expand Down Expand Up @@ -442,12 +443,12 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
}
#elif 0 && PPSSPP_ARCH(ARM_NEON)
__m128 scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
#elif PPSSPP_ARCH(ARM_NEON)
float32x4_t scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
for (int i = 0; i < vertexCount; i++) {
const int16_t *dataPtr = ((const int16_t *)((const s8 *)vdata + i * stride + offset));
const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
float32x4_t pos = vmulq_f32(scaleFactor, vcvtq_s32_f32(data)); // This does the division by 32768.0f, effectively.
float32x4_t pos = vmulq_f32(vcvtq_f32_s32(data), scaleFactor);
vst1q_f32(verts + i * 3, pos);
}
#else
Expand All @@ -470,7 +471,11 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
// We only check the 4 sides. Near/far won't likely make a huge difference.
// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
<<<<<<< HEAD
#ifdef _M_SSE
=======
#if PPSSPP_ARCH(SSE2)
>>>>>>> c5a94c3799 (Buildfix again)
const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
Expand Down Expand Up @@ -509,8 +514,9 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
);
inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
}
u8 mask = _mm_movemask_ps(inside);
return mask == 0xF; // 0xF means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
// 0xF means that we found at least one vertex inside every one of the planes.
// We don't bother with counts, though it wouldn't be hard if we had a use for them.
return _mm_movemask_ps(inside) == 0xF;
#elif PPSSPP_ARCH(ARM_NEON)
const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
Expand All @@ -525,28 +531,20 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
const float *pos = verts + i * vertStride;
float32x4_t objpos = vld1q_f32(pos);
float32x4_t worldpos = vaddq_f32(
vaddq_f32(
vmlaq_laneq_f32(
vmulq_laneq_f32(worldX, objpos, 0),
vmulq_laneq_f32(worldY, objpos, 1)
),
vaddq_f32(
vmulq_laneq_f32(worldZ, objpos, 2),
worldW
)
worldY, objpos, 1),
vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
);
// OK, now we check it against the four planes.
// This is really curiously similar to a matrix multiplication (well, it is one).
float32x4_t planeDist = vaddq_f32(
vaddq_f32(
vmlaq_laneq_f32(
vmulq_laneq_f32(planeX, worldpos, 0),
vmulq_laneq_f32(planeY, worldpos, 1)
),
vaddq_f32(
vmulq_laneq_f32(planeZ, worldpos, 2),
planeW
)
planeY, worldpos, 1),
vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
);
inside = vorrq_u32(inside, vcgeq_f32(planeDist, vdupq_n_f32(0.0f)));
inside = vorrq_u32(inside, vcgezq_f32(planeDist));
}
uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
return ~insideBits == 0; // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
Expand Down
1 change: 1 addition & 0 deletions UWP/CommonUWP/CommonUWP.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
<ClInclude Include="..\..\Common\File\AndroidStorage.h" />
<ClInclude Include="..\..\Common\GPU\GPUBackendCommon.h" />
<ClInclude Include="..\..\Common\GPU\Vulkan\VulkanLoader.h" />
<ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
<ClInclude Include="..\..\Common\Math\Statistics.h" />
<ClInclude Include="..\..\Common\Net\HTTPNaettRequest.h" />
<ClInclude Include="..\..\Common\Net\HTTPRequest.h" />
Expand Down
3 changes: 3 additions & 0 deletions UWP/CommonUWP/CommonUWP.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,9 @@
<ClInclude Include="..\..\ext\naett\naett.h">
<Filter>ext\naett</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Math\CrossSIMD.h">
<Filter>Math</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\..\Common\Math\fast\fast_matrix_neon.S">
Expand Down

0 comments on commit 99548be

Please sign in to comment.