diff --git a/CMakeLists.txt b/CMakeLists.txt
index caaef337810f..520ae2b0a945 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -737,6 +737,7 @@ add_library(Common STATIC
 	Common/Input/InputState.cpp
 	Common/Input/InputState.h
 	Common/Math/fast/fast_matrix.c
+	Common/Math/CrossSIMD.h
 	Common/Math/curves.cpp
 	Common/Math/curves.h
 	Common/Math/expression_parser.cpp
diff --git a/Common/Common.vcxproj b/Common/Common.vcxproj
index 93e91681b0cf..2b31119bd9cc 100644
--- a/Common/Common.vcxproj
+++ b/Common/Common.vcxproj
@@ -484,6 +484,7 @@
+
diff --git a/Common/Common.vcxproj.filters b/Common/Common.vcxproj.filters
index 73aefb9323bf..6d82c03c0417 100644
--- a/Common/Common.vcxproj.filters
+++ b/Common/Common.vcxproj.filters
@@ -518,6 +518,9 @@
       GPU\Vulkan
+
+      Math
+
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
new file mode 100644
index 000000000000..3eb8e0e75e2f
--- /dev/null
+++ b/Common/Math/CrossSIMD.h
@@ -0,0 +1,58 @@
+// CrossSIMD
+//
+// Compatibility wrappers for SIMD dialects.
+//
+// In the long run, might do a more general single-source-SIMD wrapper here consisting
+// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
+// our various color conversion functions and so on in a pretty generic manner.
+
+#include "ppsspp_config.h"
+
+#include <cstdint>
+
+#if PPSSPP_ARCH(SSE2)
+#include <emmintrin.h>
+#endif
+
+#if PPSSPP_ARCH(ARM_NEON)
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+// Basic types
+
+#if PPSSPP_ARCH(ARM64_NEON)
+
+// No special ones here.
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+// Compatibility wrappers making ARM64 NEON code run on ARM32
+// With optimization on, these should compile down to the optimal code.
+
+inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
+	switch (lane & 3) {
+	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
+	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
+	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
+	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
+	}
+}
+
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
+	switch (lane & 3) {
+	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+	}
+}
+
+inline uint32x4_t vcgezq_f32(float32x4_t v) {
+	return vcgeq_f32(v, vdupq_n_f32(0.0f));
+}
+
+#endif
diff --git a/Common/UI/View.cpp b/Common/UI/View.cpp
index b60f6ac88de8..37f8e9ab02aa 100644
--- a/Common/UI/View.cpp
+++ b/Common/UI/View.cpp
@@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo
 
 void CollapsibleHeader::Draw(UIContext &dc) {
 	Style style = dc.theme->itemStyle;
-	style.background.color = 0;
 	if (HasFocus()) style = dc.theme->itemFocusedStyle;
 	if (down_) style = dc.theme->itemDownStyle;
 	if (!IsEnabled()) style = dc.theme->itemDisabledStyle;
diff --git a/Core/Config.cpp b/Core/Config.cpp
index fc4a2b913a06..eec83a23e87c 100644
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {
 
 		// Parse the string.
 		PlayTime gameTime{};
-		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
+		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
 			tracker_[key] = gameTime;
 		}
 	}
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 8927a1c2a099..96a81b36b934 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -21,6 +21,7 @@
 #include "Common/Data/Convert/ColorConv.h"
 #include "Common/Profiler/Profiler.h"
 #include "Common/LogReporting.h"
+#include "Common/Math/CrossSIMD.h"
 #include "Common/Math/lin/matrix4x4.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
@@ -197,15 +198,10 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex
 
 // Gated by DIRTY_CULL_PLANES
 void DrawEngineCommon::UpdatePlanes() {
-	float world[16];
 	float view[16];
-	float worldview[16];
-	float worldviewproj[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	float viewproj[16];
 	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
+	Matrix4ByMatrix4(viewproj, view, gstate.projMatrix);
 
 	// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
 	// Note that the PSP does not clip against the viewport.
@@ -214,6 +210,9 @@ void DrawEngineCommon::UpdatePlanes() {
 	minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
 	maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);
+	// Let's not handle these special cases in the fast culler.
+	offsetOutsideEdge_ = maxOffset_.x >= 4096.0f || minOffset_.x < 1.0f || minOffset_.y < 1.0f || maxOffset_.y >= 4096.0f;
+
 	// Now let's apply the viewport to our scissor/region + offset range.
 	Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
 	Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
@@ -232,14 +231,14 @@ void DrawEngineCommon::UpdatePlanes() {
 	applyViewport.wy = -(maxViewport.y + minViewport.y) * viewportInvSize.y;
 
 	float mtx[16];
-	Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);
-
-	planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]);  // Right
-	planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]);  // Left
-	planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]);  // Bottom
-	planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]);  // Top
-	planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
-	planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
+	Matrix4ByMatrix4(mtx, viewproj, applyViewport.m);
+	// I'm sure there's some fairly optimized way to set these.
+	planes_.Set(0, mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]);  // Right
+	planes_.Set(1, mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]);  // Left
+	planes_.Set(2, mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]);  // Bottom
+	planes_.Set(3, mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]);  // Top
+	planes_.Set(4, mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
+	planes_.Set(5, mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
 }
 
 // This code has plenty of potential for optimization.
@@ -262,7 +261,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 	SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
 	float *verts = (float *)(decoded_ + 65536 * 18);
-	int vertStride = 3;
 
 	// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
 	// Let's always say objects are within bounds.
@@ -338,17 +336,23 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 				}
 				break;
 			case GE_VTYPE_POS_FLOAT:
-				// No need to copy in this case, we can just read directly from the source format with a stride.
-				verts = (float *)((uint8_t *)vdata + offset);
-				vertStride = stride / 4;
-				// Previous code:
-				// for (int i = 0; i < vertexCount; i++)
-				//   memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
+				for (int i = 0; i < vertexCount; i++)
+					memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
 				break;
 			}
 		}
 	}
 
+	// Pretransform the verts in-place so we don't have to do it inside the loop.
+	// We do this differently in the fast version below since we skip the max/minOffset checks there
+	// making it easier to get the whole thing ready for SIMD.
+	for (int i = 0; i < vertexCount; i++) {
+		float worldpos[3];
+		Vec3ByMatrix43(worldpos, &verts[i * 3], gstate.worldMatrix);
+		memcpy(&verts[i * 3], worldpos, 12);
+	}
+
 	// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
 	int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
 	for (int plane = 0; plane < totalPlanes; plane++) {
@@ -358,8 +362,8 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 			// Test against the frustum planes, and count.
 			// TODO: We should test 4 vertices at a time using SIMD.
 			// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
-			const float *pos = verts + i * vertStride;
-			float value = planes_[plane].Test(pos);
+			const float *worldpos = verts + i * 3;
+			float value = planes_.Test(plane, worldpos);
 			if (value <= -FLT_EPSILON)  // Not sure why we use exactly this value. Probably '< 0' would do.
 				out++;
 			else
@@ -388,6 +392,179 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 	return true;
 }
 
+// NOTE: This doesn't handle through-mode, indexing, morph, or skinning.
+bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
+	SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
+	float *verts = (float *)(decoded_ + 65536 * 18);
+
+	// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
+	// Let's always say objects are within bounds.
+	if (gstate_c.Use(GPU_USE_VIRTUAL_REALITY))
+		return true;
+
+	// Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
+	// in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
+	if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
+		UpdatePlanes();
+		gpuStats.numPlaneUpdates++;
+		gstate_c.Clean(DIRTY_CULL_PLANES);
+	}
+
+	// Also let's just bail if offsetOutsideEdge_ is set, instead of handling the cases.
+	// NOTE: This is written to in UpdatePlanes so can't check it before.
+	if (offsetOutsideEdge_)
+		return true;
+
+	// Simple, most common case.
+	VertexDecoder *dec = GetVertexDecoder(vertType);
+	int stride = dec->VertexSize();
+	int offset = dec->posoff;
+	int vertStride = 3;
+
+	// TODO: Possibly do the plane tests directly against the source formats instead of converting.
+	switch (vertType & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_8BIT:
+		for (int i = 0; i < vertexCount; i++) {
+			const s8 *data = (const s8 *)vdata + i * stride + offset;
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * (1.0f / 128.0f);
+			}
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+	{
+#if PPSSPP_ARCH(SSE2)
+		__m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
+			// Sign extension. Hacky without SSE4.
+			bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
+			__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
+			_mm_storeu_ps(verts + i * 3, pos);  // TODO: use stride 4 to avoid clashing writes?
+		}
+#elif PPSSPP_ARCH(ARM_NEON)
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
+			float32x4_t pos = vcvtq_n_f32_s32(data, 15);  // >> 15 = division by 32768.0f
+			vst1q_f32(verts + i * 3, pos);
+		}
+#else
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
+			}
+		}
+#endif
+		break;
+	}
+	case GE_VTYPE_POS_FLOAT:
+		// No need to copy in this case, we can just read directly from the source format with a stride.
+		verts = (float *)((uint8_t *)vdata + offset);
+		vertStride = stride / 4;
+		break;
+	}
+
+	// We only check the 4 sides. Near/far won't likely make a huge difference.
+	// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
+	// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
+#if PPSSPP_ARCH(SSE2)
+	const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
+	const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
+	const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
+	const __m128 worldW = _mm_loadu_ps(gstate.worldMatrix + 9);
+	const __m128 planeX = _mm_loadu_ps(planes_.x);
+	const __m128 planeY = _mm_loadu_ps(planes_.y);
+	const __m128 planeZ = _mm_loadu_ps(planes_.z);
+	const __m128 planeW = _mm_loadu_ps(planes_.w);
+	__m128 inside = _mm_set1_ps(0.0f);
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		__m128 worldpos = _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(worldX, _mm_set1_ps(pos[0])),
+				_mm_mul_ps(worldY, _mm_set1_ps(pos[1]))
+			),
+			_mm_add_ps(
+				_mm_mul_ps(worldZ, _mm_set1_ps(pos[2])),
+				worldW
+			)
+		);
+		// OK, now we check it against the four planes.
+		// This is really curiously similar to a matrix multiplication (well, it is one).
+		__m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0));
+		__m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1));
+		__m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2));
+		__m128 planeDist = _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(planeX, posX),
+				_mm_mul_ps(planeY, posY)
+			),
+			_mm_add_ps(
+				_mm_mul_ps(planeZ, posZ),
+				planeW
+			)
+		);
+		inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
+	}
+	// 0xF means that we found at least one vertex inside every one of the planes.
+	// We don't bother with counts, though it wouldn't be hard if we had a use for them.
+	return _mm_movemask_ps(inside) == 0xF;
+#elif PPSSPP_ARCH(ARM_NEON)
+	const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
+	const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
+	const float32x4_t worldZ = vld1q_f32(gstate.worldMatrix + 6);
+	const float32x4_t worldW = vld1q_f32(gstate.worldMatrix + 9);
+	const float32x4_t planeX = vld1q_f32(planes_.x);
+	const float32x4_t planeY = vld1q_f32(planes_.y);
+	const float32x4_t planeZ = vld1q_f32(planes_.z);
+	const float32x4_t planeW = vld1q_f32(planes_.w);
+	uint32x4_t inside = vdupq_n_u32(0);
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		float32x4_t objpos = vld1q_f32(pos);
+		float32x4_t worldpos = vaddq_f32(
+			vmlaq_laneq_f32(
+				vmulq_laneq_f32(worldX, objpos, 0),
+				worldY, objpos, 1),
+			vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
+		);
+		// OK, now we check it against the four planes.
+		// This is really curiously similar to a matrix multiplication (well, it is one).
+		float32x4_t planeDist = vaddq_f32(
+			vmlaq_laneq_f32(
+				vmulq_laneq_f32(planeX, worldpos, 0),
+				planeY, worldpos, 1),
+			vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
+		);
+		inside = vorrq_u32(inside, vcgezq_f32(planeDist));
+	}
+	uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
+	return ~insideBits == 0;  // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
+#else
+	int inside[4]{};
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		float worldpos[3];
+		Vec3ByMatrix43(worldpos, pos, gstate.worldMatrix);
+		for (int plane = 0; plane < 4; plane++) {
+			float value = planes_.Test(plane, worldpos);
+			if (value >= 0.0f)
+				inside[plane]++;
+		}
+	}
+
+	for (int plane = 0; plane < 4; plane++) {
+		if (inside[plane] == 0) {
+			return false;
+		}
+	}
+#endif
+	return true;
+}
+
 // TODO: This probably is not the best interface.
 bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {
 	// This is always for the current vertices.
@@ -670,6 +847,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
 	return cmd - start;
 }
 
+void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
+	if (!indexGen.PrimCompatible(prevPrim_, prim)) {
+		DispatchFlush();
+	}
+
+	// This isn't exactly right, if we flushed, since prims can straddle previous calls.
+	// But it generally works for common usage.
+	if (prim == GE_PRIM_KEEP_PREVIOUS) {
+		// Has to be set to something, let's assume POINTS (0) if no previous.
+		if (prevPrim_ == GE_PRIM_INVALID)
+			prevPrim_ = GE_PRIM_POINTS;
+		prim = prevPrim_;
+	} else {
+		prevPrim_ = prim;
+	}
+
+	// If vtype has changed, setup the vertex decoder.
+	if (vertTypeID != lastVType_ || !dec_) {
+		dec_ = GetVertexDecoder(vertTypeID);
+		lastVType_ = vertTypeID;
+	}
+
+	*bytesRead = vertexCount * dec_->VertexSize();
+}
+
 // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
 bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
 	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index e9e8870ef33d..147a1178bd56 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -69,11 +69,11 @@ class TessellationDataTransfer {
 	virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
 };
 
-// Culling plane.
-struct Plane {
-	float x, y, z, w;
-	void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
-	float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; }
+// Culling plane, group of 8.
+struct alignas(16) Plane8 {
+	float x[8], y[8], z[8], w[8];
+	void Set(int i, float _x, float _y, float _z, float _w) { x[i] = _x; y[i] = _y; z[i] = _z; w[i] = _w; }
+	float Test(int i, const float f[3]) const { return x[i] * f[0] + y[i] * f[1] + z[i] * f[2] + w[i]; }
 };
 
 class DrawEngineCommon {
@@ -104,6 +104,10 @@ class DrawEngineCommon {
 	bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);
 
+	// This is a less accurate version of TestBoundingBox, but faster. Can have more false positives.
+	// Doesn't support indexing.
+	bool TestBoundingBoxFast(const void *control_points, int vertexCount, u32 vertType);
+
 	void FlushSkin() {
 		bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
 		if (applySkin) {
@@ -113,6 +117,8 @@ class DrawEngineCommon {
 	int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
 	bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
+	void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
+
 	template <class Surface>
 	void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
 	void ClearSplineBezierWeights();
@@ -287,7 +293,8 @@ class DrawEngineCommon {
 	TessellationDataTransfer *tessDataTransfer;
 
 	// Culling
-	Plane planes_[6];
+	Plane8 planes_;
 	Vec2f minOffset_;
 	Vec2f maxOffset_;
+	bool offsetOutsideEdge_;
 };
diff --git a/GPU/GPU.h b/GPU/GPU.h
index 7d4d4d1c0a07..f2edbdc1f624 100644
--- a/GPU/GPU.h
+++ b/GPU/GPU.h
@@ -76,6 +76,7 @@ struct GPUStatistics {
 	void ResetFrame() {
 		numDrawCalls = 0;
 		numVertexDecodes = 0;
+		numCulledDraws = 0;
 		numDrawSyncs = 0;
 		numListSyncs = 0;
 		numVertsSubmitted = 0;
@@ -111,6 +112,7 @@ struct GPUStatistics {
 	// Per frame statistics
 	int numDrawCalls;
 	int numVertexDecodes;
+	int numCulledDraws;
 	int numDrawSyncs;
 	int numListSyncs;
 	int numFlushes;
diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
index f961880d8a42..66906f4b1b47 100644
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@@ -989,9 +989,45 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 	int cullMode = gstate.getCullMode();
 
 	uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
-	if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+
+#define MAX_CULL_CHECK_COUNT 6
+
+// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V.
+#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2)
+
+#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
+
+#else
+
+#define PASSES_CULLING true
+
+#endif
+
+	// If certain conditions are true, do frustum culling.
+	bool passCulling = PASSES_CULLING;
+	if (!passCulling) {
+		// Do software culling.
+		if (drawEngineCommon_->TestBoundingBoxFast(verts, count, vertexType)) {
+			passCulling = true;
+		} else {
+			gpuStats.numCulledDraws++;
+		}
+	}
+
+	// If the first one in a batch passes, let's assume the whole batch passes.
+	// Cuts down on checking, while not losing that much efficiency.
+	bool onePassed = false;
+	if (passCulling) {
+		if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+			canExtend = false;
+		}
+		onePassed = true;
+	} else {
+		// Still need to advance bytesRead.
+		drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
 		canExtend = false;
 	}
+
 	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
 	// Some games rely on this, they don't bother reloading VADDR and IADDR.
 	// The VADDR/IADDR registers are NOT updated.
@@ -1027,7 +1063,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 			bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
 			if (canExtend) {
 				// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
-				// are consecutive in memory.
+				// are consecutive in memory. We also ignore culling here.
 				_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
 				int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
 				if (!commandsExecuted) {
@@ -1047,7 +1083,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 				// We can extend again after submitting a normal draw.
 				canExtend = isTriangle;
 			}
-			if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+
+			bool passCulling = onePassed || PASSES_CULLING;
+			if (!passCulling) {
+				// Do software culling.
+				if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
+					passCulling = true;
+				} else {
+					gpuStats.numCulledDraws++;
+				}
+			}
+			if (passCulling) {
+				if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+					canExtend = false;
+				}
+				// As soon as one passes, assume we don't need to check the rest of this batch.
+				onePassed = true;
+			} else {
+				// Still need to advance bytesRead.
+				drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
 				canExtend = false;
 			}
 			AdvanceVerts(vertexType, count, bytesRead);
@@ -1412,7 +1466,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) {
 		if (dst[i] != newVal) {
 			Flush();
 			dst[i] = newVal;
-			gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+			gstate_c.Dirty(DIRTY_WORLDMATRIX);
 		}
 		if (++i >= end) {
 			break;
 		}
@@ -1435,7 +1489,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) {
 	if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
 		Flush();
 		((u32 *)gstate.worldMatrix)[num] = newVal;
-		gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+		gstate_c.Dirty(DIRTY_WORLDMATRIX);
 	}
 	num++;
 	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
@@ -1691,7 +1745,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ?
 		(float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	return snprintf(buffer, size,
 		"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
-		"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
+		"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
 		"Vertices: %d drawn: %d\n"
 		"FBOs active: %d (evaluations: %d)\n"
 		"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@@ -1705,6 +1759,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		gpuStats.numListSyncs,
 		gpuStats.numDrawCalls,
 		gpuStats.numVertexDecodes,
+		gpuStats.numCulledDraws,
 		gpuStats.numFlushes,
 		gpuStats.numClears,
 		gpuStats.numBBOXJumps,
diff --git a/UWP/CommonUWP/CommonUWP.vcxproj b/UWP/CommonUWP/CommonUWP.vcxproj
index b4d9a8937551..b27cbdbb06be 100644
--- a/UWP/CommonUWP/CommonUWP.vcxproj
+++ b/UWP/CommonUWP/CommonUWP.vcxproj
@@ -105,6 +105,7 @@
+
diff --git a/UWP/CommonUWP/CommonUWP.vcxproj.filters b/UWP/CommonUWP/CommonUWP.vcxproj.filters
index 2eedf9e8c653..262e1e7af3b7 100644
--- a/UWP/CommonUWP/CommonUWP.vcxproj.filters
+++ b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@@ -862,6 +862,9 @@
       ext\naett
+
+      Math
+
diff --git a/ppsspp_config.h b/ppsspp_config.h
index 2861b621b333..71e7b9c9466d 100644
--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@@ -11,6 +11,7 @@
 #if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__)
     #define PPSSPP_ARCH_X86 1
     #define PPSSPP_ARCH_32BIT 1
+    #define PPSSPP_ARCH_SSE2 1
     //TODO: Remove this compat define
     #ifndef _M_IX86
     #define _M_IX86 600
@@ -19,6 +20,7 @@
 #if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__)
     #define PPSSPP_ARCH_AMD64 1
+    #define PPSSPP_ARCH_SSE2 1
    #if defined(__ILP32__)
        #define PPSSPP_ARCH_32BIT 1
    #else
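
For reference, the rejection rule that the SSE2/NEON paths of TestBoundingBoxFast implement can be sketched in scalar C++ as below. The names here (SoAPlanes4, AnyVertexInsideEachPlane) are illustrative only and are not part of the patch; the sketch assumes vertex positions have already been transformed to world space and that only the Left/Right/Top/Bottom planes are tested, as in the patch.

// Illustrative sketch only - not PPSSPP code. Plane data is stored structure-of-arrays
// (like Plane8), so a real implementation can test one vertex against four planes with a
// single SIMD multiply-add, which is what the intrinsics above do.
struct SoAPlanes4 {
	float x[4], y[4], z[4], w[4];  // plane equations: x*X + y*Y + z*Z + w >= 0 means "inside"
};

// Returns false only when some plane has every vertex on its outside - the draw can then be skipped.
// False positives (keeping an offscreen draw) are harmless; false negatives would drop visible geometry.
static bool AnyVertexInsideEachPlane(const SoAPlanes4 &p, const float *worldPositions, int count) {
	bool anyInside[4] = { false, false, false, false };
	for (int i = 0; i < count; i++) {
		const float *v = worldPositions + i * 3;
		for (int plane = 0; plane < 4; plane++) {
			float dist = p.x[plane] * v[0] + p.y[plane] * v[1] + p.z[plane] * v[2] + p.w[plane];
			if (dist >= 0.0f)
				anyInside[plane] = true;
		}
	}
	return anyInside[0] && anyInside[1] && anyInside[2] && anyInside[3];
}

The SSE2 path accumulates the same four per-plane booleans as sign bits OR-ed into one mask, which is why a final _mm_movemask_ps result of 0xF means the draw is kept.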