From cd352524006fe9656128705295c2efdb36eafa96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 6 Oct 2023 16:08:41 +0200 Subject: [PATCH 1/2] DrawEngine; Convert strip sequences in a tight loop --- GPU/Common/DrawEngineCommon.cpp | 61 +++++++++++++++++++++------------ GPU/Common/DrawEngineCommon.h | 2 +- GPU/GPUCommon.h | 23 +++++++------ GPU/GPUCommonHW.cpp | 11 +++--- 4 files changed, 59 insertions(+), 38 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index b61447a3d9bb..e135eceeed4b 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -610,7 +610,7 @@ u32 DrawEngineCommon::ComputeMiniHash() { } for (int i = 0; i < numDrawInds_; i += step) { const DeferredInds &di = drawInds_[i]; - if (di.inds) { + if (di.indexType != 0) { fullhash += ComputeMiniHashRange(di.inds, indexSize * di.vertexCount); } } @@ -638,8 +638,10 @@ uint32_t DrawEngineCommon::ComputeDrawcallsHash() const { } for (int j = 0; j < numDrawInds_; j++) { u32 dhash = dcid; - dhash = __rotl(dhash ^ (u32)(uintptr_t)drawInds_[j].inds, 19); - dcid = lowbias32_r(__rotl(dhash ^ (u32)drawInds_[j].indexType, 7)); + if (drawInds_[j].inds) { + dhash = __rotl(dhash ^ (u32)(uintptr_t)drawInds_[j].inds, 19); + dcid = lowbias32_r(__rotl(dhash ^ (u32)drawInds_[j].indexType, 7)); + } } return dcid; } @@ -678,31 +680,46 @@ uint64_t DrawEngineCommon::ComputeHash() { return fullhash; } -bool DrawEngineCommon::ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) { - if (numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { - return false; - } +int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, u32 vertTypeID, int cullMode, int *bytesRead, bool isTriangle) { + const uint32_t *start = cmd; + int prevDrawVerts = numDrawVerts_ - 1; + DeferredVerts &dv = drawVerts_[prevDrawVerts]; + int offset = dv.vertexCount; _dbg_assert_(numDrawInds_ < MAX_DEFERRED_DRAW_INDS); _dbg_assert_(numDrawVerts_ > 0); - *bytesRead = vertexCount * dec_->VertexSize(); - DeferredInds &di = drawInds_[numDrawInds_++]; - di.inds = nullptr; - di.indexType = 0; - di.prim = prim; - di.cullMode = cullMode; - di.vertexCount = vertexCount; - di.vertDecodeIndex = numDrawVerts_ - 1; + while (true) { + uint32_t data = *cmd; + if ((data & 0xFFF80000) != 0x04000000) { + break; + } + GEPrimitiveType newPrim = static_cast((data >> 16) & 7); + if (IsTrianglePrim(newPrim) != isTriangle) + break; + int vertexCount = data & 0xFFFF; + if (numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { + break; + } + DeferredInds &di = drawInds_[numDrawInds_++]; + di.indexType = 0; + di.prim = newPrim; + di.cullMode = cullMode; + di.vertexCount = vertexCount; + di.vertDecodeIndex = prevDrawVerts; + di.offset = offset; + offset += vertexCount; + cmd++; + } - DeferredVerts &dv = drawVerts_[numDrawVerts_ - 1]; - int offset = dv.vertexCount; - di.offset = offset; - dv.vertexCount += vertexCount; - dv.indexUpperBound = dv.vertexCount - 1; - vertexCountInDrawCalls_ += vertexCount; + _dbg_assert_(cmd != start); - return true; + int totalCount = offset - dv.vertexCount; + dv.vertexCount = offset; + dv.indexUpperBound = dv.vertexCount - 1; + vertexCountInDrawCalls_ += totalCount; + *bytesRead = totalCount * dec_->VertexSize(); + return cmd - start; } // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits. diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index c2c781c6b329..a40bf08c51e1 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -111,7 +111,7 @@ class DrawEngineCommon { } } - bool ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead); + int ExtendNonIndexedPrim(const uint32_t *cmd, u32 vertTypeID, int cullMode, int *bytesRead, bool isTriangle); void SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead); template void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope); diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h index 21bf2dc5eadd..012f299d335c 100644 --- a/GPU/GPUCommon.h +++ b/GPU/GPUCommon.h @@ -67,6 +67,18 @@ struct TransformedVertex { } }; +inline bool IsTrianglePrim(GEPrimitiveType prim) { + // TODO: KEEP_PREVIOUS is mistakenly treated as TRIANGLE here... This isn't new. + // + // Interesting optimization, but not confident in performance: + // static const bool p[8] = { false, false, false, true, true, true, false, true }; + // 10111000 = 0xB8; + // return (0xB8U >> (u8)prim) & 1; + + return prim > GE_PRIM_LINE_STRIP && prim != GE_PRIM_RECTANGLES; +} + + class GPUCommon : public GPUInterface, public GPUDebugInterface { public: GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw); @@ -219,17 +231,6 @@ class GPUCommon : public GPUInterface, public GPUDebugInterface { virtual void CheckRenderResized() {} - inline bool IsTrianglePrim(GEPrimitiveType prim) const { - // TODO: KEEP_PREVIOUS is mistakenly treated as TRIANGLE here... This isn't new. - // - // Interesting optimization, but not confident in performance: - // static const bool p[8] = { false, false, false, true, true, true, false, true }; - // 10111000 = 0xB8; - // return (0xB8U >> (u8)prim) & 1; - - return prim > GE_PRIM_LINE_STRIP && prim != GE_PRIM_RECTANGLES; - } - void SetDrawType(DrawType type, GEPrimitiveType prim) { if (type != lastDraw_) { // We always flush when drawing splines/beziers so no need to do so here diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index 5c9d5661ae43..369bb19b6d34 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -1026,11 +1026,14 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { // Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices // are consecutive in memory. _dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE); - if (drawEngineCommon_->ExtendNonIndexedPrim(newPrim, count, vertTypeID, cullMode, &bytesRead)) { - gstate_c.vertexAddr += bytesRead; - totalVertCount += count; - break; + int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, vertTypeID, cullMode, &bytesRead, isTriangle); + if (!commandsExecuted) { + goto bail; } + src += commandsExecuted - 1; + gstate_c.vertexAddr += bytesRead; + totalVertCount += count; + break; } // Failed, or can't extend? Do a normal submit. From c7a3e7bc3282f4b64a1a5e3c897516c0f78013b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 6 Oct 2023 16:32:59 +0200 Subject: [PATCH 2/2] Remove a redundant variable --- GPU/Common/IndexGenerator.cpp | 16 ---------------- GPU/Common/IndexGenerator.h | 4 +--- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index b361cbcbce62..679fdc259cfd 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -68,7 +68,6 @@ void IndexGenerator::AddPoints(int numVerts, int indexOffset) { *outInds++ = indexOffset + i; inds_ = outInds; // ignore overflow verts - count_ += numVerts; prim_ = GE_PRIM_POINTS; seenPrims_ |= 1 << GE_PRIM_POINTS; } @@ -84,7 +83,6 @@ void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) { } inds_ = outInds; // ignore overflow verts - count_ += numVerts; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLES; if (!clockwise) { @@ -203,8 +201,6 @@ void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) { inds_ = outInds; #endif - if (numTris > 0) - count_ += numTris * 3; // This is so we can detect one single strip by just looking at seenPrims_. if (!seenPrims_ && clockwise) { seenPrims_ = 1 << GE_PRIM_TRIANGLE_STRIP; @@ -228,7 +224,6 @@ void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) { *outInds++ = indexOffset + i + v2; } inds_ = outInds; - count_ += numTris * 3; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN; if (!clockwise) { @@ -245,7 +240,6 @@ void IndexGenerator::AddLineList(int numVerts, int indexOffset) { *outInds++ = indexOffset + i + 1; } inds_ = outInds; - count_ += numVerts; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << prim_; } @@ -258,7 +252,6 @@ void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) { *outInds++ = indexOffset + i + 1; } inds_ = outInds; - count_ += numLines * 2; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << GE_PRIM_LINE_STRIP; } @@ -272,7 +265,6 @@ void IndexGenerator::AddRectangles(int numVerts, int indexOffset) { *outInds++ = indexOffset + i + 1; } inds_ = outInds; - count_ += numVerts; prim_ = GE_PRIM_RECTANGLES; seenPrims_ |= 1 << GE_PRIM_RECTANGLES; } @@ -283,7 +275,6 @@ void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int index for (int i = 0; i < numInds; i++) *outInds++ = indexOffset + inds[i]; inds_ = outInds; - count_ += numInds; prim_ = GE_PRIM_POINTS; seenPrims_ |= (1 << GE_PRIM_POINTS) | flag; } @@ -297,7 +288,6 @@ void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int ind *outInds++ = indexOffset + inds[i + 1]; } inds_ = outInds; - count_ += numInds; prim_ = GE_PRIM_LINES; seenPrims_ |= (1 << GE_PRIM_LINES) | flag; } @@ -311,7 +301,6 @@ void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int in *outInds++ = indexOffset + inds[i + 1]; } inds_ = outInds; - count_ += numLines * 2; prim_ = GE_PRIM_LINES; seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | flag; } @@ -323,7 +312,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) { memcpy(inds_, inds, numInds * sizeof(ITypeLE)); inds_ += numInds; - count_ += numInds; } else { u16 *outInds = inds_; int numTris = numInds / 3; // Round to whole triangles @@ -337,7 +325,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf *outInds++ = indexOffset + inds[i + v2]; } inds_ = outInds; - count_ += numInds; } prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | flag; @@ -355,7 +342,6 @@ void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexO *outInds++ = indexOffset + inds[i + wind]; } inds_ = outInds; - count_ += numTris * 3; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | flag; } @@ -373,7 +359,6 @@ void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOff *outInds++ = indexOffset + inds[i + v2]; } inds_ = outInds; - count_ += numTris * 3; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | flag; } @@ -388,7 +373,6 @@ inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds *outInds++ = indexOffset + inds[i+1]; } inds_ = outInds; - count_ += numInds; prim_ = GE_PRIM_RECTANGLES; seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | flag; } diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h index b5df11aab8f5..77a49f39eb35 100644 --- a/GPU/Common/IndexGenerator.h +++ b/GPU/Common/IndexGenerator.h @@ -27,7 +27,6 @@ class IndexGenerator { void Setup(u16 *indexptr); void Reset() { prim_ = GE_PRIM_INVALID; - count_ = 0; seenPrims_ = 0; pureCount_ = 0; this->inds_ = indsBase_; @@ -61,7 +60,7 @@ class IndexGenerator { void TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise); - int VertexCount() const { return count_; } + int VertexCount() const { return inds_ - indsBase_; } int SeenPrims() const { return seenPrims_; } int PureCount() const { return pureCount_; } bool SeenOnlyPurePrims() const { @@ -110,7 +109,6 @@ class IndexGenerator { u16 *indsBase_; u16 *inds_; - int count_; int pureCount_; GEPrimitiveType prim_; int seenPrims_;