diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 663c0495fe91..b61447a3d9bb 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -72,52 +72,6 @@ VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) { return dec; } -int DrawEngineCommon::ComputeNumVertsToDecode() const { - int vertsToDecode = 0; - int numDrawCalls = numDrawCalls_; - if (drawCalls_[0].indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { - for (int i = 0; i < numDrawCalls; i++) { - const DeferredDrawCall &dc = drawCalls_[i]; - vertsToDecode += dc.vertexCount; - } - } else { - // TODO: Share this computation with DecodeVertsStep? - for (int i = 0; i < numDrawCalls; i++) { - const DeferredDrawCall &dc = drawCalls_[i]; - int lastMatch = i; - const int total = numDrawCalls; - int indexLowerBound = dc.indexLowerBound; - int indexUpperBound = dc.indexUpperBound; - for (int j = i + 1; j < total; ++j) { - if (drawCalls_[j].verts != dc.verts) - break; - - indexLowerBound = std::min(indexLowerBound, (int)drawCalls_[j].indexLowerBound); - indexUpperBound = std::max(indexUpperBound, (int)drawCalls_[j].indexUpperBound); - lastMatch = j; - } - vertsToDecode += indexUpperBound - indexLowerBound + 1; - i = lastMatch; - } - } - return vertsToDecode; -} - -void DrawEngineCommon::DecodeVerts(u8 *dest) { - int decodeCounter = decodeCounter_; - for (; decodeCounter < numDrawCalls_; decodeCounter++) { - DecodeVertsStep(dest, decodeCounter, decodedVerts_, &drawCalls_[decodeCounter].uvScale); // NOTE! DecodeVertsStep can modify the decodeCounter parameter! - } - decodeCounter_ = decodeCounter; - - // Sanity check - if (indexGen.Prim() < 0) { - ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); - // Force to points (0) - indexGen.AddPrim(GE_PRIM_POINTS, 0, true); - } -} - std::vector DrawEngineCommon::DebugGetVertexLoaderIDs() { std::vector ids; decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) { @@ -619,94 +573,6 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) { gstate_c.Dirty(DIRTY_SHADERBLEND); } -void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale) { - PROFILE_THIS_SCOPE("vertdec"); - - const DeferredDrawCall &dc = drawCalls_[i]; - - indexGen.SetIndex(decodedVerts); - int indexLowerBound = dc.indexLowerBound; - int indexUpperBound = dc.indexUpperBound; - - if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { - // Decode the verts (and at the same time apply morphing/skinning). Simple. - dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, uvScale, indexLowerBound, indexUpperBound); - decodedVerts += indexUpperBound - indexLowerBound + 1; - - bool clockwise = true; - if (gstate.isCullEnabled() && gstate.getCullMode() != dc.cullMode) { - clockwise = false; - } - indexGen.AddPrim(dc.prim, dc.vertexCount, clockwise); - } else { - // It's fairly common that games issue long sequences of PRIM calls, with differing - // inds pointer but the same base vertex pointer. We'd like to reuse vertices between - // these as much as possible, so we make sure here to combine as many as possible - // into one nice big drawcall, sharing data. - - // 1. Look ahead to find the max index, only looking as "matching" drawcalls. - // Expand the lower and upper bounds as we go. - int lastMatch = i; - const int total = numDrawCalls_; - for (int j = i + 1; j < total; ++j) { - if (drawCalls_[j].verts != dc.verts) - break; - // TODO: What if UV scale/offset changes between drawcalls here? - indexLowerBound = std::min(indexLowerBound, (int)drawCalls_[j].indexLowerBound); - indexUpperBound = std::max(indexUpperBound, (int)drawCalls_[j].indexUpperBound); - lastMatch = j; - } - - // 2. Loop through the drawcalls, translating indices as we go. - switch (dc.indexType) { - case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: - for (int j = i; j <= lastMatch; j++) { - bool clockwise = true; - if (gstate.isCullEnabled() && gstate.getCullMode() != drawCalls_[j].cullMode) { - clockwise = false; - } - indexGen.TranslatePrim(drawCalls_[j].prim, drawCalls_[j].vertexCount, (const u8 *)drawCalls_[j].inds, indexLowerBound, clockwise); - } - break; - case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: - for (int j = i; j <= lastMatch; j++) { - bool clockwise = true; - if (gstate.isCullEnabled() && gstate.getCullMode() != drawCalls_[j].cullMode) { - clockwise = false; - } - indexGen.TranslatePrim(drawCalls_[j].prim, drawCalls_[j].vertexCount, (const u16_le *)drawCalls_[j].inds, indexLowerBound, clockwise); - } - break; - case GE_VTYPE_IDX_32BIT >> GE_VTYPE_IDX_SHIFT: - for (int j = i; j <= lastMatch; j++) { - bool clockwise = true; - if (gstate.isCullEnabled() && gstate.getCullMode() != drawCalls_[j].cullMode) { - clockwise = false; - } - indexGen.TranslatePrim(drawCalls_[j].prim, drawCalls_[j].vertexCount, (const u32_le *)drawCalls_[j].inds, indexLowerBound, clockwise); - } - break; - } - - const int vertexCount = indexUpperBound - indexLowerBound + 1; - - // This check is a workaround for Pangya Fantasy Golf, which sends bogus index data when switching items in "My Room" sometimes. - if (decodedVerts + vertexCount > VERTEX_BUFFER_MAX) { - return; - } - - // 3. Decode that range of vertex data. - dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride, - dc.verts, uvScale, indexLowerBound, indexUpperBound); - decodedVerts += vertexCount; - - // 4. Advance indexgen vertex counter. - indexGen.Advance(vertexCount); - i = lastMatch; - } -} - inline u32 ComputeMiniHashRange(const void *ptr, size_t sz) { // Switch to u32 units, and round up to avoid unaligned accesses. // Probably doesn't matter if we skip the first few bytes in some cases. @@ -731,69 +597,121 @@ u32 DrawEngineCommon::ComputeMiniHash() { const int indexSize = IndexSize(dec_->VertexType()); int step; - if (numDrawCalls_ < 3) { + if (numDrawVerts_ < 3) { step = 1; - } else if (numDrawCalls_ < 8) { + } else if (numDrawVerts_ < 8) { step = 4; } else { - step = numDrawCalls_ / 8; + step = numDrawVerts_ / 8; } - for (int i = 0; i < numDrawCalls_; i += step) { - const DeferredDrawCall &dc = drawCalls_[i]; - if (!dc.inds) { - fullhash += ComputeMiniHashRange(dc.verts, vertexSize * dc.vertexCount); - } else { - int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; - fullhash += ComputeMiniHashRange((const u8 *)dc.verts + vertexSize * indexLowerBound, vertexSize * (indexUpperBound - indexLowerBound)); - fullhash += ComputeMiniHashRange(dc.inds, indexSize * dc.vertexCount); + for (int i = 0; i < numDrawVerts_; i += step) { + const DeferredVerts &dc = drawVerts_[i]; + fullhash += ComputeMiniHashRange((const u8 *)dc.verts + vertexSize * dc.indexLowerBound, vertexSize * (dc.indexUpperBound - dc.indexLowerBound)); + } + for (int i = 0; i < numDrawInds_; i += step) { + const DeferredInds &di = drawInds_[i]; + if (di.inds) { + fullhash += ComputeMiniHashRange(di.inds, indexSize * di.vertexCount); } } return fullhash; } +// Cheap bit scrambler from https://nullprogram.com/blog/2018/07/31/ +inline uint32_t lowbias32_r(uint32_t x) { + x ^= x >> 16; + x *= 0x43021123U; + x ^= x >> 15 ^ x >> 30; + x *= 0x1d69e2a5U; + x ^= x >> 16; + return x; +} + +uint32_t DrawEngineCommon::ComputeDrawcallsHash() const { + uint32_t dcid = 0; + for (int i = 0; i < numDrawVerts_; i++) { + u32 dhash = dcid; + dhash = __rotl(dhash ^ (u32)(uintptr_t)drawVerts_[i].verts, 13); + dhash = __rotl(dhash ^ (u32)drawInds_[i].vertexCount, 11); + dcid = lowbias32_r(dhash ^ (u32)drawInds_[i].prim); + } + for (int j = 0; j < numDrawInds_; j++) { + u32 dhash = dcid; + dhash = __rotl(dhash ^ (u32)(uintptr_t)drawInds_[j].inds, 19); + dcid = lowbias32_r(__rotl(dhash ^ (u32)drawInds_[j].indexType, 7)); + } + return dcid; +} + +int DrawEngineCommon::ComputeNumVertsToDecode() const { + int sum = 0; + for (int i = 0; i < numDrawVerts_; i++) { + sum += drawVerts_[i].indexUpperBound + 1 - drawVerts_[i].indexLowerBound; + } + return sum; +} + uint64_t DrawEngineCommon::ComputeHash() { uint64_t fullhash = 0; const int vertexSize = dec_->GetDecVtxFmt().stride; - const int indexSize = IndexSize(dec_->VertexType()); // TODO: Add some caps both for numDrawCalls_ and num verts to check? // It is really very expensive to check all the vertex data so often. - for (int i = 0; i < numDrawCalls_; i++) { - const DeferredDrawCall &dc = drawCalls_[i]; - if (!dc.inds) { - fullhash += XXH3_64bits((const char *)dc.verts, vertexSize * dc.vertexCount); - } else { - int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; - int j = i + 1; - int lastMatch = i; - while (j < numDrawCalls_) { - if (drawCalls_[j].verts != dc.verts) - break; - indexLowerBound = std::min(indexLowerBound, (int)dc.indexLowerBound); - indexUpperBound = std::max(indexUpperBound, (int)dc.indexUpperBound); - lastMatch = j; - j++; - } - // This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way - // we do when drawing. - fullhash += XXH3_64bits((const char *)dc.verts + vertexSize * indexLowerBound, - vertexSize * (indexUpperBound - indexLowerBound)); + for (int i = 0; i < numDrawVerts_; i++) { + const DeferredVerts &dv = drawVerts_[i]; + int indexLowerBound = dv.indexLowerBound, indexUpperBound = dv.indexUpperBound; + fullhash += XXH3_64bits((const char *)dv.verts + vertexSize * indexLowerBound, vertexSize * (indexUpperBound - indexLowerBound)); + } + + for (int i = 0; i < numDrawInds_; i++) { + const DeferredInds &di = drawInds_[i]; + if (di.indexType != 0) { + int indexSize = IndexSize(di.indexType << GE_VTYPE_IDX_SHIFT); // Hm, we will miss some indices when combining above, but meh, it should be fine. - fullhash += XXH3_64bits((const char *)dc.inds, indexSize * dc.vertexCount); - i = lastMatch; + fullhash += XXH3_64bits((const char *)di.inds, indexSize * di.vertexCount); } } - fullhash += XXH3_64bits(&drawCalls_[0].uvScale, sizeof(drawCalls_[0].uvScale) * numDrawCalls_); + // this looks utterly broken?? + // fullhash += XXH3_64bits(&drawCalls_[0].uvScale, sizeof(drawCalls_[0].uvScale) * numDrawCalls_); return fullhash; } +bool DrawEngineCommon::ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) { + if (numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { + return false; + } + + _dbg_assert_(numDrawInds_ < MAX_DEFERRED_DRAW_INDS); + _dbg_assert_(numDrawVerts_ > 0); + *bytesRead = vertexCount * dec_->VertexSize(); + + DeferredInds &di = drawInds_[numDrawInds_++]; + di.inds = nullptr; + di.indexType = 0; + di.prim = prim; + di.cullMode = cullMode; + di.vertexCount = vertexCount; + di.vertDecodeIndex = numDrawVerts_ - 1; + + DeferredVerts &dv = drawVerts_[numDrawVerts_ - 1]; + int offset = dv.vertexCount; + di.offset = offset; + dv.vertexCount += vertexCount; + dv.indexUpperBound = dv.vertexCount - 1; + vertexCountInDrawCalls_ += vertexCount; + + return true; +} + // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits. void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) { - if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls_ >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { + if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) { DispatchFlush(); } + _dbg_assert_(numDrawVerts_ < MAX_DEFERRED_DRAW_VERTS); + _dbg_assert_(numDrawInds_ < MAX_DEFERRED_DRAW_INDS); // This isn't exactly right, if we flushed, since prims can straddle previous calls. // But it generally works for common usage. @@ -818,29 +736,47 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti if (vertexCount < 3 && ((vertexCount < 2 && prim > 0) || (prim > GE_PRIM_LINE_STRIP && prim != GE_PRIM_RECTANGLES))) return; - DeferredDrawCall &dc = drawCalls_[numDrawCalls_]; - dc.verts = verts; - dc.inds = inds; - dc.vertexCount = vertexCount; - dc.indexType = (vertTypeID & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT; - dc.prim = prim; - dc.cullMode = cullMode; - dc.uvScale = gstate_c.uv; - if (inds) { - GetIndexBounds(inds, vertexCount, vertTypeID, &dc.indexLowerBound, &dc.indexUpperBound); + bool applySkin = (vertTypeID & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode; + + DeferredInds &di = drawInds_[numDrawInds_++]; + di.inds = inds; + di.indexType = (vertTypeID & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT; + di.prim = prim; + di.cullMode = cullMode; + di.vertexCount = vertexCount; + di.vertDecodeIndex = numDrawVerts_; + di.offset = 0; + + _dbg_assert_(numDrawVerts_ <= MAX_DEFERRED_DRAW_VERTS); + _dbg_assert_(numDrawInds_ <= MAX_DEFERRED_DRAW_INDS); + + if (inds && numDrawVerts_ > decodeVertsCounter_ && drawVerts_[numDrawVerts_ - 1].verts == verts && !applySkin) { + // Same vertex pointer as a previous un-decoded draw call - let's just extend the decode! + di.vertDecodeIndex = numDrawVerts_ - 1; + DeferredVerts &dv = drawVerts_[numDrawVerts_ - 1]; + u16 lb; + u16 ub; + GetIndexBounds(inds, vertexCount, vertTypeID, &lb, &ub); + if (lb < dv.indexLowerBound) + dv.indexLowerBound = lb; + if (ub > dv.indexUpperBound) + dv.indexUpperBound = ub; } else { - dc.indexLowerBound = 0; - dc.indexUpperBound = vertexCount - 1; + // Record a new draw, and a new index gen. + DeferredVerts &dv = drawVerts_[numDrawVerts_++]; + dv.verts = verts; + dv.vertexCount = vertexCount; + dv.uvScale = gstate_c.uv; + if (inds) { + GetIndexBounds(inds, vertexCount, vertTypeID, &dv.indexLowerBound, &dv.indexUpperBound); + } else { + dv.indexLowerBound = 0; + dv.indexUpperBound = vertexCount - 1; + } } - numDrawCalls_++; vertexCountInDrawCalls_ += vertexCount; - if ((vertTypeID & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode) { - DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_, &dc.uvScale); - decodeCounter_++; - } - if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) { // This prevents issues with consecutive self-renders in Ridge Racer. gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); @@ -848,6 +784,60 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti } } +void DrawEngineCommon::DecodeVerts(u8 *dest) { + int i = decodeVertsCounter_; + int stride = (int)dec_->GetDecVtxFmt().stride; + for (; i < numDrawVerts_; i++) { + DeferredVerts &dv = drawVerts_[i]; + + int indexLowerBound = dv.indexLowerBound; + drawVertexOffsets_[i] = decodedVerts_ - indexLowerBound; + + int indexUpperBound = dv.indexUpperBound; + // Decode the verts (and at the same time apply morphing/skinning). Simple. + dec_->DecodeVerts(dest + decodedVerts_ * stride, dv.verts, &dv.uvScale, indexLowerBound, indexUpperBound); + decodedVerts_ += indexUpperBound - indexLowerBound + 1; + } + decodeVertsCounter_ = i; +} + +void DrawEngineCommon::DecodeInds() { + int i = decodeIndsCounter_; + for (; i < numDrawInds_; i++) { + const DeferredInds &di = drawInds_[i]; + + int indexOffset = drawVertexOffsets_[di.vertDecodeIndex] + di.offset; + bool clockwise = true; + if (gstate.isCullEnabled() && gstate.getCullMode() != di.cullMode) { + clockwise = false; + } + // We've already collapsed subsequent draws with the same vertex pointer, so no tricky logic here anymore. + // 2. Loop through the drawcalls, translating indices as we go. + switch (di.indexType) { + case GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT: + indexGen.AddPrim(di.prim, di.vertexCount, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u8 *)di.inds, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u16_le *)di.inds, indexOffset, clockwise); + break; + case GE_VTYPE_IDX_32BIT >> GE_VTYPE_IDX_SHIFT: + indexGen.TranslatePrim(di.prim, di.vertexCount, (const u32_le *)di.inds, indexOffset, clockwise); + break; + } + } + decodeIndsCounter_ = i; + + // Sanity check + if (indexGen.Prim() < 0) { + ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); + // Force to points (0) + indexGen.AddPrim(GE_PRIM_POINTS, 0, 0, true); + } +} + bool DrawEngineCommon::CanUseHardwareTransform(int prim) { if (!useHWTransform_) return false; @@ -861,29 +851,6 @@ bool DrawEngineCommon::CanUseHardwareTessellation(GEPatchPrimType prim) { return false; } -// Cheap bit scrambler from https://nullprogram.com/blog/2018/07/31/ -inline uint32_t lowbias32_r(uint32_t x) { - x ^= x >> 16; - x *= 0x43021123U; - x ^= x >> 15 ^ x >> 30; - x *= 0x1d69e2a5U; - x ^= x >> 16; - return x; -} - -uint32_t DrawEngineCommon::ComputeDrawcallsHash() const { - uint32_t dcid = 0; - for (int i = 0; i < numDrawCalls_; i++) { - u32 dhash = dcid; - dhash = __rotl(dhash ^ (u32)(uintptr_t)drawCalls_[i].verts, 13); - dhash = __rotl(dhash ^ (u32)(uintptr_t)drawCalls_[i].inds, 19); - dhash = __rotl(dhash ^ (u32)drawCalls_[i].indexType, 7); - dhash = __rotl(dhash ^ (u32)drawCalls_[i].vertexCount, 11); - dcid = lowbias32_r(dhash ^ (u32)drawCalls_[i].prim); - } - return dcid; -} - void TessellationDataTransfer::CopyControlPoints(float *pos, float *tex, float *col, int posStride, int texStride, int colStride, const SimpleVertex *const *points, int size, u32 vertType) { bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0; bool hasTexCoord = (vertType & GE_VTYPE_TC_MASK) != 0; diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 21c0476a4400..3e9228bbfd8c 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -104,6 +104,14 @@ class DrawEngineCommon { bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType); + void FlushSkin() { + bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode; + if (applySkin) { + DecodeVerts(decoded_); + } + } + + bool ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead); void SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead); template void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope); @@ -130,7 +138,7 @@ class DrawEngineCommon { return false; } int GetNumDrawCalls() const { - return numDrawCalls_; + return numDrawVerts_; } VertexDecoder *GetVertexDecoder(u32 vtype); @@ -141,8 +149,12 @@ class DrawEngineCommon { virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; } void UpdatePlanes(); - int ComputeNumVertsToDecode() const; void DecodeVerts(u8 *dest); + void DecodeInds(); + + int MaxIndex() const { + return decodedVerts_; + } // Preprocessing for spline/bezier u32 NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, int lowerBound, int upperBound, u32 vertType, int *vertexSize = nullptr); @@ -151,8 +163,7 @@ class DrawEngineCommon { u32 ComputeMiniHash(); uint64_t ComputeHash(); - // Vertex decoding - void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale); + int ComputeNumVertsToDecode() const; void ApplyFramebufferRead(FBOTexState *fboTexState); @@ -210,25 +221,37 @@ class DrawEngineCommon { TransformedVertex *transformedExpanded_ = nullptr; // Defer all vertex decoding to a "Flush" (except when software skinning) - struct DeferredDrawCall { + struct DeferredVerts { const void *verts; + u32 vertexCount; + u16 indexLowerBound; + u16 indexUpperBound; + UVScale uvScale; + }; + + struct DeferredInds { const void *inds; u32 vertexCount; + u8 vertDecodeIndex; // index into the drawVerts_ array to look up the vertexOffset. u8 indexType; s8 prim; u8 cullMode; - u16 indexLowerBound; - u16 indexUpperBound; - UVScale uvScale; + u16 offset; }; - enum { MAX_DEFERRED_DRAW_CALLS = 128 }; - DeferredDrawCall drawCalls_[MAX_DEFERRED_DRAW_CALLS]; - int numDrawCalls_ = 0; + enum { MAX_DEFERRED_DRAW_VERTS = 128 }; // If you change this to more than 256, change type of DeferredInds::vertDecodeIndex. + enum { MAX_DEFERRED_DRAW_INDS = 512 }; // Monster Hunter spams indexed calls that we end up merging. + DeferredVerts drawVerts_[MAX_DEFERRED_DRAW_VERTS]; + uint32_t drawVertexOffsets_[MAX_DEFERRED_DRAW_VERTS]; + DeferredInds drawInds_[MAX_DEFERRED_DRAW_INDS]; + + int numDrawVerts_ = 0; + int numDrawInds_ = 0; int vertexCountInDrawCalls_ = 0; int decimationCounter_ = 0; - int decodeCounter_ = 0; + int decodeVertsCounter_ = 0; + int decodeIndsCounter_ = 0; // Vertex collector state IndexGenerator indexGen; diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp index d1635e807d69..b361cbcbce62 100644 --- a/GPU/Common/IndexGenerator.cpp +++ b/GPU/Common/IndexGenerator.cpp @@ -50,44 +50,40 @@ void IndexGenerator::Setup(u16 *inds) { Reset(); } -void IndexGenerator::AddPrim(int prim, int vertexCount, bool clockwise) { +void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) { switch (prim) { - case GE_PRIM_POINTS: AddPoints(vertexCount); break; - case GE_PRIM_LINES: AddLineList(vertexCount); break; - case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount); break; - case GE_PRIM_TRIANGLES: AddList(vertexCount, clockwise); break; - case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, clockwise); break; - case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, clockwise); break; - case GE_PRIM_RECTANGLES: AddRectangles(vertexCount); break; // Same + case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break; + case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break; + case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break; + case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break; + case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same } } -void IndexGenerator::AddPoints(int numVerts) { +void IndexGenerator::AddPoints(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numVerts; i++) - *outInds++ = startIndex + i; + *outInds++ = indexOffset + i; inds_ = outInds; // ignore overflow verts - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_POINTS; seenPrims_ |= 1 << GE_PRIM_POINTS; } -void IndexGenerator::AddList(int numVerts, bool clockwise) { +void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) { u16 *outInds = inds_; - const int startIndex = index_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numVerts; i += 3) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + v1; - *outInds++ = startIndex + i + v2; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + v1; + *outInds++ = indexOffset + i + v2; } inds_ = outInds; // ignore overflow verts - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLES; @@ -119,7 +115,7 @@ alignas(16) static const uint16_t offsets_counter_clockwise[24] = { 7, (u16)(7 + 1), (u16)(7 + 2), }; -void IndexGenerator::AddStrip(int numVerts, bool clockwise) { +void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) { int numTris = numVerts - 2; #ifdef _M_SSE // In an SSE2 register we can fit 8 16-bit integers. @@ -130,7 +126,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // We allow ourselves to write some extra indices to avoid the fallback loop. // That's alright as we're appending to a buffer - they will get overwritten anyway. int numChunks = (numTris + 7) >> 3; - __m128i ibase8 = _mm_set1_epi16(index_); + __m128i ibase8 = _mm_set1_epi16(indexOffset); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); __m128i *dst = (__m128i *)inds_; __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); @@ -158,7 +154,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { // wind doesn't need to be updated, an even number of triangles have been drawn. #elif PPSSPP_ARCH(ARM_NEON) int numChunks = (numTris + 7) >> 3; - uint16x8_t ibase8 = vdupq_n_u16(index_); + uint16x8_t ibase8 = vdupq_n_u16(indexOffset); const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise; u16 *dst = inds_; uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets)); @@ -185,7 +181,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { #else // Slow fallback loop. int wind = clockwise ? 1 : 2; - int ibase = index_; + int ibase = indexOffset; size_t numPairs = numTris / 2; u16 *outInds = inds_; while (numPairs > 0) { @@ -207,7 +203,6 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { inds_ = outInds; #endif - index_ += numVerts; if (numTris > 0) count_ += numTris * 3; // This is so we can detect one single strip by just looking at seenPrims_. @@ -222,19 +217,17 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) { } } -void IndexGenerator::AddFan(int numVerts, bool clockwise) { +void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) { const int numTris = numVerts - 2; u16 *outInds = inds_; - const int startIndex = index_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numTris; i++) { - *outInds++ = startIndex; - *outInds++ = startIndex + i + v1; - *outInds++ = startIndex + i + v2; + *outInds++ = indexOffset; + *outInds++ = indexOffset + i + v1; + *outInds++ = indexOffset + i + v2; } inds_ = outInds; - index_ += numVerts; count_ += numTris * 3; prim_ = GE_PRIM_TRIANGLES; seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN; @@ -245,46 +238,40 @@ void IndexGenerator::AddFan(int numVerts, bool clockwise) { } //Lines -void IndexGenerator::AddLineList(int numVerts) { +void IndexGenerator::AddLineList(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numVerts; i += 2) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << prim_; } -void IndexGenerator::AddLineStrip(int numVerts) { +void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) { const int numLines = numVerts - 1; u16 *outInds = inds_; - const int startIndex = index_; for (int i = 0; i < numLines; i++) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numLines * 2; prim_ = GE_PRIM_LINES; seenPrims_ |= 1 << GE_PRIM_LINE_STRIP; } -void IndexGenerator::AddRectangles(int numVerts) { +void IndexGenerator::AddRectangles(int numVerts, int indexOffset) { u16 *outInds = inds_; - const int startIndex = index_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numVerts = numVerts & ~1; for (int i = 0; i < numVerts; i += 2) { - *outInds++ = startIndex + i; - *outInds++ = startIndex + i + 1; + *outInds++ = indexOffset + i; + *outInds++ = indexOffset + i + 1; } inds_ = outInds; - index_ += numVerts; count_ += numVerts; prim_ = GE_PRIM_RECTANGLES; seenPrims_ |= 1 << GE_PRIM_RECTANGLES; @@ -292,7 +279,6 @@ void IndexGenerator::AddRectangles(int numVerts) { template void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; for (int i = 0; i < numInds; i++) *outInds++ = indexOffset + inds[i]; @@ -304,7 +290,6 @@ void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int index template void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; numInds = numInds & ~1; for (int i = 0; i < numInds; i += 2) { @@ -319,7 +304,6 @@ void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int ind template void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; int numLines = numInds - 1; u16 *outInds = inds_; for (int i = 0; i < numLines; i++) { @@ -334,7 +318,6 @@ void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int in template void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { - indexOffset = index_ - indexOffset; // We only bother doing this minor optimization in triangle list, since it's by far the most // common operation that can benefit. if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) { @@ -347,6 +330,7 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf numInds = numTris * 3; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; + // TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise. for (int i = 0; i < numInds; i += 3) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + v1]; @@ -362,7 +346,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf template void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { int wind = clockwise ? 1 : 2; - indexOffset = index_ - indexOffset; int numTris = numInds - 2; u16 *outInds = inds_; for (int i = 0; i < numTris; i++) { @@ -380,7 +363,6 @@ void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexO template void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { if (numInds <= 0) return; - indexOffset = index_ - indexOffset; int numTris = numInds - 2; u16 *outInds = inds_; const int v1 = clockwise ? 1 : 2; @@ -398,7 +380,6 @@ void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOff template inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) { - indexOffset = index_ - indexOffset; u16 *outInds = inds_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numInds = numInds & ~1; diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h index e8c2578409bb..b5df11aab8f5 100644 --- a/GPU/Common/IndexGenerator.h +++ b/GPU/Common/IndexGenerator.h @@ -28,7 +28,6 @@ class IndexGenerator { void Reset() { prim_ = GE_PRIM_INVALID; count_ = 0; - index_ = 0; seenPrims_ = 0; pureCount_ = 0; this->inds_ = indsBase_; @@ -57,19 +56,12 @@ class IndexGenerator { } } - void AddPrim(int prim, int vertexCount, bool clockwise); + void AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise); void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise); - void Advance(int numVerts) { - index_ += numVerts; - } - - void SetIndex(int ind) { index_ = ind; } - int MaxIndex() const { return index_; } // Really NextIndex rather than MaxIndex, it's one more than the highest index generated int VertexCount() const { return count_; } - bool Empty() const { return index_ == 0; } int SeenPrims() const { return seenPrims_; } int PureCount() const { return pureCount_; } bool SeenOnlyPurePrims() const { @@ -81,16 +73,16 @@ class IndexGenerator { private: // Points (why index these? code simplicity) - void AddPoints(int numVerts); + void AddPoints(int numVerts, int indexOffset); // Triangles - void AddList(int numVerts, bool clockwise); - void AddStrip(int numVerts, bool clockwise); - void AddFan(int numVerts, bool clockwise); + void AddList(int numVerts, int indexOffset, bool clockwise); + void AddStrip(int numVerts, int indexOffset, bool clockwise); + void AddFan(int numVerts, int indexOffset, bool clockwise); // Lines - void AddLineList(int numVerts); - void AddLineStrip(int numVerts); + void AddLineList(int numVerts, int indexOffset); + void AddLineStrip(int numVerts, int indexOffset); // Rectangles - void AddRectangles(int numVerts); + void AddRectangles(int numVerts, int indexOffset); // These translate already indexed lists template @@ -118,7 +110,6 @@ class IndexGenerator { u16 *indsBase_; u16 *inds_; - int index_; int count_; int pureCount_; GEPrimitiveType prim_; diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index c31a5f1d581b..ec1af7c80e7d 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -1293,6 +1293,9 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, } void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const { + // A single 0 is acceptable for point lists. + _dbg_assert_(indexLowerBound <= indexUpperBound); + // Decode the vertices within the found bounds, once each // decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed. const u8 *startPtr = (const u8*)verts + indexLowerBound * size; diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp index 7780bfa28a26..0deb056ef45b 100644 --- a/GPU/D3D11/DrawEngineD3D11.cpp +++ b/GPU/D3D11/DrawEngineD3D11.cpp @@ -366,7 +366,7 @@ void DrawEngineD3D11::DoFlush() { if (useCache) { // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + u32 dcid = ComputeDrawcallsHash() ^ gstate.getUVGenMode(); VertexArrayInfoD3D11 *vai; if (!vai_.Get(dcid, &vai)) { @@ -384,9 +384,10 @@ void DrawEngineD3D11::DoFlush() { vai->status = VertexArrayInfoD3D11::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(decoded_); // writes to indexGen + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; } @@ -409,6 +410,7 @@ void DrawEngineD3D11::DoFlush() { if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } if (vai->numVerts > 64) { @@ -428,15 +430,17 @@ void DrawEngineD3D11::DoFlush() { if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } if (vai->vbo == 0) { DecodeVerts(decoded_); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; if (!useElements && indexGen.PureCount()) { @@ -446,7 +450,7 @@ void DrawEngineD3D11::DoFlush() { _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); // TODO: Combine these two into one buffer? - u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); + u32 size = dec_->GetDecVtxFmt().stride * MaxIndex(); D3D11_BUFFER_DESC desc{ size, D3D11_USAGE_IMMUTABLE, D3D11_BIND_VERTEX_BUFFER, 0 }; D3D11_SUBRESOURCE_DATA data{ decoded_ }; ASSERT_SUCCESS(device_->CreateBuffer(&desc, &data, &vai->vbo)); @@ -500,6 +504,7 @@ void DrawEngineD3D11::DoFlush() { vai->numFrames++; } DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } @@ -507,11 +512,12 @@ void DrawEngineD3D11::DoFlush() { vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(decoded_); + DecodeInds(); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; vertexCount = indexGen.VertexCount(); - maxIndex = indexGen.MaxIndex(); + maxIndex = MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } @@ -584,6 +590,7 @@ void DrawEngineD3D11::DoFlush() { dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -622,7 +629,7 @@ void DrawEngineD3D11::DoFlush() { UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); const Lin::Vec3 trans(gstate_c.vpXOffset, -gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f); @@ -719,14 +726,17 @@ void DrawEngineD3D11::DoFlush() { } gpuStats.numFlushes++; - gpuStats.numDrawCalls += numDrawCalls_; + gpuStats.numDrawCalls += numDrawInds_; + gpuStats.numVertexDecodes += numDrawVerts_; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; vertexCountInDrawCalls_ = 0; - decodeCounter_ = 0; + decodeVertsCounter_ = 0; + decodeIndsCounter_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/D3D11/DrawEngineD3D11.h b/GPU/D3D11/DrawEngineD3D11.h index 763588e310f7..2b1a9fbf7988 100644 --- a/GPU/D3D11/DrawEngineD3D11.h +++ b/GPU/D3D11/DrawEngineD3D11.h @@ -138,19 +138,19 @@ class DrawEngineD3D11 : public DrawEngineCommon { // So that this can be inlined void Flush() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DoFlush(); } void FinishDeferred() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DecodeVerts(decoded_); } void DispatchFlush() override { - if (!numDrawCalls_) + if (!numDrawVerts_) return; Flush(); } diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp index 9efa233dd0b8..276fde10a3dc 100644 --- a/GPU/Directx9/DrawEngineDX9.cpp +++ b/GPU/Directx9/DrawEngineDX9.cpp @@ -345,7 +345,7 @@ void DrawEngineDX9::DoFlush() { if (useCache) { // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 - u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); + u32 dcid = ComputeDrawcallsHash() ^ gstate.getUVGenMode(); VertexArrayInfoDX9 *vai; if (!vai_.Get(dcid, &vai)) { vai = new VertexArrayInfoDX9(); @@ -362,9 +362,10 @@ void DrawEngineDX9::DoFlush() { vai->status = VertexArrayInfoDX9::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(decoded_); // writes to indexGen + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; @@ -388,6 +389,7 @@ void DrawEngineDX9::DoFlush() { if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } if (vai->numVerts > 64) { @@ -407,15 +409,17 @@ void DrawEngineDX9::DoFlush() { if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } if (vai->vbo == 0) { DecodeVerts(decoded_); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; useElements = !indexGen.SeenOnlyPurePrims(); if (!useElements && indexGen.PureCount()) { @@ -425,7 +429,7 @@ void DrawEngineDX9::DoFlush() { _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); void * pVb; - u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); + u32 size = dec_->GetDecVtxFmt().stride * MaxIndex(); device_->CreateVertexBuffer(size, D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &vai->vbo, NULL); vai->vbo->Lock(0, size, &pVb, 0); memcpy(pVb, decoded_, size); @@ -482,6 +486,7 @@ void DrawEngineDX9::DoFlush() { vai->numFrames++; } DecodeVerts(decoded_); + DecodeInds(); goto rotateVBO; } } @@ -489,17 +494,20 @@ void DrawEngineDX9::DoFlush() { vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(decoded_); + DecodeInds(); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims(); vertexCount = indexGen.VertexCount(); - maxIndex = indexGen.MaxIndex(); + maxIndex = MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } prim = indexGen.Prim(); } + _dbg_assert_((int)prim > 0); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -544,6 +552,7 @@ void DrawEngineDX9::DoFlush() { dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -582,7 +591,7 @@ void DrawEngineDX9::DoFlush() { UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); // Half pixel offset hack. @@ -658,14 +667,19 @@ void DrawEngineDX9::DoFlush() { } gpuStats.numFlushes++; - gpuStats.numDrawCalls += numDrawCalls_; + gpuStats.numDrawCalls += numDrawInds_; + gpuStats.numVertexDecodes += numDrawVerts_; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; + // TODO: The below should be shared. + indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; vertexCountInDrawCalls_ = 0; - decodeCounter_ = 0; + decodeVertsCounter_ = 0; + decodeIndsCounter_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/Directx9/DrawEngineDX9.h b/GPU/Directx9/DrawEngineDX9.h index 9ce7b11ec854..a527523b7158 100644 --- a/GPU/Directx9/DrawEngineDX9.h +++ b/GPU/Directx9/DrawEngineDX9.h @@ -128,19 +128,19 @@ class DrawEngineDX9 : public DrawEngineCommon { // So that this can be inlined void Flush() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DoFlush(); } void FinishDeferred() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DecodeVerts(decoded_); } void DispatchFlush() override { - if (!numDrawCalls_) + if (!numDrawVerts_) return; Flush(); } diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index d1b957ac548a..9949d0c37bb7 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -245,9 +245,11 @@ void DrawEngineGLES::DoFlush() { // can't goto bail here, skips too many variable initializations. So let's wipe the most important stuff. indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; vertexCountInDrawCalls_ = 0; - decodeCounter_ = 0; + decodeVertsCounter_ = 0; + decodeIndsCounter_ = 0; return; } @@ -284,9 +286,9 @@ void DrawEngineGLES::DoFlush() { // Figure out how much pushbuffer space we need to allocate. int vertsToDecode = ComputeNumVertsToDecode(); u8 *dest = (u8 *)frameData.pushVertex->Allocate(vertsToDecode * dec_->GetDecVtxFmt().stride, 4, &vertexBuffer, &vertexBufferOffset); - // Indices are decoded in here. DecodeVerts(dest); } + DecodeInds(); gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); @@ -343,6 +345,7 @@ void DrawEngineGLES::DoFlush() { dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { @@ -381,7 +384,7 @@ void DrawEngineGLES::DoFlush() { UpdateCachedViewportState(vpAndScissor_); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); int vertexCount = indexGen.VertexCount(); // TODO: Split up into multiple draw calls for GLES 2.0 where you can't guarantee support for more than 0x10000 verts. @@ -471,7 +474,8 @@ void DrawEngineGLES::DoFlush() { bail: gpuStats.numFlushes++; - gpuStats.numDrawCalls += numDrawCalls_; + gpuStats.numDrawCalls += numDrawInds_; + gpuStats.numVertexDecodes += numDrawVerts_; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; // TODO: When the next flush has the same vertex format, we can continue with the same offset in the vertex buffer, @@ -479,9 +483,11 @@ void DrawEngineGLES::DoFlush() { // wanted to avoid rebinding the vertex input every time). indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; vertexCountInDrawCalls_ = 0; - decodeCounter_ = 0; + decodeVertsCounter_ = 0; + decodeIndsCounter_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); diff --git a/GPU/GLES/DrawEngineGLES.h b/GPU/GLES/DrawEngineGLES.h index 8a11cbb5ea78..e61951c1f10c 100644 --- a/GPU/GLES/DrawEngineGLES.h +++ b/GPU/GLES/DrawEngineGLES.h @@ -86,19 +86,19 @@ class DrawEngineGLES : public DrawEngineCommon { // So that this can be inlined void Flush() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DoFlush(); } void FinishDeferred() { - if (!numDrawCalls_) + if (!numDrawVerts_) return; DoFlush(); } void DispatchFlush() override { - if (!numDrawCalls_) + if (!numDrawVerts_) return; Flush(); } diff --git a/GPU/GPU.h b/GPU/GPU.h index 9e9dd6049a41..5db94e2d3921 100644 --- a/GPU/GPU.h +++ b/GPU/GPU.h @@ -75,6 +75,7 @@ struct GPUStatistics { void ResetFrame() { numDrawCalls = 0; + numVertexDecodes = 0; numDrawSyncs = 0; numListSyncs = 0; numCachedDrawCalls = 0; @@ -111,6 +112,7 @@ struct GPUStatistics { // Per frame statistics int numDrawCalls; + int numVertexDecodes; int numDrawSyncs; int numListSyncs; int numCachedDrawCalls; diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index a9b5ded8754e..a28a16ff6822 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -967,6 +967,8 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { const void *verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr); const void *inds = nullptr; + + bool canExtend = true; u32 vertexType = gstate.vertType; if ((vertexType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) { u32 indexAddr = gstate_c.indexAddr; @@ -975,6 +977,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { return; } inds = Memory::GetPointerUnchecked(indexAddr); + canExtend = false; } int bytesRead = 0; @@ -1017,12 +1020,28 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (IsTrianglePrim(newPrim) != isTriangle) goto bail; // Can't join over this boundary. Might as well exit and get this on the next time around. // TODO: more efficient updating of verts/inds + + u32 count = data & 0xFFFF; + if (canExtend) { + // Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices + // are consecutive in memory. + _dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE); + if (drawEngineCommon_->ExtendNonIndexedPrim(newPrim, count, vertTypeID, cullMode, &bytesRead)) { + gstate_c.vertexAddr += bytesRead; + totalVertCount += count; + break; + } + } + + // Failed, or can't extend? Do a normal submit. verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr); inds = nullptr; if ((vertexType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) { inds = Memory::GetPointerUnchecked(gstate_c.indexAddr); + } else { + // We can extend again after submitting a normal draw. + canExtend = true; } - u32 count = data & 0xFFFF; drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, cullMode, &bytesRead); AdvanceVerts(vertexType, count, bytesRead); totalVertCount += count; @@ -1032,18 +1051,26 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { { uint32_t diff = data ^ vertexType; // don't mask upper bits, vertexType is unmasked - if (diff & vtypeCheckMask) { - goto bail; - } else { + if (diff) { + drawEngineCommon_->FlushSkin(); + if (diff & vtypeCheckMask) + goto bail; + canExtend = false; // TODO: Might support extending between some vertex types in the future. vertexType = data; vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning); } break; } case GE_CMD_VADDR: + { gstate.cmdmem[GE_CMD_VADDR] = data; - gstate_c.vertexAddr = gstate_c.getRelativeAddress(data & 0x00FFFFFF); + uint32_t newAddr = gstate_c.getRelativeAddress(data & 0x00FFFFFF); + if (gstate_c.vertexAddr != newAddr) { + canExtend = false; + gstate_c.vertexAddr = newAddr; + } break; + } case GE_CMD_IADDR: gstate.cmdmem[GE_CMD_IADDR] = data; gstate_c.indexAddr = gstate_c.getRelativeAddress(data & 0x00FFFFFF); @@ -1105,6 +1132,8 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { (Memory::ReadUnchecked_U32(target + 12 * 4) >> 24) == GE_CMD_RET && (target > currentList->stall || target + 12 * 4 < currentList->stall) && (gstate.boneMatrixNumber & 0x00FFFFFF) <= 96 - 12) { + drawEngineCommon_->FlushSkin(); + canExtend = false; FastLoadBoneMatrix(target); } else { goto bail; @@ -1126,6 +1155,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { } bail: + drawEngineCommon_->FlushSkin(); gstate.cmdmem[GE_CMD_VERTEXTYPE] = vertexType; int cmdCount = src - start; // Skip over the commands we just read out manually. @@ -1647,7 +1677,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) { float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f; return snprintf(buffer, size, "DL processing time: %0.2f ms, %d drawsync, %d listsync\n" - "Draw calls: %d, flushes %d, clears %d, bbox jumps %d (%d updates)\n" + "Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n" "Cached draws: %d (tracked: %d)\n" "Vertices: %d cached: %d uncached: %d\n" "FBOs active: %d (evaluations: %d)\n" @@ -1660,6 +1690,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) { gpuStats.numDrawSyncs, gpuStats.numListSyncs, gpuStats.numDrawCalls, + gpuStats.numVertexDecodes, gpuStats.numFlushes, gpuStats.numClears, gpuStats.numBBOXJumps, diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index bacff2f4d5d6..ed83c7f20756 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -566,10 +566,11 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim vai->minihash = ComputeMiniHash(); vai->status = VertexArrayInfoVulkan::VAI_HASHING; vai->drawsUntilNextFullHash = 0; - DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); // writes to indexGen + DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; return true; } @@ -593,6 +594,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } if (vai->numVerts > 64) { @@ -612,6 +614,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } } @@ -619,9 +622,10 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim if (!vai->vb) { // Directly push to the vertex cache. DecodeVertsToPushBuffer(vertexCache_, &vai->vbOffset, &vai->vb); + DecodeInds(); _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); vai->numVerts = indexGen.VertexCount(); - vai->maxIndex = indexGen.MaxIndex(); + vai->maxIndex = MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0; if (forceIndexed) { vai->prim = indexGen.GeneralPrim(); @@ -684,6 +688,7 @@ bool DrawEngineVulkan::VertexCacheLookup(int &vertexCount, GEPrimitiveType &prim vai->numFrames++; } DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); + DecodeInds(); return true; } default: @@ -748,6 +753,7 @@ void DrawEngineVulkan::DoFlush() { // Decode directly into the pushbuffer DecodeVertsToPushPool(pushVertex_, &vbOffset, &vbuf); } + DecodeInds(); gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); } @@ -845,6 +851,7 @@ void DrawEngineVulkan::DoFlush() { dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); + DecodeInds(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); @@ -857,6 +864,7 @@ void DrawEngineVulkan::DoFlush() { // Undo the strip optimization, not supported by the SW code yet. if (prim == GE_PRIM_TRIANGLE_STRIP) prim = GE_PRIM_TRIANGLES; + _dbg_assert_(prim != GE_PRIM_INVALID); u16 *inds = decIndex_; SoftwareTransformResult result{}; @@ -886,7 +894,7 @@ void DrawEngineVulkan::DoFlush() { UpdateCachedViewportState(vpAndScissor); } - int maxIndex = indexGen.MaxIndex(); + int maxIndex = MaxIndex(); SoftwareTransform swTransform(params); const Lin::Vec3 trans(gstate_c.vpXOffset, gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f); @@ -1007,14 +1015,17 @@ void DrawEngineVulkan::DoFlush() { } gpuStats.numFlushes++; - gpuStats.numDrawCalls += numDrawCalls_; + gpuStats.numDrawCalls += numDrawInds_; + gpuStats.numVertexDecodes += numDrawVerts_; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; vertexCountInDrawCalls_ = 0; - decodeCounter_ = 0; + decodeIndsCounter_ = 0; + decodeVertsCounter_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); @@ -1030,8 +1041,11 @@ void DrawEngineVulkan::DoFlush() { void DrawEngineVulkan::ResetAfterDraw() { indexGen.Reset(); decodedVerts_ = 0; - numDrawCalls_ = 0; - decodeCounter_ = 0; + numDrawVerts_ = 0; + numDrawInds_ = 0; + vertexCountInDrawCalls_ = 0; + decodeIndsCounter_ = 0; + decodeVertsCounter_ = 0; decOptions_.applySkinInDecode = g_Config.bSoftwareSkinning; gstate_c.vertexFullAlpha = true; } diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index b599c35051e2..ae6124a04c6a 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -170,13 +170,13 @@ class DrawEngineVulkan : public DrawEngineCommon { // So that this can be inlined void Flush() { - if (!numDrawCalls_) + if (!numDrawInds_) return; DoFlush(); } void FinishDeferred() { - if (!numDrawCalls_) + if (!numDrawInds_) return; // Decode any pending vertices. And also flush while we're at it, for simplicity. // It might be possible to only decode like in the other backends, but meh, it can't matter. @@ -185,9 +185,9 @@ class DrawEngineVulkan : public DrawEngineCommon { } void DispatchFlush() override { - if (!numDrawCalls_) + if (!numDrawInds_) return; - Flush(); + DoFlush(); } VkPipelineLayout GetPipelineLayout() const { diff --git a/GPU/Vulkan/PipelineManagerVulkan.cpp b/GPU/Vulkan/PipelineManagerVulkan.cpp index b2ad9d0739a6..05803e30286e 100644 --- a/GPU/Vulkan/PipelineManagerVulkan.cpp +++ b/GPU/Vulkan/PipelineManagerVulkan.cpp @@ -291,6 +291,8 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager, desc->geometryShaderSource = gs->GetShaderString(SHADER_STRING_SOURCE_CODE); } + _dbg_assert_(key.topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST); + _dbg_assert_(key.topology != VK_PRIMITIVE_TOPOLOGY_LINE_LIST); desc->topology = (VkPrimitiveTopology)key.topology; int vertexStride = 0;