Skip to content

Commit

Permalink
Merge pull request #18316 from hrydgard/strip-opt
Browse files Browse the repository at this point in the history
Micro-optimize draw calls a bit more.
  • Loading branch information
hrydgard committed Oct 7, 2023
2 parents 15df71c + c7a3e7b commit 3e63fe8
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 57 deletions.
61 changes: 39 additions & 22 deletions GPU/Common/DrawEngineCommon.cpp
Expand Up @@ -610,7 +610,7 @@ u32 DrawEngineCommon::ComputeMiniHash() {
}
for (int i = 0; i < numDrawInds_; i += step) {
const DeferredInds &di = drawInds_[i];
if (di.inds) {
if (di.indexType != 0) {
fullhash += ComputeMiniHashRange(di.inds, indexSize * di.vertexCount);
}
}
Expand Down Expand Up @@ -638,8 +638,10 @@ uint32_t DrawEngineCommon::ComputeDrawcallsHash() const {
}
for (int j = 0; j < numDrawInds_; j++) {
u32 dhash = dcid;
dhash = __rotl(dhash ^ (u32)(uintptr_t)drawInds_[j].inds, 19);
dcid = lowbias32_r(__rotl(dhash ^ (u32)drawInds_[j].indexType, 7));
if (drawInds_[j].inds) {
dhash = __rotl(dhash ^ (u32)(uintptr_t)drawInds_[j].inds, 19);
dcid = lowbias32_r(__rotl(dhash ^ (u32)drawInds_[j].indexType, 7));
}
}
return dcid;
}
Expand Down Expand Up @@ -678,31 +680,46 @@ uint64_t DrawEngineCommon::ComputeHash() {
return fullhash;
}

bool DrawEngineCommon::ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) {
if (numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
return false;
}
int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, u32 vertTypeID, int cullMode, int *bytesRead, bool isTriangle) {
const uint32_t *start = cmd;
int prevDrawVerts = numDrawVerts_ - 1;
DeferredVerts &dv = drawVerts_[prevDrawVerts];
int offset = dv.vertexCount;

_dbg_assert_(numDrawInds_ < MAX_DEFERRED_DRAW_INDS);
_dbg_assert_(numDrawVerts_ > 0);
*bytesRead = vertexCount * dec_->VertexSize();

DeferredInds &di = drawInds_[numDrawInds_++];
di.inds = nullptr;
di.indexType = 0;
di.prim = prim;
di.cullMode = cullMode;
di.vertexCount = vertexCount;
di.vertDecodeIndex = numDrawVerts_ - 1;
while (true) {
uint32_t data = *cmd;
if ((data & 0xFFF80000) != 0x04000000) {
break;
}
GEPrimitiveType newPrim = static_cast<GEPrimitiveType>((data >> 16) & 7);
if (IsTrianglePrim(newPrim) != isTriangle)
break;
int vertexCount = data & 0xFFFF;
if (numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
break;
}
DeferredInds &di = drawInds_[numDrawInds_++];
di.indexType = 0;
di.prim = newPrim;
di.cullMode = cullMode;
di.vertexCount = vertexCount;
di.vertDecodeIndex = prevDrawVerts;
di.offset = offset;
offset += vertexCount;
cmd++;
}

DeferredVerts &dv = drawVerts_[numDrawVerts_ - 1];
int offset = dv.vertexCount;
di.offset = offset;
dv.vertexCount += vertexCount;
dv.indexUpperBound = dv.vertexCount - 1;
vertexCountInDrawCalls_ += vertexCount;
_dbg_assert_(cmd != start);

return true;
int totalCount = offset - dv.vertexCount;
dv.vertexCount = offset;
dv.indexUpperBound = dv.vertexCount - 1;
vertexCountInDrawCalls_ += totalCount;
*bytesRead = totalCount * dec_->VertexSize();
return cmd - start;
}

// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
Expand Down
2 changes: 1 addition & 1 deletion GPU/Common/DrawEngineCommon.h
Expand Up @@ -111,7 +111,7 @@ class DrawEngineCommon {
}
}

bool ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead);
int ExtendNonIndexedPrim(const uint32_t *cmd, u32 vertTypeID, int cullMode, int *bytesRead, bool isTriangle);
void SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead);
template<class Surface>
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
Expand Down
16 changes: 0 additions & 16 deletions GPU/Common/IndexGenerator.cpp
Expand Up @@ -68,7 +68,6 @@ void IndexGenerator::AddPoints(int numVerts, int indexOffset) {
*outInds++ = indexOffset + i;
inds_ = outInds;
// ignore overflow verts
count_ += numVerts;
prim_ = GE_PRIM_POINTS;
seenPrims_ |= 1 << GE_PRIM_POINTS;
}
Expand All @@ -84,7 +83,6 @@ void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) {
}
inds_ = outInds;
// ignore overflow verts
count_ += numVerts;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
if (!clockwise) {
Expand Down Expand Up @@ -203,8 +201,6 @@ void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) {
inds_ = outInds;
#endif

if (numTris > 0)
count_ += numTris * 3;
// This is so we can detect one single strip by just looking at seenPrims_.
if (!seenPrims_ && clockwise) {
seenPrims_ = 1 << GE_PRIM_TRIANGLE_STRIP;
Expand All @@ -228,7 +224,6 @@ void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) {
*outInds++ = indexOffset + i + v2;
}
inds_ = outInds;
count_ += numTris * 3;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN;
if (!clockwise) {
Expand All @@ -245,7 +240,6 @@ void IndexGenerator::AddLineList(int numVerts, int indexOffset) {
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
count_ += numVerts;
prim_ = GE_PRIM_LINES;
seenPrims_ |= 1 << prim_;
}
Expand All @@ -258,7 +252,6 @@ void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) {
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
count_ += numLines * 2;
prim_ = GE_PRIM_LINES;
seenPrims_ |= 1 << GE_PRIM_LINE_STRIP;
}
Expand All @@ -272,7 +265,6 @@ void IndexGenerator::AddRectangles(int numVerts, int indexOffset) {
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
count_ += numVerts;
prim_ = GE_PRIM_RECTANGLES;
seenPrims_ |= 1 << GE_PRIM_RECTANGLES;
}
Expand All @@ -283,7 +275,6 @@ void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int index
for (int i = 0; i < numInds; i++)
*outInds++ = indexOffset + inds[i];
inds_ = outInds;
count_ += numInds;
prim_ = GE_PRIM_POINTS;
seenPrims_ |= (1 << GE_PRIM_POINTS) | flag;
}
Expand All @@ -297,7 +288,6 @@ void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int ind
*outInds++ = indexOffset + inds[i + 1];
}
inds_ = outInds;
count_ += numInds;
prim_ = GE_PRIM_LINES;
seenPrims_ |= (1 << GE_PRIM_LINES) | flag;
}
Expand All @@ -311,7 +301,6 @@ void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int in
*outInds++ = indexOffset + inds[i + 1];
}
inds_ = outInds;
count_ += numLines * 2;
prim_ = GE_PRIM_LINES;
seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | flag;
}
Expand All @@ -323,7 +312,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf
if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {
memcpy(inds_, inds, numInds * sizeof(ITypeLE));
inds_ += numInds;
count_ += numInds;
} else {
u16 *outInds = inds_;
int numTris = numInds / 3; // Round to whole triangles
Expand All @@ -337,7 +325,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf
*outInds++ = indexOffset + inds[i + v2];
}
inds_ = outInds;
count_ += numInds;
}
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | flag;
Expand All @@ -355,7 +342,6 @@ void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexO
*outInds++ = indexOffset + inds[i + wind];
}
inds_ = outInds;
count_ += numTris * 3;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | flag;
}
Expand All @@ -373,7 +359,6 @@ void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOff
*outInds++ = indexOffset + inds[i + v2];
}
inds_ = outInds;
count_ += numTris * 3;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | flag;
}
Expand All @@ -388,7 +373,6 @@ inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds
*outInds++ = indexOffset + inds[i+1];
}
inds_ = outInds;
count_ += numInds;
prim_ = GE_PRIM_RECTANGLES;
seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | flag;
}
Expand Down
4 changes: 1 addition & 3 deletions GPU/Common/IndexGenerator.h
Expand Up @@ -27,7 +27,6 @@ class IndexGenerator {
void Setup(u16 *indexptr);
void Reset() {
prim_ = GE_PRIM_INVALID;
count_ = 0;
seenPrims_ = 0;
pureCount_ = 0;
this->inds_ = indsBase_;
Expand Down Expand Up @@ -61,7 +60,7 @@ class IndexGenerator {
void TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise);
void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise);

int VertexCount() const { return count_; }
int VertexCount() const { return inds_ - indsBase_; }
int SeenPrims() const { return seenPrims_; }
int PureCount() const { return pureCount_; }
bool SeenOnlyPurePrims() const {
Expand Down Expand Up @@ -110,7 +109,6 @@ class IndexGenerator {

u16 *indsBase_;
u16 *inds_;
int count_;
int pureCount_;
GEPrimitiveType prim_;
int seenPrims_;
Expand Down
23 changes: 12 additions & 11 deletions GPU/GPUCommon.h
Expand Up @@ -67,6 +67,18 @@ struct TransformedVertex {
}
};

// Classifies a primitive type: true for every value ordered after GE_PRIM_LINE_STRIP,
// with the single exception of GE_PRIM_RECTANGLES.
inline bool IsTrianglePrim(GEPrimitiveType prim) {
	// TODO: KEEP_PREVIOUS is mistakenly treated as TRIANGLE here... This isn't new.
	//
	// A shift-and-mask lookup was considered, but there is no confidence that it performs better:
	//   static const bool p[8] = { false, false, false, true, true, true, false, true };
	//   10111000 = 0xB8;
	//   return (0xB8U >> (u8)prim) & 1;

	// Rectangles are the one excluded value above the line primitives.
	if (prim == GE_PRIM_RECTANGLES) {
		return false;
	}
	return prim > GE_PRIM_LINE_STRIP;
}


class GPUCommon : public GPUInterface, public GPUDebugInterface {
public:
GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
Expand Down Expand Up @@ -219,17 +231,6 @@ class GPUCommon : public GPUInterface, public GPUDebugInterface {

virtual void CheckRenderResized() {}

// Reports whether prim is ordered after GE_PRIM_LINE_STRIP and is not GE_PRIM_RECTANGLES.
inline bool IsTrianglePrim(GEPrimitiveType prim) const {
	// TODO: KEEP_PREVIOUS is mistakenly treated as TRIANGLE here... This isn't new.
	//
	// Possible table/bitmask variant, left disabled because its performance is unproven:
	//   static const bool p[8] = { false, false, false, true, true, true, false, true };
	//   10111000 = 0xB8;
	//   return (0xB8U >> (u8)prim) & 1;

	const bool afterLineStrip = prim > GE_PRIM_LINE_STRIP;
	const bool isRectangle = prim == GE_PRIM_RECTANGLES;
	return afterLineStrip && !isRectangle;
}

void SetDrawType(DrawType type, GEPrimitiveType prim) {
if (type != lastDraw_) {
// We always flush when drawing splines/beziers so no need to do so here
Expand Down
11 changes: 7 additions & 4 deletions GPU/GPUCommonHW.cpp
Expand Up @@ -1026,11 +1026,14 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, since that means
// the vertices are consecutive in memory.
_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
if (drawEngineCommon_->ExtendNonIndexedPrim(newPrim, count, vertTypeID, cullMode, &bytesRead)) {
gstate_c.vertexAddr += bytesRead;
totalVertCount += count;
break;
int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, vertTypeID, cullMode, &bytesRead, isTriangle);
if (!commandsExecuted) {
goto bail;
}
src += commandsExecuted - 1;
gstate_c.vertexAddr += bytesRead;
totalVertCount += count;
break;
}

// Failed, or can't extend? Do a normal submit.
Expand Down

0 comments on commit 3e63fe8

Please sign in to comment.