Skip to content

Commit

Permalink
Merge pull request #18286 from hrydgard/drawengine-refactor
Browse files Browse the repository at this point in the history
DrawEngine refactor, quickly merge non-indexed consecutive draws
  • Loading branch information
hrydgard committed Oct 3, 2023
2 parents 6a2e5dd + af7efe4 commit 4f43dac
Show file tree
Hide file tree
Showing 16 changed files with 388 additions and 344 deletions.
389 changes: 178 additions & 211 deletions GPU/Common/DrawEngineCommon.cpp

Large diffs are not rendered by default.

47 changes: 35 additions & 12 deletions GPU/Common/DrawEngineCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ class DrawEngineCommon {

bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);

void FlushSkin() {
bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
if (applySkin) {
DecodeVerts(decoded_);
}
}

bool ExtendNonIndexedPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead);
void SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead);
template<class Surface>
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
Expand All @@ -130,7 +138,7 @@ class DrawEngineCommon {
return false;
}
int GetNumDrawCalls() const {
return numDrawCalls_;
return numDrawVerts_;
}

VertexDecoder *GetVertexDecoder(u32 vtype);
Expand All @@ -141,8 +149,12 @@ class DrawEngineCommon {
virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
void UpdatePlanes();

int ComputeNumVertsToDecode() const;
void DecodeVerts(u8 *dest);
void DecodeInds();

int MaxIndex() const {
return decodedVerts_;
}

// Preprocessing for spline/bezier
u32 NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, int lowerBound, int upperBound, u32 vertType, int *vertexSize = nullptr);
Expand All @@ -151,8 +163,7 @@ class DrawEngineCommon {
u32 ComputeMiniHash();
uint64_t ComputeHash();

// Vertex decoding
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale);
int ComputeNumVertsToDecode() const;

void ApplyFramebufferRead(FBOTexState *fboTexState);

Expand Down Expand Up @@ -210,25 +221,37 @@ class DrawEngineCommon {
TransformedVertex *transformedExpanded_ = nullptr;

// Defer all vertex decoding to a "Flush" (except when software skinning)
struct DeferredDrawCall {
struct DeferredVerts {
const void *verts;
u32 vertexCount;
u16 indexLowerBound;
u16 indexUpperBound;
UVScale uvScale;
};

struct DeferredInds {
const void *inds;
u32 vertexCount;
u8 vertDecodeIndex; // index into the drawVerts_ array to look up the vertexOffset.
u8 indexType;
s8 prim;
u8 cullMode;
u16 indexLowerBound;
u16 indexUpperBound;
UVScale uvScale;
u16 offset;
};

enum { MAX_DEFERRED_DRAW_CALLS = 128 };
DeferredDrawCall drawCalls_[MAX_DEFERRED_DRAW_CALLS];
int numDrawCalls_ = 0;
enum { MAX_DEFERRED_DRAW_VERTS = 128 }; // If you change this to more than 256, change type of DeferredInds::vertDecodeIndex.
enum { MAX_DEFERRED_DRAW_INDS = 512 }; // Monster Hunter spams indexed calls that we end up merging.
DeferredVerts drawVerts_[MAX_DEFERRED_DRAW_VERTS];
uint32_t drawVertexOffsets_[MAX_DEFERRED_DRAW_VERTS];
DeferredInds drawInds_[MAX_DEFERRED_DRAW_INDS];

int numDrawVerts_ = 0;
int numDrawInds_ = 0;
int vertexCountInDrawCalls_ = 0;

int decimationCounter_ = 0;
int decodeCounter_ = 0;
int decodeVertsCounter_ = 0;
int decodeIndsCounter_ = 0;

// Vertex collector state
IndexGenerator indexGen;
Expand Down
83 changes: 32 additions & 51 deletions GPU/Common/IndexGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,44 +50,40 @@ void IndexGenerator::Setup(u16 *inds) {
Reset();
}

void IndexGenerator::AddPrim(int prim, int vertexCount, bool clockwise) {
void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) {
switch (prim) {
case GE_PRIM_POINTS: AddPoints(vertexCount); break;
case GE_PRIM_LINES: AddLineList(vertexCount); break;
case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount); break;
case GE_PRIM_TRIANGLES: AddList(vertexCount, clockwise); break;
case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, clockwise); break;
case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, clockwise); break;
case GE_PRIM_RECTANGLES: AddRectangles(vertexCount); break; // Same
case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break;
case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break;
case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break;
case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break;
case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break;
case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break;
case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same
}
}

void IndexGenerator::AddPoints(int numVerts) {
void IndexGenerator::AddPoints(int numVerts, int indexOffset) {
u16 *outInds = inds_;
const int startIndex = index_;
for (int i = 0; i < numVerts; i++)
*outInds++ = startIndex + i;
*outInds++ = indexOffset + i;
inds_ = outInds;
// ignore overflow verts
index_ += numVerts;
count_ += numVerts;
prim_ = GE_PRIM_POINTS;
seenPrims_ |= 1 << GE_PRIM_POINTS;
}

void IndexGenerator::AddList(int numVerts, bool clockwise) {
void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) {
u16 *outInds = inds_;
const int startIndex = index_;
const int v1 = clockwise ? 1 : 2;
const int v2 = clockwise ? 2 : 1;
for (int i = 0; i < numVerts; i += 3) {
*outInds++ = startIndex + i;
*outInds++ = startIndex + i + v1;
*outInds++ = startIndex + i + v2;
*outInds++ = indexOffset + i;
*outInds++ = indexOffset + i + v1;
*outInds++ = indexOffset + i + v2;
}
inds_ = outInds;
// ignore overflow verts
index_ += numVerts;
count_ += numVerts;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
Expand Down Expand Up @@ -119,7 +115,7 @@ alignas(16) static const uint16_t offsets_counter_clockwise[24] = {
7, (u16)(7 + 1), (u16)(7 + 2),
};

void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) {
int numTris = numVerts - 2;
#ifdef _M_SSE
// In an SSE2 register we can fit 8 16-bit integers.
Expand All @@ -130,7 +126,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
// We allow ourselves to write some extra indices to avoid the fallback loop.
// That's alright as we're appending to a buffer - they will get overwritten anyway.
int numChunks = (numTris + 7) >> 3;
__m128i ibase8 = _mm_set1_epi16(index_);
__m128i ibase8 = _mm_set1_epi16(indexOffset);
const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise);
__m128i *dst = (__m128i *)inds_;
__m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets));
Expand Down Expand Up @@ -158,7 +154,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
// wind doesn't need to be updated, an even number of triangles have been drawn.
#elif PPSSPP_ARCH(ARM_NEON)
int numChunks = (numTris + 7) >> 3;
uint16x8_t ibase8 = vdupq_n_u16(index_);
uint16x8_t ibase8 = vdupq_n_u16(indexOffset);
const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise;
u16 *dst = inds_;
uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets));
Expand All @@ -185,7 +181,7 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
#else
// Slow fallback loop.
int wind = clockwise ? 1 : 2;
int ibase = index_;
int ibase = indexOffset;
size_t numPairs = numTris / 2;
u16 *outInds = inds_;
while (numPairs > 0) {
Expand All @@ -207,7 +203,6 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
inds_ = outInds;
#endif

index_ += numVerts;
if (numTris > 0)
count_ += numTris * 3;
// This is so we can detect one single strip by just looking at seenPrims_.
Expand All @@ -222,19 +217,17 @@ void IndexGenerator::AddStrip(int numVerts, bool clockwise) {
}
}

void IndexGenerator::AddFan(int numVerts, bool clockwise) {
void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) {
const int numTris = numVerts - 2;
u16 *outInds = inds_;
const int startIndex = index_;
const int v1 = clockwise ? 1 : 2;
const int v2 = clockwise ? 2 : 1;
for (int i = 0; i < numTris; i++) {
*outInds++ = startIndex;
*outInds++ = startIndex + i + v1;
*outInds++ = startIndex + i + v2;
*outInds++ = indexOffset;
*outInds++ = indexOffset + i + v1;
*outInds++ = indexOffset + i + v2;
}
inds_ = outInds;
index_ += numVerts;
count_ += numTris * 3;
prim_ = GE_PRIM_TRIANGLES;
seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN;
Expand All @@ -245,54 +238,47 @@ void IndexGenerator::AddFan(int numVerts, bool clockwise) {
}

//Lines
void IndexGenerator::AddLineList(int numVerts) {
void IndexGenerator::AddLineList(int numVerts, int indexOffset) {
u16 *outInds = inds_;
const int startIndex = index_;
for (int i = 0; i < numVerts; i += 2) {
*outInds++ = startIndex + i;
*outInds++ = startIndex + i + 1;
*outInds++ = indexOffset + i;
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
index_ += numVerts;
count_ += numVerts;
prim_ = GE_PRIM_LINES;
seenPrims_ |= 1 << prim_;
}

void IndexGenerator::AddLineStrip(int numVerts) {
void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) {
const int numLines = numVerts - 1;
u16 *outInds = inds_;
const int startIndex = index_;
for (int i = 0; i < numLines; i++) {
*outInds++ = startIndex + i;
*outInds++ = startIndex + i + 1;
*outInds++ = indexOffset + i;
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
index_ += numVerts;
count_ += numLines * 2;
prim_ = GE_PRIM_LINES;
seenPrims_ |= 1 << GE_PRIM_LINE_STRIP;
}

void IndexGenerator::AddRectangles(int numVerts) {
void IndexGenerator::AddRectangles(int numVerts, int indexOffset) {
u16 *outInds = inds_;
const int startIndex = index_;
//rectangles always need 2 vertices, disregard the last one if there's an odd number
numVerts = numVerts & ~1;
for (int i = 0; i < numVerts; i += 2) {
*outInds++ = startIndex + i;
*outInds++ = startIndex + i + 1;
*outInds++ = indexOffset + i;
*outInds++ = indexOffset + i + 1;
}
inds_ = outInds;
index_ += numVerts;
count_ += numVerts;
prim_ = GE_PRIM_RECTANGLES;
seenPrims_ |= 1 << GE_PRIM_RECTANGLES;
}

template <class ITypeLE, int flag>
void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) {
indexOffset = index_ - indexOffset;
u16 *outInds = inds_;
for (int i = 0; i < numInds; i++)
*outInds++ = indexOffset + inds[i];
Expand All @@ -304,7 +290,6 @@ void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int index

template <class ITypeLE, int flag>
void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) {
indexOffset = index_ - indexOffset;
u16 *outInds = inds_;
numInds = numInds & ~1;
for (int i = 0; i < numInds; i += 2) {
Expand All @@ -319,7 +304,6 @@ void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int ind

template <class ITypeLE, int flag>
void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) {
indexOffset = index_ - indexOffset;
int numLines = numInds - 1;
u16 *outInds = inds_;
for (int i = 0; i < numLines; i++) {
Expand All @@ -334,7 +318,6 @@ void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int in

template <class ITypeLE, int flag>
void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
indexOffset = index_ - indexOffset;
// We only bother doing this minor optimization in triangle list, since it's by far the most
// common operation that can benefit.
if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {
Expand All @@ -347,6 +330,7 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf
numInds = numTris * 3;
const int v1 = clockwise ? 1 : 2;
const int v2 = clockwise ? 2 : 1;
// TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise.
for (int i = 0; i < numInds; i += 3) {
*outInds++ = indexOffset + inds[i];
*outInds++ = indexOffset + inds[i + v1];
Expand All @@ -362,7 +346,6 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf
template <class ITypeLE, int flag>
void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
int wind = clockwise ? 1 : 2;
indexOffset = index_ - indexOffset;
int numTris = numInds - 2;
u16 *outInds = inds_;
for (int i = 0; i < numTris; i++) {
Expand All @@ -380,7 +363,6 @@ void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexO
template <class ITypeLE, int flag>
void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
if (numInds <= 0) return;
indexOffset = index_ - indexOffset;
int numTris = numInds - 2;
u16 *outInds = inds_;
const int v1 = clockwise ? 1 : 2;
Expand All @@ -398,7 +380,6 @@ void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOff

template <class ITypeLE, int flag>
inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) {
indexOffset = index_ - indexOffset;
u16 *outInds = inds_;
//rectangles always need 2 vertices, disregard the last one if there's an odd number
numInds = numInds & ~1;
Expand Down
Loading

0 comments on commit 4f43dac

Please sign in to comment.