Skip to content

Commit

Permalink
Merge pull request #17567 from hrydgard/uvscale-as-argument
Browse files Browse the repository at this point in the history
Pass uvScale in as a fourth argument to the vertex decoder
  • Loading branch information
hrydgard committed Jun 13, 2023
2 parents 71a34d4 + 4af6fac commit 963ca50
Show file tree
Hide file tree
Showing 17 changed files with 132 additions and 70 deletions.
8 changes: 8 additions & 0 deletions Common/Arm64Emitter.cpp
Expand Up @@ -315,6 +315,14 @@ const u8* ARM64XEmitter::AlignCodePage()
return m_code;
}

// Pads the emitted code with NOPs up to the next 16-byte boundary and
// returns the (now aligned) code pointer. Useful before hot loop entry points.
const u8 *ARM64XEmitter::NopAlignCode16() {
	const int padding = -(intptr_t)m_code & 15;
	// AArch64 instructions are always 4 bytes, so emit padding/4 NOPs.
	// NOTE(review): assumes m_code is already 4-byte aligned - confirm.
	for (int remaining = padding; remaining >= 4; remaining -= 4) {
		Write32(0xD503201F); // official nop instruction
	}
	return m_code;
}

void ARM64XEmitter::FlushIcache()
{
FlushIcacheSection(m_lastCacheFlushEnd, m_code);
Expand Down
1 change: 1 addition & 0 deletions Common/Arm64Emitter.h
Expand Up @@ -401,6 +401,7 @@ class ARM64XEmitter
void ReserveCodeSpace(u32 bytes);
const u8* AlignCode16();
const u8* AlignCodePage();
const u8 *NopAlignCode16();
void FlushIcache();
void FlushIcacheSection(const u8* start, const u8* end);
u8* GetWritableCodePtr();
Expand Down
8 changes: 8 additions & 0 deletions Common/ArmEmitter.cpp
Expand Up @@ -613,6 +613,14 @@ const u8 *ARMXEmitter::AlignCode16()
return code;
}

// Pads the emitted code with NOPs up to the next 16-byte boundary and
// returns the (now aligned) code pointer. Useful before hot loop entry points.
const u8 *ARMXEmitter::NopAlignCode16() {
	const int padding = -(intptr_t)code & 15;
	// ARM instructions are always 4 bytes, so emit padding/4 NOPs.
	// NOTE(review): assumes code is already 4-byte aligned - confirm.
	for (int remaining = padding; remaining >= 4; remaining -= 4) {
		Write32(0xE320F000); // one of many possible nops
	}
	return code;
}

const u8 *ARMXEmitter::AlignCodePage()
{
ReserveCodeSpace((-(intptr_t)code) & 4095);
Expand Down
2 changes: 2 additions & 0 deletions Common/ArmEmitter.h
Expand Up @@ -446,6 +446,8 @@ class ARMXEmitter
void ReserveCodeSpace(u32 bytes);
const u8 *AlignCode16();
const u8 *AlignCodePage();
const u8 *NopAlignCode16();

void FlushIcache();
void FlushIcacheSection(u8 *start, u8 *end);
u8 *GetWritableCodePtr();
Expand Down
31 changes: 31 additions & 0 deletions Common/x64Emitter.cpp
Expand Up @@ -140,6 +140,37 @@ const u8 *XEmitter::AlignCodePage()
return code;
}

// Pads the emitted code up to the next 16-byte boundary using long-form
// multi-byte NOPs (cheaper to decode than a run of single 0x90s), then
// returns the aligned code pointer. Good to call before a hot loop entry.
const u8 *XEmitter::NopAlignCode16() {
	const int misalign = (int)((u64)code & 15);
	if (misalign == 0)
		return code;
	const int nops = 16 - misalign;

	// note: the string lengths are obviously not computable with strlen, but are equal to the index.
	// (The literals contain embedded zero bytes, so each entry's byte count is simply its table index.)
	// Nop strings from https://stackoverflow.com/questions/25545470/long-multi-byte-nops-commonly-understood-macros-or-other-notation
	static const char * const nopStrings[16] = {
		"",
		"\x90",
		"\x66\x90",
		"\x0f\x1f\00",
		"\x0f\x1f\x40\x00",
		"\x0f\x1f\x44\x00\x00",
		"\x66\x0f\x1f\x44\x00\x00",
		"\x0f\x1f\x80\x00\x00\x00\x00",
		"\x0f\x1f\x84\x00\x00\x00\x00\x00",
		"\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
		"\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x90",
		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x66\x90",
		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\00",
		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\x40\x00",
	};

	// Copy exactly `nops` padding bytes and advance the write pointer past them.
	memcpy(code, nopStrings[nops], nops);
	code += nops;
	return code;
}

// This operation modifies flags; check to see the flags are locked.
// If the flags are locked, we should immediately and loudly fail before
// causing a subtle JIT bug.
Expand Down
4 changes: 4 additions & 0 deletions Common/x64Emitter.h
Expand Up @@ -406,6 +406,10 @@ class XEmitter
const u8 *AlignCode4();
const u8 *AlignCode16();
const u8 *AlignCodePage();

// Nops until the code pointer is 16-byte aligned. Good for loops.
const u8 *NopAlignCode16();

u8 *GetWritableCodePtr();

void LockFlags() { flags_locked = true; }
Expand Down
15 changes: 6 additions & 9 deletions GPU/Common/DrawEngineCommon.cpp
Expand Up @@ -103,12 +103,9 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const {
}

void DrawEngineCommon::DecodeVerts(u8 *dest) {
const UVScale origUV = gstate_c.uv;
for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) {
gstate_c.uv = drawCalls_[decodeCounter_].uvScale;
DecodeVertsStep(dest, decodeCounter_, decodedVerts_); // NOTE! DecodeVertsStep can modify decodeCounter_!
DecodeVertsStep(dest, decodeCounter_, decodedVerts_, &drawCalls_[decodeCounter_].uvScale); // NOTE! DecodeVertsStep can modify decodeCounter_!
}
gstate_c.uv = origUV;

// Sanity check
if (indexGen.Prim() < 0) {
Expand Down Expand Up @@ -505,7 +502,7 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugV
u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, VertexDecoder *dec, int lowerBound, int upperBound, u32 vertType) {
// First, decode the vertices into a GPU compatible format. This step can be eliminated but will need a separate
// implementation of the vertex decoder.
dec->DecodeVerts(bufPtr, inPtr, lowerBound, upperBound);
dec->DecodeVerts(bufPtr, inPtr, &gstate_c.uv, lowerBound, upperBound);

// OK, morphing eliminated but bones still remain to be taken care of.
// Let's do a partial software transform where we only do skinning.
Expand Down Expand Up @@ -612,7 +609,7 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) {
gstate_c.Dirty(DIRTY_SHADERBLEND);
}

void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale) {
PROFILE_THIS_SCOPE("vertdec");

const DeferredDrawCall &dc = drawCalls_[i];
Expand All @@ -624,7 +621,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
// Decode the verts (and at the same time apply morphing/skinning). Simple.
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
dc.verts, indexLowerBound, indexUpperBound);
dc.verts, uvScale, indexLowerBound, indexUpperBound);
decodedVerts += indexUpperBound - indexLowerBound + 1;

bool clockwise = true;
Expand Down Expand Up @@ -691,7 +688,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {

// 3. Decode that range of vertex data.
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
dc.verts, indexLowerBound, indexUpperBound);
dc.verts, uvScale, indexLowerBound, indexUpperBound);
decodedVerts += vertexCount;

// 4. Advance indexgen vertex counter.
Expand Down Expand Up @@ -849,7 +846,7 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
vertexCountInDrawCalls_ += vertexCount;

if (decOptions_.applySkinInDecode && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_);
DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_, &dc.uvScale);
decodeCounter_++;
}

Expand Down
2 changes: 1 addition & 1 deletion GPU/Common/DrawEngineCommon.h
Expand Up @@ -143,7 +143,7 @@ class DrawEngineCommon {
uint64_t ComputeHash();

// Vertex decoding
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts);
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale);

void ApplyFramebufferRead(FBOTexState *fboTexState);

Expand Down
3 changes: 1 addition & 2 deletions GPU/Common/VertexDecoderArm.cpp
Expand Up @@ -190,7 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int

// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
MOVP2R(R3, &gstate_c.uv);
VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
VMOV_neon(F_32, neonScratchReg, by128);
Expand Down Expand Up @@ -249,7 +248,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
MOV(fullAlphaReg, 0xFF);
}

JumpTarget loopStart = GetCodePtr();
JumpTarget loopStart = NopAlignCode16();
// Preload data cache ahead of reading. This offset seems pretty good.
PLD(srcReg, 64);
for (int i = 0; i < dec.numSteps_; i++) {
Expand Down
6 changes: 4 additions & 2 deletions GPU/Common/VertexDecoderArm64.cpp
Expand Up @@ -39,6 +39,9 @@ static const ARM64Reg srcReg = X0;
static const ARM64Reg dstReg = X1;

static const ARM64Reg counterReg = W2;

static const ARM64Reg uvScaleReg = X3;

static const ARM64Reg tempReg1 = W3;
static const ARM64Reg tempRegPtr = X3;
static const ARM64Reg tempReg2 = W4;
Expand Down Expand Up @@ -178,7 +181,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int

// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
MOVP2R(X3, &gstate_c.uv);
fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
Expand Down Expand Up @@ -239,7 +241,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
}

const u8 *loopStart = GetCodePtr();
const u8 *loopStart = NopAlignCode16();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
EndWrite();
Expand Down
9 changes: 5 additions & 4 deletions GPU/Common/VertexDecoderCommon.cpp
Expand Up @@ -1282,11 +1282,10 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
}
}

void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const {
// Decode the vertices within the found bounds, once each
// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
decoded_ = decodedptr;
ptr_ = (const u8*)verts + indexLowerBound * size;
const u8 *startPtr = (const u8*)verts + indexLowerBound * size;

int count = indexUpperBound - indexLowerBound + 1;
int stride = decFmt.stride;
Expand All @@ -1300,8 +1299,10 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe

if (jitted_) {
// We've compiled the steps into optimized machine code, so just jump!
jitted_(ptr_, decoded_, count);
jitted_(startPtr, decodedptr, count, uvScaleOffset);
} else {
ptr_ = startPtr;
decoded_ = decodedptr;
// Interpret the decode steps
for (; count; count--) {
for (int i = 0; i < numSteps_; i++) {
Expand Down
4 changes: 2 additions & 2 deletions GPU/Common/VertexDecoderCommon.h
Expand Up @@ -320,7 +320,7 @@ struct JitLookup {
// Collapse to less skinning shaders to reduce shader switching, which is expensive.
int TranslateNumBones(int bones);

typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);

struct VertexDecoderOptions {
bool expandAllWeightsToFloat;
Expand All @@ -338,7 +338,7 @@ class VertexDecoder {

const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }

void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
void DecodeVerts(u8 *decoded, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const;

int VertexSize() const { return size; } // PSP format size

Expand Down
10 changes: 5 additions & 5 deletions GPU/Common/VertexDecoderRiscV.cpp
Expand Up @@ -33,11 +33,11 @@ static const float const65535 = 65535.0f;

using namespace RiscVGen;

static const RiscVReg srcReg = X10;
static const RiscVReg dstReg = X11;
static const RiscVReg counterReg = X12;
static const RiscVReg srcReg = X10; // a0
static const RiscVReg dstReg = X11; // a1
static const RiscVReg counterReg = X12; // a2

static const RiscVReg tempReg1 = X13;
static const RiscVReg tempReg1 = X13; // a3
static const RiscVReg tempReg2 = X14;
static const RiscVReg tempReg3 = X15;
static const RiscVReg scratchReg = X16;
Expand Down Expand Up @@ -234,7 +234,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int

// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
LI(tempReg1, &gstate_c.uv);
// tempReg1 happens to be the fourth argument register.
FL(32, prescaleRegs.scale.u, tempReg1, 0);
FL(32, prescaleRegs.scale.v, tempReg1, 4);
FL(32, prescaleRegs.offset.u, tempReg1, 8);
Expand Down

0 comments on commit 963ca50

Please sign in to comment.