diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index ce047cd41d30..91c658d445a6 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -315,6 +315,14 @@ const u8* ARM64XEmitter::AlignCodePage()
 	return m_code;
 }
 
+const u8 *ARM64XEmitter::NopAlignCode16() {
+	int bytes = ((-(intptr_t)m_code) & 15);
+	for (int i = 0; i < bytes / 4; i++) {
+		Write32(0xD503201F);  // the official NOP instruction
+	}
+	return m_code;
+}
+
 void ARM64XEmitter::FlushIcache()
 {
 	FlushIcacheSection(m_lastCacheFlushEnd, m_code);
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 01901af9c074..36e969dd2bad 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -401,6 +401,7 @@ class ARM64XEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8* AlignCode16();
 	const u8* AlignCodePage();
+	const u8 *NopAlignCode16();
 	void FlushIcache();
 	void FlushIcacheSection(const u8* start, const u8* end);
 	u8* GetWritableCodePtr();
diff --git a/Common/ArmEmitter.cpp b/Common/ArmEmitter.cpp
index d6e3d241e089..00b686b3dba1 100644
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@@ -613,6 +613,14 @@ const u8 *ARMXEmitter::AlignCode16()
 	return code;
 }
 
+const u8 *ARMXEmitter::NopAlignCode16() {
+	int bytes = ((-(intptr_t)code) & 15);
+	for (int i = 0; i < bytes / 4; i++) {
+		Write32(0xE320F000);  // one of many possible NOP encodings
+	}
+	return code;
+}
+
 const u8 *ARMXEmitter::AlignCodePage()
 {
 	ReserveCodeSpace((-(intptr_t)code) & 4095);
diff --git a/Common/ArmEmitter.h b/Common/ArmEmitter.h
index 46a36c880511..f1cf4dd224d1 100644
--- a/Common/ArmEmitter.h
+++ b/Common/ArmEmitter.h
@@ -446,6 +446,8 @@ class ARMXEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
+	const u8 *NopAlignCode16();
+
 	void FlushIcache();
 	void FlushIcacheSection(u8 *start, u8 *end);
 	u8 *GetWritableCodePtr();
diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index 1d98544e6836..73e84a3739b2 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -140,6 +140,37 @@ const u8 *XEmitter::AlignCodePage()
 	return code;
 }
 
+const u8 *XEmitter::NopAlignCode16() {
+	int nops = 16 - ((u64)code & 15);
+	if (nops == 16)
+		return code;
+
+	// Note: the string lengths can't be computed with strlen (embedded zero bytes); each string's length equals its index.
+	// Nop strings from https://stackoverflow.com/questions/25545470/long-multi-byte-nops-commonly-understood-macros-or-other-notation
+	static const char * const nopStrings[16] = {
+		"",
+		"\x90",
+		"\x66\x90",
+		"\x0f\x1f\x00",
+		"\x0f\x1f\x40\x00",
+		"\x0f\x1f\x44\x00\x00",
+		"\x66\x0f\x1f\x44\x00\x00",
+		"\x0f\x1f\x80\x00\x00\x00\x00",
+		"\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x90",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x66\x90",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\x40\x00",
+	};
+
+	memcpy(code, nopStrings[nops], nops);
+	code += nops;
+	return code;
+}
+
 // This operation modifies flags; check to see the flags are locked.
 // If the flags are locked, we should immediately and loudly fail before
 // causing a subtle JIT bug.
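Reviewer note (not part of the patch): all three `NopAlignCode16()` variants use the same modular arithmetic, `(-(intptr_t)ptr) & 15` on ARM/ARM64 and the equivalent `16 - (ptr & 15)` with an early-out on x86; only the nop encodings differ. A self-contained sketch of the math, where `PadBytesTo16` is a hypothetical name used only for illustration:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the patch's arithmetic.
static int PadBytesTo16(uintptr_t p) {
	// ARM/ARM64 form: already 0 for aligned pointers, no early-out needed.
	// Equivalent to the x86 form 16 - (p & 15) once 16 is folded back to 0.
	return (int)((-(intptr_t)p) & 15);
}

int main() {
	for (uintptr_t p = 0x1000; p <= 0x1010; p++)
		printf("0x%" PRIxPTR " -> %d pad bytes\n", p, PadBytesTo16(p));
	// 0x1000 -> 0, 0x1001 -> 15, ..., 0x100f -> 1, 0x1010 -> 0
	return 0;
}
```

On ARM the pad is emitted as `bytes / 4` fixed-size nops (code pointers are 4-byte aligned there), while x86 copies a single multi-byte nop string so the pad decodes as as few instructions as possible.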
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index b52a81f35b2c..4dcfda6d70c8 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -406,6 +406,10 @@ class XEmitter
 	const u8 *AlignCode4();
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
+
+	// Nops until the code pointer is 16-byte aligned. Good for loops.
+	const u8 *NopAlignCode16();
+
 	u8 *GetWritableCodePtr();
 
 	void LockFlags() { flags_locked = true; }
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 234d2af22378..bba2dbd63624 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -103,12 +103,9 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const {
 }
 
 void DrawEngineCommon::DecodeVerts(u8 *dest) {
-	const UVScale origUV = gstate_c.uv;
 	for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) {
-		gstate_c.uv = drawCalls_[decodeCounter_].uvScale;
-		DecodeVertsStep(dest, decodeCounter_, decodedVerts_);  // NOTE! DecodeVertsStep can modify decodeCounter_!
+		DecodeVertsStep(dest, decodeCounter_, decodedVerts_, &drawCalls_[decodeCounter_].uvScale);  // NOTE! DecodeVertsStep can modify decodeCounter_!
 	}
-	gstate_c.uv = origUV;
 
 	// Sanity check
 	if (indexGen.Prim() < 0) {
@@ -505,7 +502,7 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector
-	dec->DecodeVerts(bufPtr, inPtr, lowerBound, upperBound);
+	dec->DecodeVerts(bufPtr, inPtr, &gstate_c.uv, lowerBound, upperBound);
 
 	// OK, morphing eliminated but bones still remain to be taken care of.
 	// Let's do a partial software transform where we only do skinning.
@@ -612,7 +609,7 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) {
 	gstate_c.Dirty(DIRTY_SHADERBLEND);
 }
 
-void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
+void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale) {
 	PROFILE_THIS_SCOPE("vertdec");
 
 	const DeferredDrawCall &dc = drawCalls_[i];
@@ -624,7 +621,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 	if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
 		// Decode the verts (and at the same time apply morphing/skinning). Simple.
 		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
-			dc.verts, indexLowerBound, indexUpperBound);
+			dc.verts, uvScale, indexLowerBound, indexUpperBound);
 		decodedVerts += indexUpperBound - indexLowerBound + 1;
 
 		bool clockwise = true;
@@ -691,7 +688,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 
 		// 3. Decode that range of vertex data.
 		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
-			dc.verts, indexLowerBound, indexUpperBound);
+			dc.verts, uvScale, indexLowerBound, indexUpperBound);
 		decodedVerts += vertexCount;
 
 		// 4. Advance indexgen vertex counter.
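Reviewer note (not part of the patch): the core refactor is that the per-drawcall UV scale/offset no longer gets staged through the global `gstate_c.uv` (save, mutate per call, restore); it now travels as an explicit `const UVScale *` all the way into the decoder. A hypothetical, self-contained model of the pattern; the names are illustrative and the `UVScale` field layout is assumed:

```cpp
#include <vector>

struct UVScale { float uScale, vScale, uOff, vOff; };  // layout assumed for illustration

static UVScale g_uv;  // stand-in for the global gstate_c.uv

// Old shape: the callee reads the global, so every caller must juggle it.
static void DecodeStepOld() { UVScale uv = g_uv; (void)uv; /* ...decode... */ }

// New shape: the value arrives as an argument; the global is untouched.
static void DecodeStepNew(const UVScale *uvScale) { UVScale uv = *uvScale; (void)uv; }

void DecodeAll(const std::vector<UVScale> &perCallScale) {
	// Before this patch:
	UVScale origUV = g_uv;                  // save
	for (const UVScale &s : perCallScale) {
		g_uv = s;                           // mutate global per draw call
		DecodeStepOld();
	}
	g_uv = origUV;                          // restore

	// After this patch:
	for (const UVScale &s : perCallScale)
		DecodeStepNew(&s);                  // one extra argument, no global traffic
}
```

Besides being cleaner, passing a pointer is what lets the jitted decoders below pick the value up directly from the fourth argument register of each calling convention.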
@@ -849,7 +846,7 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
 	vertexCountInDrawCalls_ += vertexCount;
 
 	if (decOptions_.applySkinInDecode && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_);
+		DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_, &dc.uvScale);
 		decodeCounter_++;
 	}
 
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index d68bcf9c4965..c5545dda5122 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -143,7 +143,7 @@ class DrawEngineCommon {
 	uint64_t ComputeHash();
 
 	// Vertex decoding
-	void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts);
+	void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale);
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index ed57fd89ccbf..84ae0db678dd 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -190,7 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		MOVP2R(R3, &gstate_c.uv);
 		VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
 			VMOV_neon(F_32, neonScratchReg, by128);
@@ -249,7 +248,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		MOV(fullAlphaReg, 0xFF);
 	}
 
-	JumpTarget loopStart = GetCodePtr();
+	JumpTarget loopStart = NopAlignCode16();
 	// Preload data cache ahead of reading. This offset seems pretty good.
 	PLD(srcReg, 64);
 	for (int i = 0; i < dec.numSteps_; i++) {
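Reviewer note (not part of the patch): two things happen in each jit backend. First, the old `MOVP2R(R3, &gstate_c.uv)` (and its X3/a3/stack-load equivalents) disappears because the UV scale pointer now arrives in the fourth argument register of each calling convention: R3 on ARM EABI, X3 on AArch64, a3 on RISC-V, R9/RCX on Win64/SysV x64, and a stack slot on x86-32. Second, the loop head switches from `GetCodePtr()` to the new `NopAlignCode16()`. The distinction from `AlignCode16()` matters: that helper pads via `ReserveCodeSpace()`, which in these emitters fills the gap with breakpoint-style filler (INT3 on x64), fine between functions but wrong for padding that is reached by falling through. A sketch of the emitted stream, in comment form since the exact emitter calls vary per backend:

```cpp
// Hypothetical shape of the generated decoder (ARM-flavored pseudocode):
//
//   ...prologue: load args, set up scale/offset registers...
//   loopStart = NopAlignCode16();  // executable nops; run once on entry,
//                                  // and the loop top is now 16-byte aligned
//   ...per-vertex decode steps...
//   SUBS(counterReg, counterReg, 1);
//   B_CC(CC_NEQ, loopStart);       // each iteration branches to an aligned target
```

Aligning a hot loop entry to 16 bytes helps instruction fetch: the loop body starts at the beginning of a fetch block instead of straddling one.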
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 445d86879c08..e6de22bae1aa 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -39,6 +39,9 @@ static const ARM64Reg srcReg = X0;
 static const ARM64Reg dstReg = X1;
 static const ARM64Reg counterReg = W2;
+
+static const ARM64Reg uvScaleReg = X3;
+
 static const ARM64Reg tempReg1 = W3;
 static const ARM64Reg tempRegPtr = X3;
 static const ARM64Reg tempReg2 = W4;
@@ -178,7 +181,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		MOVP2R(X3, &gstate_c.uv);
 		fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
 		fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
@@ -239,7 +241,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
 	}
 
-	const u8 *loopStart = GetCodePtr();
+	const u8 *loopStart = NopAlignCode16();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			EndWrite();
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 915b89e5494f..5015d7d0fc0e 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -1282,11 +1282,10 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 	}
 }
 
-void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
+void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const {
 	// Decode the vertices within the found bounds, once each
 	// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
-	decoded_ = decodedptr;
-	ptr_ = (const u8*)verts + indexLowerBound * size;
+	const u8 *startPtr = (const u8*)verts + indexLowerBound * size;
 	int count = indexUpperBound - indexLowerBound + 1;
 	int stride = decFmt.stride;
@@ -1300,8 +1299,10 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
 
 	if (jitted_) {
 		// We've compiled the steps into optimized machine code, so just jump!
-		jitted_(ptr_, decoded_, count);
+		jitted_(startPtr, decodedptr, count, uvScaleOffset);
 	} else {
+		ptr_ = startPtr;
+		decoded_ = decodedptr;
 		// Interpret the decode steps
 		for (; count; count--) {
 			for (int i = 0; i < numSteps_; i++) {
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 7af518f0227c..c276a3ca998e 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -320,7 +320,7 @@ struct JitLookup {
 
 // Collapse to less skinning shaders to reduce shader switching, which is expensive.
 int TranslateNumBones(int bones);
 
-typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
+typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);
 
 struct VertexDecoderOptions {
 	bool expandAllWeightsToFloat;
@@ -338,7 +338,7 @@ class VertexDecoder {
 
 	const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }
 
-	void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
+	void DecodeVerts(u8 *decoded, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const;
 
 	int VertexSize() const { return size; }  // PSP format size
diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp
index accaebcc45fa..cbca1892b8d6 100644
--- a/GPU/Common/VertexDecoderRiscV.cpp
+++ b/GPU/Common/VertexDecoderRiscV.cpp
@@ -33,11 +33,11 @@ static const float const65535 = 65535.0f;
 
 using namespace RiscVGen;
 
-static const RiscVReg srcReg = X10;
-static const RiscVReg dstReg = X11;
-static const RiscVReg counterReg = X12;
+static const RiscVReg srcReg = X10;  // a0
+static const RiscVReg dstReg = X11;  // a1
+static const RiscVReg counterReg = X12;  // a2
 
-static const RiscVReg tempReg1 = X13;
+static const RiscVReg tempReg1 = X13;  // a3
 static const RiscVReg tempReg2 = X14;
 static const RiscVReg tempReg3 = X15;
 static const RiscVReg scratchReg = X16;
@@ -234,7 +234,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		LI(tempReg1, &gstate_c.uv);
+		// tempReg1 (a3) happens to be the fourth argument register, so the uvScaleOffset pointer is already in it.
 		FL(32, prescaleRegs.scale.u, tempReg1, 0);
 		FL(32, prescaleRegs.scale.v, tempReg1, 4);
 		FL(32, prescaleRegs.offset.u, tempReg1, 8);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 5828689ea3a6..d92bc964bbb2 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -60,6 +60,7 @@ static const X64Reg tempReg3 = R10;
 static const X64Reg srcReg = RCX;
 static const X64Reg dstReg = RDX;
 static const X64Reg counterReg = R8;
+static const X64Reg uvScalePtrReg = R9;  // only used during init
 static const X64Reg alphaReg = R11;
 #else
 static const X64Reg tempReg1 = RAX;
@@ -68,6 +69,7 @@ static const X64Reg tempReg3 = R10;
 static const X64Reg srcReg = RDI;
 static const X64Reg dstReg = RSI;
 static const X64Reg counterReg = RDX;
+static const X64Reg uvScalePtrReg = RCX;  // only used during init
 static const X64Reg alphaReg = R11;
 #endif
 #else
@@ -77,6 +79,7 @@ static const X64Reg tempReg3 = EDX;
 static const X64Reg srcReg = ESI;
 static const X64Reg dstReg = EDI;
 static const X64Reg counterReg = ECX;
+static const X64Reg uvScalePtrReg = EDX;  // only used during init
 #endif
 
 // XMM0-XMM5 are volatile on Windows X64
@@ -168,6 +171,22 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	BeginWrite(4096);
 	const u8 *start = this->AlignCode16();
 
+	bool prescaleStep = false;
+	// Look for prescaled texcoord steps
+	for (int i = 0; i < dec.numSteps_; i++) {
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
+			prescaleStep = true;
+		}
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
+			prescaleStep = true;
+		}
+	}
+
 #if PPSSPP_ARCH(X86)
 	// Store register values
 	PUSH(ESI);
@@ -180,6 +199,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
 	MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
 	MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
+	MOV(32, R(uvScalePtrReg), MDisp(ESP, 16 + offset + 12));
 
 	const uint8_t STACK_FIXED_ALLOC = 64;
 #else
@@ -210,63 +230,48 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	}
 #endif
 
-	bool prescaleStep = false;
-	// Look for prescaled texcoord steps
-	for (int i = 0; i < dec.numSteps_; i++) {
-		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
-			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
-			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
-			prescaleStep = true;
-		}
-		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
-			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
-			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
-			prescaleStep = true;
+	// Keep the scale/offset in a few fp registers if we need it.
+	if (prescaleStep) {
+		// uvScalePtrReg should point to gstate_c.uv, or wherever the UV scale we want to use is located.
+		MOVUPS(fpScaleOffsetReg, MatR(uvScalePtrReg));
+		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
+			MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
+			MULPS(fpScaleOffsetReg, MatR(tempReg2));
+		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
+			MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
+			MULPS(fpScaleOffsetReg, MatR(tempReg2));
 		}
 	}
 
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
+	// Can't touch fpScaleOffsetReg (XMM0) in here!
 	if (dec.skinInDecode) {
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
-		MOVAPS(XMM4, MatR(tempReg1));
+		MOVAPS(XMM5, MatR(tempReg1));
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
-		MOVUPS(XMM5, MatR(tempReg1));
+		MOVUPS(XMM6, MatR(tempReg1));
 		MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
 		MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
 		for (int i = 0; i < dec.nweights; i++) {
-			MOVUPS(XMM0, MDisp(tempReg1, (12 * i) * 4));
-			MOVUPS(XMM1, MDisp(tempReg1, (12 * i + 3) * 4));
-			MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
-			MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
-			ANDPS(XMM0, R(XMM4));
-			ANDPS(XMM1, R(XMM4));
-			ANDPS(XMM2, R(XMM4));
-			ANDPS(XMM3, R(XMM4));
-			ORPS(XMM3, R(XMM5));
-			MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM0);
-			MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM1);
-			MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM2);
-			MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM3);
-		}
-	}
-
-	// Keep the scale/offset in a few fp registers if we need it.
-	// TODO: Read it from an argument pointer instead of gstate_c.uv.
-	if (prescaleStep) {
-		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
-		MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
-		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
-			MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
-			MULPS(fpScaleOffsetReg, MatR(tempReg2));
-		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
-			MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
-			MULPS(fpScaleOffsetReg, MatR(tempReg2));
+			MOVUPS(XMM1, MDisp(tempReg1, (12 * i) * 4));
+			MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3) * 4));
+			MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
+			MOVUPS(XMM4, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
+			ANDPS(XMM1, R(XMM5));
+			ANDPS(XMM2, R(XMM5));
+			ANDPS(XMM3, R(XMM5));
+			ANDPS(XMM4, R(XMM5));
+			ORPS(XMM4, R(XMM6));
+			MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM1);
+			MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM2);
+			MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM3);
+			MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM4);
 		}
 	}
 
 	// Let's not bother with a proper stack frame. We just grab the arguments and go.
-	JumpTarget loopStart = GetCodePtr();
+	JumpTarget loopStart = NopAlignCode16();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			EndWrite();
@@ -775,6 +781,8 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
 	CVTSI2SS(fpScratchReg, R(tempReg1));
 	CVTSI2SS(fpScratchReg2, R(tempReg2));
 	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	// TODO: This is a long chain of back-to-back dependencies. It could probably be made
+	// faster if we could spare another register to avoid the shuffle, like on ARM.
 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index 8fb0ea66ddf9..eacdb481ae95 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -494,7 +494,7 @@ class SoftwareVertexReader {
 		if (useIndices_)
 			GetIndexBounds(indices, vertex_count, vertex_type, &lowerBound_, &upperBound_);
 		if (vertex_count != 0)
-			vdecoder.DecodeVerts(base, vertices, lowerBound_, upperBound_);
+			vdecoder.DecodeVerts(base, vertices, &gstate_c.uv, lowerBound_, upperBound_);
 
 		// If we're only using a subset of verts, it's better to decode with random access (usually.)
 		// However, if we're reusing a lot of verts, we should read and cache them.
diff --git a/Windows/.gitignore b/Windows/.gitignore
index 8d4dba11ef29..5c87205d53c6 100644
--- a/Windows/.gitignore
+++ b/Windows/.gitignore
@@ -2,3 +2,4 @@
 *.VC.db
 *.txt
 enc_temp_folder
+Win32
diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp
index e759d644444f..f5561c0fa95e 100644
--- a/unittest/TestVertexJit.cpp
+++ b/unittest/TestVertexJit.cpp
@@ -78,7 +78,7 @@ class VertexDecoderTestHarness {
 	void Execute(int vtype, int indexUpperBound, bool useJit) {
 		SetupExecute(vtype, useJit);
 
-		dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
+		dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
 	}
 
 	double ExecuteTimed(int vtype, int indexUpperBound, bool useJit) {
@@ -88,7 +88,7 @@ class VertexDecoderTestHarness {
 		double st = time_now_d();
 		do {
 			for (int j = 0; j < ROUNDS; ++j) {
-				dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
+				dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
 				++total;
 			}
 		} while (time_now_d() - st < 0.5);
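Reviewer note (not part of the patch): at every existing call site the new argument is simply `&gstate_c.uv` (or the per-drawcall `&dc.uvScale` in DrawEngineCommon), so behavior is unchanged; the pointer just makes the dependency explicit. A minimal sketch of driving the new signature follows; the `UVScale` field layout and the wrapper are assumptions for illustration, while the typedef matches the patch:

```cpp
#include <cstdint>

typedef uint8_t u8;
struct UVScale { float uScale, vScale, uOff, vOff; };  // layout assumed

// From VertexDecoderCommon.h after this patch:
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);

// Hypothetical wrapper: the decoder no longer peeks at global state, so any
// UVScale can be supplied, e.g. a per-drawcall copy instead of gstate_c.uv.
void RunDecoder(JittedVertexDecoder jitted, const u8 *src, u8 *dst, int count) {
	UVScale uv = { 1.0f, 1.0f, 0.0f, 0.0f };  // neutral scale/offset
	jitted(src, dst, count, &uv);
}
```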