diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index ce047cd41d30..91c658d445a6 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -315,6 +315,14 @@ const u8* ARM64XEmitter::AlignCodePage()
 	return m_code;
 }
 
+const u8 *ARM64XEmitter::NopAlignCode16() {
+	int bytes = ((-(intptr_t)m_code) & 15);
+	for (int i = 0; i < bytes / 4; i++) {
+		Write32(0xD503201F);  // the official NOP instruction
+	}
+	return m_code;
+}
+
 void ARM64XEmitter::FlushIcache()
 {
 	FlushIcacheSection(m_lastCacheFlushEnd, m_code);
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 01901af9c074..36e969dd2bad 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -401,6 +401,7 @@ class ARM64XEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8* AlignCode16();
 	const u8* AlignCodePage();
+	const u8 *NopAlignCode16();
 	void FlushIcache();
 	void FlushIcacheSection(const u8* start, const u8* end);
 	u8* GetWritableCodePtr();
diff --git a/Common/ArmEmitter.cpp b/Common/ArmEmitter.cpp
index d6e3d241e089..00b686b3dba1 100644
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@@ -613,6 +613,14 @@ const u8 *ARMXEmitter::AlignCode16()
 	return code;
 }
 
+const u8 *ARMXEmitter::NopAlignCode16() {
+	int bytes = ((-(intptr_t)code) & 15);
+	for (int i = 0; i < bytes / 4; i++) {
+		Write32(0xE320F000);  // one of many possible NOP encodings
+	}
+	return code;
+}
+
 const u8 *ARMXEmitter::AlignCodePage()
 {
 	ReserveCodeSpace((-(intptr_t)code) & 4095);
diff --git a/Common/ArmEmitter.h b/Common/ArmEmitter.h
index 46a36c880511..f1cf4dd224d1 100644
--- a/Common/ArmEmitter.h
+++ b/Common/ArmEmitter.h
@@ -446,6 +446,8 @@ class ARMXEmitter
 	void ReserveCodeSpace(u32 bytes);
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
+	const u8 *NopAlignCode16();
+
 	void FlushIcache();
 	void FlushIcacheSection(u8 *start, u8 *end);
 	u8 *GetWritableCodePtr();
diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index 1d98544e6836..73e84a3739b2 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -140,6 +140,37 @@ const u8 *XEmitter::AlignCodePage()
 	return code;
 }
 
+const u8 *XEmitter::NopAlignCode16() {
+	int nops = 16 - ((u64)code & 15);
+	if (nops == 16)
+		return code;
+
+	// Note: the string lengths can't be computed with strlen (embedded zero bytes); each string's length equals its index.
+	// Nop strings from https://stackoverflow.com/questions/25545470/long-multi-byte-nops-commonly-understood-macros-or-other-notation
+	static const char * const nopStrings[16] = {
+		"",
+		"\x90",
+		"\x66\x90",
+		"\x0f\x1f\x00",
+		"\x0f\x1f\x40\x00",
+		"\x0f\x1f\x44\x00\x00",
+		"\x66\x0f\x1f\x44\x00\x00",
+		"\x0f\x1f\x80\x00\x00\x00\x00",
+		"\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x90",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x66\x90",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\x00",
+		"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00\x0f\x1f\x40\x00",
+	};
+
+	memcpy(code, nopStrings[nops], nops);
+	code += nops;
+	return code;
+}
+
 // This operation modifies flags; check to see the flags are locked.
 // If the flags are locked, we should immediately and loudly fail before
 // causing a subtle JIT bug.
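Reviewer note (not part of the patch): all three `NopAlignCode16()` variants use the same modular arithmetic, `(-(intptr_t)ptr) & 15` on ARM/ARM64 and the equivalent `16 - (ptr & 15)` with an early-out on x86; only the nop encodings differ. A self-contained sketch of the math, where `PadBytesTo16` is a hypothetical name used only for illustration:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the patch's arithmetic.
static int PadBytesTo16(uintptr_t p) {
	// ARM/ARM64 form: already 0 for aligned pointers, no early-out needed.
	// Equivalent to the x86 form 16 - (p & 15) once 16 is folded back to 0.
	return (int)((-(intptr_t)p) & 15);
}

int main() {
	for (uintptr_t p = 0x1000; p <= 0x1010; p++)
		printf("0x%" PRIxPTR " -> %d pad bytes\n", p, PadBytesTo16(p));
	// 0x1000 -> 0, 0x1001 -> 15, ..., 0x100f -> 1, 0x1010 -> 0
	return 0;
}
```

On ARM the pad is emitted as `bytes / 4` fixed-size nops (code pointers are 4-byte aligned there), while x86 copies a single multi-byte nop string so the pad decodes as as few instructions as possible.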
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index b52a81f35b2c..4dcfda6d70c8 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -406,6 +406,10 @@ class XEmitter
 	const u8 *AlignCode4();
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
+
+	// Nops until the code pointer is 16-byte aligned. Good for loops.
+	const u8 *NopAlignCode16();
+
 	u8 *GetWritableCodePtr();
 
 	void LockFlags() { flags_locked = true; }
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 234d2af22378..bba2dbd63624 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -103,12 +103,9 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const {
 }
 
 void DrawEngineCommon::DecodeVerts(u8 *dest) {
-	const UVScale origUV = gstate_c.uv;
 	for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) {
-		gstate_c.uv = drawCalls_[decodeCounter_].uvScale;
-		DecodeVertsStep(dest, decodeCounter_, decodedVerts_);  // NOTE! DecodeVertsStep can modify decodeCounter_!
+		DecodeVertsStep(dest, decodeCounter_, decodedVerts_, &drawCalls_[decodeCounter_].uvScale);  // NOTE! DecodeVertsStep can modify decodeCounter_!
 	}
-	gstate_c.uv = origUV;
 
 	// Sanity check
 	if (indexGen.Prim() < 0) {
@@ -505,7 +502,7 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector
-	dec->DecodeVerts(bufPtr, inPtr, lowerBound, upperBound);
+	dec->DecodeVerts(bufPtr, inPtr, &gstate_c.uv, lowerBound, upperBound);
 
 	// OK, morphing eliminated but bones still remain to be taken care of.
 	// Let's do a partial software transform where we only do skinning.
@@ -612,7 +609,7 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) {
 	gstate_c.Dirty(DIRTY_SHADERBLEND);
 }
 
-void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
+void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale) {
 	PROFILE_THIS_SCOPE("vertdec");
 
 	const DeferredDrawCall &dc = drawCalls_[i];
@@ -624,7 +621,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 	if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
 		// Decode the verts (and at the same time apply morphing/skinning). Simple.
 		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
-			dc.verts, indexLowerBound, indexUpperBound);
+			dc.verts, uvScale, indexLowerBound, indexUpperBound);
 		decodedVerts += indexUpperBound - indexLowerBound + 1;
 
 		bool clockwise = true;
@@ -691,7 +688,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 
 		// 3. Decode that range of vertex data.
 		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
-			dc.verts, indexLowerBound, indexUpperBound);
+			dc.verts, uvScale, indexLowerBound, indexUpperBound);
 		decodedVerts += vertexCount;
 
 		// 4. Advance indexgen vertex counter.
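Reviewer note (not part of the patch): the core refactor is that the per-drawcall UV scale/offset no longer gets staged through the global `gstate_c.uv` (save, mutate per call, restore); it now travels as an explicit `const UVScale *` all the way into the decoder. A hypothetical, self-contained model of the pattern; the names are illustrative and the `UVScale` field layout is assumed:

```cpp
#include <vector>

struct UVScale { float uScale, vScale, uOff, vOff; };  // layout assumed for illustration

static UVScale g_uv;  // stand-in for the global gstate_c.uv

// Old shape: the callee reads the global, so every caller must juggle it.
static void DecodeStepOld() { UVScale uv = g_uv; (void)uv; /* ...decode... */ }

// New shape: the value arrives as an argument; the global is untouched.
static void DecodeStepNew(const UVScale *uvScale) { UVScale uv = *uvScale; (void)uv; }

void DecodeAll(const std::vector<UVScale> &perCallScale) {
	// Before this patch:
	UVScale origUV = g_uv;                  // save
	for (const UVScale &s : perCallScale) {
		g_uv = s;                           // mutate global per draw call
		DecodeStepOld();
	}
	g_uv = origUV;                          // restore

	// After this patch:
	for (const UVScale &s : perCallScale)
		DecodeStepNew(&s);                  // one extra argument, no global traffic
}
```

Besides being cleaner, passing a pointer is what lets the jitted decoders below pick the value up directly from the fourth argument register of each calling convention.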
@@ -849,7 +846,7 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
 	vertexCountInDrawCalls_ += vertexCount;
 
 	if (decOptions_.applySkinInDecode && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_);
+		DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_, &dc.uvScale);
 		decodeCounter_++;
 	}
 
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index d68bcf9c4965..c5545dda5122 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -143,7 +143,7 @@ class DrawEngineCommon {
 	uint64_t ComputeHash();
 
 	// Vertex decoding
-	void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts);
+	void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale);
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index ed57fd89ccbf..84ae0db678dd 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -190,7 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		MOVP2R(R3, &gstate_c.uv);
 		VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
 			VMOV_neon(F_32, neonScratchReg, by128);
@@ -249,7 +248,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		MOV(fullAlphaReg, 0xFF);
 	}
 
-	JumpTarget loopStart = GetCodePtr();
+	JumpTarget loopStart = NopAlignCode16();
 	// Preload data cache ahead of reading. This offset seems pretty good.
 	PLD(srcReg, 64);
 	for (int i = 0; i < dec.numSteps_; i++) {
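Reviewer note (not part of the patch): two things happen in each jit backend. First, the old `MOVP2R(R3, &gstate_c.uv)` (and its X3/a3/stack-load equivalents) disappears because the UV scale pointer now arrives in the fourth argument register of each calling convention: R3 on ARM EABI, X3 on AArch64, a3 on RISC-V, R9/RCX on Win64/SysV x64, and a stack slot on x86-32. Second, the loop head switches from `GetCodePtr()` to the new `NopAlignCode16()`. The distinction from `AlignCode16()` matters: that helper pads via `ReserveCodeSpace()`, which in these emitters fills the gap with breakpoint-style filler (INT3 on x64), fine between functions but wrong for padding that is reached by falling through. A sketch of the emitted stream, in comment form since the exact emitter calls vary per backend:

```cpp
// Hypothetical shape of the generated decoder (ARM-flavored pseudocode):
//
//   ...prologue: load args, set up scale/offset registers...
//   loopStart = NopAlignCode16();  // executable nops; run once on entry,
//                                  // and the loop top is now 16-byte aligned
//   ...per-vertex decode steps...
//   SUBS(counterReg, counterReg, 1);
//   B_CC(CC_NEQ, loopStart);       // each iteration branches to an aligned target
```

Aligning a hot loop entry to 16 bytes helps instruction fetch: the loop body starts at the beginning of a fetch block instead of straddling one.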
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 445d86879c08..e6de22bae1aa 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -39,6 +39,9 @@ static const ARM64Reg srcReg = X0;
 static const ARM64Reg dstReg = X1;
 static const ARM64Reg counterReg = W2;
+
+static const ARM64Reg uvScaleReg = X3;
+
 static const ARM64Reg tempReg1 = W3;
 static const ARM64Reg tempRegPtr = X3;
 static const ARM64Reg tempReg2 = W4;
@@ -178,7 +181,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		MOVP2R(X3, &gstate_c.uv);
 		fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
 		fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
@@ -239,7 +241,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		LDRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
 	}
 
-	const u8 *loopStart = GetCodePtr();
+	const u8 *loopStart = NopAlignCode16();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			EndWrite();
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 915b89e5494f..5015d7d0fc0e 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -1282,11 +1282,10 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 	}
 }
 
-void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
+void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const {
 	// Decode the vertices within the found bounds, once each
 	// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
-	decoded_ = decodedptr;
-	ptr_ = (const u8*)verts + indexLowerBound * size;
+	const u8 *startPtr = (const u8*)verts + indexLowerBound * size;
 	int count = indexUpperBound - indexLowerBound + 1;
 	int stride = decFmt.stride;
@@ -1300,8 +1299,10 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
 
 	if (jitted_) {
 		// We've compiled the steps into optimized machine code, so just jump!
-		jitted_(ptr_, decoded_, count);
+		jitted_(startPtr, decodedptr, count, uvScaleOffset);
 	} else {
+		ptr_ = startPtr;
+		decoded_ = decodedptr;
 		// Interpret the decode steps
 		for (; count; count--) {
 			for (int i = 0; i < numSteps_; i++) {
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 7af518f0227c..c276a3ca998e 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -320,7 +320,7 @@ struct JitLookup {
 
 // Collapse to less skinning shaders to reduce shader switching, which is expensive.
 int TranslateNumBones(int bones);
 
-typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
+typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);
 
 struct VertexDecoderOptions {
 	bool expandAllWeightsToFloat;
@@ -338,7 +338,7 @@ class VertexDecoder {
 
 	const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }
 
-	void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
+	void DecodeVerts(u8 *decoded, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const;
 
 	int VertexSize() const { return size; }  // PSP format size
diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp
index accaebcc45fa..cbca1892b8d6 100644
--- a/GPU/Common/VertexDecoderRiscV.cpp
+++ b/GPU/Common/VertexDecoderRiscV.cpp
@@ -33,11 +33,11 @@ static const float const65535 = 65535.0f;
 
 using namespace RiscVGen;
 
-static const RiscVReg srcReg = X10;
-static const RiscVReg dstReg = X11;
-static const RiscVReg counterReg = X12;
+static const RiscVReg srcReg = X10;  // a0
+static const RiscVReg dstReg = X11;  // a1
+static const RiscVReg counterReg = X12;  // a2
 
-static const RiscVReg tempReg1 = X13;
+static const RiscVReg tempReg1 = X13;  // a3
 static const RiscVReg tempReg2 = X14;
 static const RiscVReg tempReg3 = X15;
 static const RiscVReg scratchReg = X16;
@@ -234,7 +234,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
-		LI(tempReg1, &gstate_c.uv);
+		// tempReg1 (a3) happens to be the fourth argument register, so the uvScaleOffset pointer is already in it.
 		FL(32, prescaleRegs.scale.u, tempReg1, 0);
 		FL(32, prescaleRegs.scale.v, tempReg1, 4);
 		FL(32, prescaleRegs.offset.u, tempReg1, 8);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 5828689ea3a6..d92bc964bbb2 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -60,6 +60,7 @@ static const X64Reg tempReg3 = R10;
 static const X64Reg srcReg = RCX;
 static const X64Reg dstReg = RDX;
 static const X64Reg counterReg = R8;
+static const X64Reg uvScalePtrReg = R9;  // only used during init
 static const X64Reg alphaReg = R11;
 #else
 static const X64Reg tempReg1 = RAX;
@@ -68,6 +69,7 @@ static const X64Reg tempReg3 = R10;
 static const X64Reg srcReg = RDI;
 static const X64Reg dstReg = RSI;
 static const X64Reg counterReg = RDX;
+static const X64Reg uvScalePtrReg = RCX;  // only used during init
 static const X64Reg alphaReg = R11;
 #endif
 #else
@@ -77,6 +79,7 @@ static const X64Reg tempReg3 = EDX;
 static const X64Reg srcReg = ESI;
 static const X64Reg dstReg = EDI;
 static const X64Reg counterReg = ECX;
+static const X64Reg uvScalePtrReg = EDX;  // only used during init
 #endif
 
 // XMM0-XMM5 are volatile on Windows X64
@@ -168,6 +171,22 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	BeginWrite(4096);
 	const u8 *start = this->AlignCode16();
 
+	bool prescaleStep = false;
+	// Look for prescaled texcoord steps
+	for (int i = 0; i < dec.numSteps_; i++) {
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
+			prescaleStep = true;
+		}
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
+			prescaleStep = true;
+		}
+	}
+
 #if PPSSPP_ARCH(X86)
 	// Store register values
 	PUSH(ESI);
@@ -180,6 +199,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
 	MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
 	MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
+	MOV(32, R(uvScalePtrReg), MDisp(ESP, 16 + offset + 12));
 
 	const uint8_t STACK_FIXED_ALLOC = 64;
 #else
@@ -210,63 +230,48 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	}
 #endif
 
-	bool prescaleStep = false;
-	// Look for prescaled texcoord steps
-	for (int i = 0; i < dec.numSteps_; i++) {
-		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
-			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
-			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
-			prescaleStep = true;
-		}
-		if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
-			dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
-			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
-			prescaleStep = true;
+	// Keep the scale/offset in a few fp registers if we need it.
+	if (prescaleStep) {
+		// uvScalePtrReg should point to gstate_c.uv, or wherever the UV scale we want to use is located.
+		MOVUPS(fpScaleOffsetReg, MatR(uvScalePtrReg));
+		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
+			MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
+			MULPS(fpScaleOffsetReg, MatR(tempReg2));
+		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
+			MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
+			MULPS(fpScaleOffsetReg, MatR(tempReg2));
 		}
 	}
 
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
+	// Can't touch fpScaleOffsetReg (XMM0) in here!
 	if (dec.skinInDecode) {
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
-		MOVAPS(XMM4, MatR(tempReg1));
+		MOVAPS(XMM5, MatR(tempReg1));
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
-		MOVUPS(XMM5, MatR(tempReg1));
+		MOVUPS(XMM6, MatR(tempReg1));
 		MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
 		MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
 		for (int i = 0; i < dec.nweights; i++) {
-			MOVUPS(XMM0, MDisp(tempReg1, (12 * i) * 4));
-			MOVUPS(XMM1, MDisp(tempReg1, (12 * i + 3) * 4));
-			MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
-			MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
-			ANDPS(XMM0, R(XMM4));
-			ANDPS(XMM1, R(XMM4));
-			ANDPS(XMM2, R(XMM4));
-			ANDPS(XMM3, R(XMM4));
-			ORPS(XMM3, R(XMM5));
-			MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM0);
-			MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM1);
-			MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM2);
-			MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM3);
-		}
-	}
-
-	// Keep the scale/offset in a few fp registers if we need it.
-	// TODO: Read it from an argument pointer instead of gstate_c.uv.
-	if (prescaleStep) {
-		MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
-		MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
-		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
-			MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
-			MULPS(fpScaleOffsetReg, MatR(tempReg2));
-		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
-			MOV(PTRBITS, R(tempReg2), ImmPtr(&by32768_11));
-			MULPS(fpScaleOffsetReg, MatR(tempReg2));
+			MOVUPS(XMM1, MDisp(tempReg1, (12 * i) * 4));
+			MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3) * 4));
+			MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
+			MOVUPS(XMM4, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
+			ANDPS(XMM1, R(XMM5));
+			ANDPS(XMM2, R(XMM5));
+			ANDPS(XMM3, R(XMM5));
+			ANDPS(XMM4, R(XMM5));
+			ORPS(XMM4, R(XMM6));
+			MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM1);
+			MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM2);
+			MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM3);
+			MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM4);
 		}
 	}
 
 	// Let's not bother with a proper stack frame. We just grab the arguments and go.
-	JumpTarget loopStart = GetCodePtr();
+	JumpTarget loopStart = NopAlignCode16();
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			EndWrite();
@@ -775,6 +781,8 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
 	CVTSI2SS(fpScratchReg, R(tempReg1));
 	CVTSI2SS(fpScratchReg2, R(tempReg2));
 	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	// TODO: This is a long chain of back-to-back dependencies. It could probably be made
+	// faster if we could spare another register to avoid the shuffle, like on ARM.
 	MULPS(fpScratchReg, R(fpScaleOffsetReg));
 	SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
 	ADDPS(fpScratchReg, R(fpScaleOffsetReg));
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index 8fb0ea66ddf9..eacdb481ae95 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -494,7 +494,7 @@ class SoftwareVertexReader {
 		if (useIndices_)
 			GetIndexBounds(indices, vertex_count, vertex_type, &lowerBound_, &upperBound_);
 		if (vertex_count != 0)
-			vdecoder.DecodeVerts(base, vertices, lowerBound_, upperBound_);
+			vdecoder.DecodeVerts(base, vertices, &gstate_c.uv, lowerBound_, upperBound_);
 
 		// If we're only using a subset of verts, it's better to decode with random access (usually.)
 		// However, if we're reusing a lot of verts, we should read and cache them.
diff --git a/Windows/.gitignore b/Windows/.gitignore
index 8d4dba11ef29..5c87205d53c6 100644
--- a/Windows/.gitignore
+++ b/Windows/.gitignore
@@ -2,3 +2,4 @@
 *.VC.db
 *.txt
 enc_temp_folder
+Win32
diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp
index e759d644444f..f5561c0fa95e 100644
--- a/unittest/TestVertexJit.cpp
+++ b/unittest/TestVertexJit.cpp
@@ -78,7 +78,7 @@ class VertexDecoderTestHarness {
 	void Execute(int vtype, int indexUpperBound, bool useJit) {
 		SetupExecute(vtype, useJit);
 
-		dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
+		dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
 	}
 
 	double ExecuteTimed(int vtype, int indexUpperBound, bool useJit) {
@@ -88,7 +88,7 @@ class VertexDecoderTestHarness {
 		double st = time_now_d();
 		do {
 			for (int j = 0; j < ROUNDS; ++j) {
-				dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
+				dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
 				++total;
 			}
 		} while (time_now_d() - st < 0.5);
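Reviewer note (not part of the patch): at every existing call site the new argument is simply `&gstate_c.uv` (or the per-drawcall `&dc.uvScale` in DrawEngineCommon), so behavior is unchanged; the pointer just makes the dependency explicit. A minimal sketch of driving the new signature follows; the `UVScale` field layout and the wrapper are assumptions for illustration, while the typedef matches the patch:

```cpp
#include <cstdint>

typedef uint8_t u8;
struct UVScale { float uScale, vScale, uOff, vOff; };  // layout assumed

// From VertexDecoderCommon.h after this patch:
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);

// Hypothetical wrapper: the decoder no longer peeks at global state, so any
// UVScale can be supplied, e.g. a per-drawcall copy instead of gstate_c.uv.
void RunDecoder(JittedVertexDecoder jitted, const u8 *src, u8 *dst, int count) {
	UVScale uv = { 1.0f, 1.0f, 0.0f, 0.0f };  // neutral scale/offset
	jitted(src, dst, count, &uv);
}
```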