
Merge pull request #17992 from unknownbrackets/x86-jit-float
x86jit: Implement trig instructions, couple other FPU
hrydgard committed Aug 28, 2023
2 parents 54ebe46 + f10444e commit 0ecfb6b
Showing 12 changed files with 290 additions and 22 deletions.
3 changes: 3 additions & 0 deletions Common/x64Emitter.cpp
@@ -1689,6 +1689,9 @@ void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp,

void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}

void XEmitter::MOVSHDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVHPfromRM, regOp, arg); }
void XEmitter::MOVSLDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVLPfromRM, regOp, arg); }

void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}

4 changes: 4 additions & 0 deletions Common/x64Emitter.h
@@ -746,6 +746,10 @@ class XEmitter
void MOVD_xmm(const OpArg &arg, X64Reg src);
void MOVQ_xmm(OpArg arg, X64Reg src);

// SSE3: Some additional moves.
void MOVSHDUP(X64Reg regOp1, OpArg arg);
void MOVSLDUP(X64Reg regOp1, OpArg arg);

// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
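The two new SSE3 moves are duplicate-shuffles: MOVSHDUP copies the odd 32-bit lanes of the source into both halves of each lane pair, MOVSLDUP the even lanes. A minimal scalar model of the lane semantics, for illustration only (not code from this PR):

#include <cstdint>

struct V4 { uint32_t lane[4]; };

// MOVSHDUP dst, src  ->  dst = { src[1], src[1], src[3], src[3] }
V4 movshdup(const V4 &src) {
	return { { src.lane[1], src.lane[1], src.lane[3], src.lane[3] } };
}

// MOVSLDUP dst, src  ->  dst = { src[0], src[0], src[2], src[2] }
V4 movsldup(const V4 &src) {
	return { { src.lane[0], src.lane[0], src.lane[2], src.lane[2] } };
}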
28 changes: 28 additions & 0 deletions Core/MIPS/IR/IRRegCache.cpp
@@ -159,6 +159,34 @@ bool IRNativeRegCacheBase::IsFPRMapped(IRReg fpr) {
return mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG;
}

// Counts how many consecutive lanes, starting at fpr, share one native
// register (0 if unmapped or not at lane 0).
int IRNativeRegCacheBase::GetFPRLaneCount(IRReg fpr) {
if (!IsFPRMapped(fpr) || mr[fpr + 32].lane > 0)
return 0;
if (mr[fpr + 32].lane == -1)
return 1;

int c = 1;
for (int i = 1; i < 4; ++i) {
if (mr[fpr + 32 + i].nReg != mr[fpr + 32].nReg || mr[fpr + 32 + i].loc != mr[fpr + 32].loc)
return c;
if (mr[fpr + 32 + i].lane != i)
return c;

c++;
}

return c;
}

int IRNativeRegCacheBase::GetFPRLane(IRReg fpr) {
_dbg_assert_(IsValidFPR(fpr));
if (mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG) {
int l = mr[fpr + 32].lane;
return l == -1 ? 0 : l;
}
return -1;
}

bool IRNativeRegCacheBase::IsGPRMappedAsPointer(IRReg gpr) {
_dbg_assert_(IsValidGPR(gpr));
if (mr[gpr].loc == MIPSLoc::REG) {
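A sketch of how a backend might combine the two new queries; EmitLoadLane0 is a hypothetical helper for illustration, not part of this change:

// Hypothetical consumer of GetFPRLane/GetFPRLaneCount (sketch only).
void EmitLoadLane0(IRNativeRegCacheBase &regs, IRReg fpr) {
	if (regs.GetFPRLaneCount(fpr) >= 1) {
		// fpr is lane 0 of its native register; it can be used in place.
	} else if (regs.GetFPRLane(fpr) > 0) {
		// Mapped, but in a higher lane; shuffle it down before scalar use.
	} else {
		// GetFPRLane returned -1: not in a native register, load from memory.
	}
}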
2 changes: 2 additions & 0 deletions Core/MIPS/IR/IRRegCache.h
@@ -163,6 +163,8 @@ class IRNativeRegCacheBase {
bool IsFPRMapped(IRReg fpr);
bool IsGPRMappedAsPointer(IRReg gpr);
bool IsGPRMappedAsStaticPointer(IRReg gpr);
int GetFPRLane(IRReg fpr);
int GetFPRLaneCount(IRReg fpr);

bool IsGPRImm(IRReg gpr);
bool IsGPR2Imm(IRReg base);
5 changes: 3 additions & 2 deletions Core/MIPS/RiscV/RiscVCompFPU.cpp
@@ -115,7 +115,7 @@ void RiscVJitBackend::CompIR_FCondAssign(IRInst inst) {
MAX(SCRATCH1, SCRATCH1, SCRATCH2);
SetJumpTarget(skipSwapCompare);
} else {
-RiscVReg isSrc1LowerReg = regs_.GetAndLockTempR();
+RiscVReg isSrc1LowerReg = regs_.GetAndLockTempGPR();
SLT(isSrc1LowerReg, SCRATCH1, SCRATCH2);
// Flip the flag (to reverse the min/max) based on if both were negative.
XOR(isSrc1LowerReg, isSrc1LowerReg, R_RA);
@@ -578,9 +578,10 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
#error Currently hard float is required.
#endif

-auto callFuncF_F = [&](float (*func)(float)){
+auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
FMV(32, F10, regs_.F(inst.src1));
} else {
4 changes: 2 additions & 2 deletions Core/MIPS/RiscV/RiscVRegCache.cpp
@@ -235,15 +235,15 @@ RiscVReg RiscVRegCache::TryMapTempImm(IRReg r) {
return INVALID_REG;
}

-RiscVReg RiscVRegCache::GetAndLockTempR() {
+RiscVReg RiscVRegCache::GetAndLockTempGPR() {
RiscVReg reg = (RiscVReg)AllocateReg(MIPSLoc::REG, MIPSMap::INIT);
if (reg != INVALID_REG) {
nr[reg].tempLockIRIndex = irIndex_;
}
return reg;
}

-RiscVReg RiscVRegCache::MapWithFPRTemp(IRInst &inst) {
+RiscVReg RiscVRegCache::MapWithFPRTemp(const IRInst &inst) {
return (RiscVReg)MapWithTemp(inst, MIPSLoc::FREG);
}

4 changes: 2 additions & 2 deletions Core/MIPS/RiscV/RiscVRegCache.h
@@ -51,7 +51,7 @@ class RiscVRegCache : public IRNativeRegCacheBase {
RiscVGen::RiscVReg MapGPRAsPointer(IRReg reg);
RiscVGen::RiscVReg MapFPR(IRReg reg, MIPSMap mapFlags = MIPSMap::INIT);

-RiscVGen::RiscVReg MapWithFPRTemp(IRInst &inst);
+RiscVGen::RiscVReg MapWithFPRTemp(const IRInst &inst);

bool IsNormalized32(IRReg reg);

@@ -60,7 +60,7 @@ class RiscVRegCache : public IRNativeRegCacheBase {

void FlushBeforeCall();

-RiscVGen::RiscVReg GetAndLockTempR();
+RiscVGen::RiscVReg GetAndLockTempGPR();

RiscVGen::RiscVReg R(IRReg preg); // Returns a cached register, while checking that it's NOT mapped as a pointer
RiscVGen::RiscVReg RPtr(IRReg preg); // Returns a cached register, if it has been mapped as a pointer
203 changes: 199 additions & 4 deletions Core/MIPS/x86/X64IRCompFPU.cpp
@@ -45,6 +45,7 @@ alignas(16) const u32 reverseQNAN[4] = { 0x803FFFFF, 0x803FFFFF, 0x803FFFFF, 0x8
alignas(16) const u32 noSignMask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
alignas(16) const u32 positiveInfinity[4] = { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 };
alignas(16) const u32 signBitAll[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
alignas(16) const u32 ones[4] = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 };
} simdConstants;
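// 0x3F800000 is the IEEE-754 bit pattern of 1.0f, so `ones` above is 1.0f in
// all four lanes. (C++20 check, as a sketch:
// static_assert(std::bit_cast<uint32_t>(1.0f) == 0x3F800000u);)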

void X64JitBackend::CompIR_FArith(IRInst inst) {
@@ -298,7 +299,22 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
break;

case IROp::FCmovVfpuCC:
-CompIR_Generic(inst);
regs_.MapWithExtra(inst, { { 'G', IRREG_VFPU_CC, 1, MIPSMap::INIT } });
if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC))) {
TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(1 << (inst.src2 & 7)));
} else {
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(1 << (inst.src2 & 7)));
}

if ((inst.src2 >> 7) & 1) {
FixupBranch skip = J_CC(CC_Z);
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
SetJumpTarget(skip);
} else {
FixupBranch skip = J_CC(CC_NZ);
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
SetJumpTarget(skip);
}
break;
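// The FCmovVfpuCC case above, restated as scalar pseudocode (sketch only):
//   bool bit = (vfpuCC >> (inst.src2 & 7)) & 1;  // the TEST
//   bool wantSet = ((inst.src2 >> 7) & 1) != 0;  // picks CC_Z vs CC_NZ skip
//   if (bit == wantSet)
//     f[inst.dest] = f[inst.src1];               // the guarded MOVAPS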

case IROp::FCmpVfpuBit:
@@ -462,7 +478,7 @@ void X64JitBackend::CompIR_FCondAssign(IRInst inst) {
X64Reg tempReg = INVALID_REG;
switch (inst.op) {
case IROp::FMin:
-tempReg = regs_.GetAndLockTempR();
+tempReg = regs_.GetAndLockTempGPR();
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));
skipNAN = J_CC(CC_NP, true);
@@ -499,7 +515,7 @@
break;

case IROp::FMax:
-tempReg = regs_.GetAndLockTempR();
+tempReg = regs_.GetAndLockTempGPR();
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src1));
skipNAN = J_CC(CC_NP, true);
@@ -546,7 +562,14 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {

switch (inst.op) {
case IROp::FCvtWS:
CompIR_Generic(inst);
break;

case IROp::FCvtSW:
regs_.Map(inst);
CVTDQ2PS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
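// FCvtSW above is a plain signed int32 -> float conversion: per lane,
// CVTDQ2PS computes (float)w, rounding per MXCSR once |w| exceeds the
// 24 bits a single-precision float can represent exactly.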

case IROp::FCvtScaledWS:
case IROp::FCvtScaledSW:
CompIR_Generic(inst);
@@ -563,7 +586,41 @@ void X64JitBackend::CompIR_FRound(IRInst inst) {

switch (inst.op) {
case IROp::FRound:
CompIR_Generic(inst);
break;

case IROp::FTrunc:
{
regs_.SpillLockFPR(inst.dest, inst.src1);
X64Reg tempZero = regs_.GetAndLockTempFPR();
regs_.Map(inst);

CVTTSS2SI(SCRATCH1, regs_.F(inst.src1));

// Did we get an indefinite integer value?
CMP(32, R(SCRATCH1), Imm32(0x80000000));
FixupBranch wasExact = J_CC(CC_NE);

XORPS(tempZero, R(tempZero));
if (inst.dest == inst.src1) {
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
} else if (cpu_info.bAVX) {
VCMPSS(regs_.FX(inst.dest), regs_.FX(inst.src1), R(tempZero), CMP_LT);
} else {
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
}

// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
// We want -inf to be 0x80000000 and inf/nan to be 0x7fffffff, so we flip those bits.
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.dest));
XOR(32, R(SCRATCH1), Imm32(0x7fffffff));

SetJumpTarget(wasExact);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
break;
}
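// Scalar model of the FTrunc fixup above (sketch; cvttss2si stands in for
// the CVTTSS2SI instruction):
//   int32_t v = cvttss2si(f);                       // 0x80000000 on overflow/NaN
//   if (v == (int32_t)0x80000000) {                 // indefinite integer?
//     uint32_t mask = (f < 0.0f) ? 0xFFFFFFFF : 0;  // the CMPSS LT vs zero
//     v = (int32_t)(mask ^ 0x7FFFFFFF);             // -inf -> 0x80000000, +inf/NaN -> 0x7FFFFFFF
//   }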

case IROp::FCeil:
case IROp::FFloor:
CompIR_Generic(inst);
Expand All @@ -590,16 +647,154 @@ void X64JitBackend::CompIR_FSat(IRInst inst) {
}
}

#if X64JIT_USE_XMM_CALL
static float X64JIT_XMM_CALL x64_sin(float f) {
return vfpu_sin(f);
}

static float X64JIT_XMM_CALL x64_cos(float f) {
return vfpu_cos(f);
}

static float X64JIT_XMM_CALL x64_asin(float f) {
return vfpu_asin(f);
}
#else
static uint32_t x64_sin(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_sin(f);
memcpy(&v, &f, sizeof(v));
return v;
}

static uint32_t x64_cos(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_cos(f);
memcpy(&v, &f, sizeof(v));
return v;
}

static uint32_t x64_asin(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_asin(f);
memcpy(&v, &f, sizeof(v));
return v;
}
#endif
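// The #else wrappers above pass the float's bit pattern through a GPR when
// the XMM calling convention isn't available. In C++20 the memcpy round-trip
// could be written with std::bit_cast (equivalent sketch, not part of the PR):
//   static uint32_t x64_sin(uint32_t v) {
//     return std::bit_cast<uint32_t>(vfpu_sin(std::bit_cast<float>(v)));
//   }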

void X64JitBackend::CompIR_FSpecial(IRInst inst) {
CONDITIONAL_DISABLE;

// TODO: Regcache... maybe emitter helper too?
auto laneToReg0 = [&](X64Reg dest, X64Reg src, int lane) {
if (lane == 0) {
if (dest != src)
MOVAPS(dest, R(src));
} else if (lane == 1 && cpu_info.bSSE3) {
MOVSHDUP(dest, R(src));
} else if (lane == 2) {
MOVHLPS(dest, src);
} else if (cpu_info.bAVX) {
VPERMILPS(128, dest, R(src), VFPU_SWIZZLE(lane, lane, lane, lane));
} else {
if (dest != src)
MOVAPS(dest, R(src));
SHUFPS(dest, R(dest), VFPU_SWIZZLE(lane, lane, lane, lane));
}
};

auto callFuncF_F = [&](const void *func) {
regs_.FlushBeforeCall();

#if X64JIT_USE_XMM_CALL
if (regs_.IsFPRMapped(inst.src1)) {
int lane = regs_.GetFPRLane(inst.src1);
laneToReg0(XMM0, regs_.FX(inst.src1), lane);
} else {
// Account for CTXREG being increased by 128 to reduce imm sizes.
int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;
MOVSS(XMM0, MDisp(CTXREG, offset));
}
ABI_CallFunction((const void *)func);

// It's already in place, NOINIT won't modify.
regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0);
#else
if (regs_.IsFPRMapped(inst.src1)) {
int lane = regs_.GetFPRLane(inst.src1);
if (lane == 0) {
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src1));
} else {
laneToReg0(XMM0, regs_.FX(inst.src1), lane);
MOVD_xmm(R(SCRATCH1), XMM0);
}
} else {
// Account for CTXREG being increased by 128 to reduce imm sizes.
int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;
MOV(32, R(SCRATCH1), MDisp(CTXREG, offset));
}
ABI_CallFunctionR((const void *)func, SCRATCH1);

regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
#endif
};

switch (inst.op) {
case IROp::FSin:
callFuncF_F((const void *)&x64_sin);
break;

case IROp::FCos:
callFuncF_F((const void *)&x64_cos);
break;

case IROp::FRSqrt:
{
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
SQRTSS(tempReg, regs_.F(inst.src1));

if (RipAccessible(&simdConstants.ones)) {
MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
}
DIVSS(regs_.FX(inst.dest), R(tempReg));
break;
}
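// Scalar equivalent of the FRSqrt case above (sketch): 1.0f / sqrtf(x).
// It uses an exact SQRTSS + DIVSS rather than the approximate RSQRTSS,
// preserving full single-precision accuracy.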

case IROp::FRecip:
if (inst.dest != inst.src1) {
regs_.Map(inst);
if (RipAccessible(&simdConstants.ones)) {
MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
}
DIVSS(regs_.FX(inst.dest), regs_.F(inst.src1));
} else {
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
if (RipAccessible(&simdConstants.ones)) {
MOVSS(tempReg, M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(tempReg, MatR(SCRATCH1));
}
if (cpu_info.bAVX) {
VDIVSS(regs_.FX(inst.dest), tempReg, regs_.F(inst.src1));
} else {
DIVSS(tempReg, regs_.F(inst.src1));
MOVSS(regs_.FX(inst.dest), R(tempReg));
}
}
break;
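// Scalar equivalent of the FRecip case above (sketch): 1.0f / x, via exact
// DIVSS rather than the approximate RCPSS. The AVX path uses three-operand
// VDIVSS to write inst.dest directly, skipping the final MOVSS.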

case IROp::FAsin:
-CompIR_Generic(inst);
+callFuncF_F((const void *)&x64_asin);
break;

default:
