From c8f888fab0942312d01eb587ddf2a46ce8d48b2b Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 5 Sep 2023 23:47:52 -0700 Subject: [PATCH 1/4] arm64jit: Implement FMin/FMax. --- Common/Arm64Emitter.cpp | 61 ++++++++++++++++++++++++++++++ Common/Arm64Emitter.h | 14 +++++++ Core/MIPS/ARM64/Arm64IRCompFPU.cpp | 39 ++++++++++++++++++- 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp index 2192a8a6e690..6b3221e76f0c 100644 --- a/Common/Arm64Emitter.cpp +++ b/Common/Arm64Emitter.cpp @@ -3011,6 +3011,12 @@ void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(1, 1, 3, Rd, Rn, Rm); } +void ARM64FloatEmitter::BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 2, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 3, 3, Rd, Rn, Rm); +} void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { u32 imm5 = 0; @@ -3184,6 +3190,61 @@ void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) Emit2RegMisc(true, 0, dest_size >> 4, 0x12, Rd, Rn); } +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(true, size >> 4, 0b10001, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(false, size >> 4, 0b00111, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(false, size >> 4, 0b00110, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(true, size >> 4, 0b00110, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(true, size >> 4, 0b00111, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + EmitThreeSame(false, size >> 4, 0b10001, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01001, Rd, Rn); +} + +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01000, Rd, Rn); +} + +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01000, Rd, Rn); +} + +void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01001, Rd, Rn); +} + +void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) 
{ + _assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__); + Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01010, Rd, Rn); +} + // Move void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) { diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h index 70a7253c28c0..f800e161379f 100644 --- a/Common/Arm64Emitter.h +++ b/Common/Arm64Emitter.h @@ -851,6 +851,8 @@ class ARM64FloatEmitter void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -894,6 +896,18 @@ class ARM64FloatEmitter void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + // Move void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); diff --git a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp index 27f7b40a8256..74d7bfe8df07 100644 --- a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp @@ -143,16 +143,53 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) { void Arm64JitBackend::CompIR_FCondAssign(IRInst inst) { CONDITIONAL_DISABLE; + // For Vec4, we could basically just ORR FCMPGE/FCMPLE together, but overlap is trickier. + regs_.Map(inst); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + FixupBranch unordered = B(CC_VS); + switch (inst.op) { case IROp::FMin: + fp_.FMIN(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2)); + break; + case IROp::FMax: - CompIR_Generic(inst); + fp_.FMAX(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2)); break; default: INVALIDOP; break; } + + FixupBranch orderedDone = B(); + + // Not sure if this path is fast, trying to optimize it to be small but correct. + // Probably an uncommon path. + SetJumpTarget(unordered); + fp_.AND(EncodeRegToDouble(SCRATCHF1), regs_.FD(inst.src1), regs_.FD(inst.src2)); + // SCRATCHF1 = 0xFFFFFFFF if sign bit set on both, 0x00000000 otherwise. + fp_.CMLT(32, EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF1)); + + switch (inst.op) { + case IROp::FMin: + fp_.SMAX(32, EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2)); + fp_.SMIN(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2)); + break; + + case IROp::FMax: + fp_.SMIN(32, EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2)); + fp_.SMAX(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2)); + break; + + default: + INVALIDOP; + break; + } + // Replace dest with SCRATCHF2 if both were less than zero. 
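+	// BIT keeps each dest bit where the SCRATCHF1 mask bit is 0 and takes SCRATCHF2's bit where it is 1,
+	// i.e. dest = (dest & ~SCRATCHF1) | (SCRATCHF2 & SCRATCHF1). So in lanes where both inputs had the
+	// sign bit set, the min/max picked on the raw bit patterns above is swapped for the opposite extreme.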
+ fp_.BIT(regs_.FD(inst.dest), EncodeRegToDouble(SCRATCHF2), EncodeRegToDouble(SCRATCHF1)); + + SetJumpTarget(orderedDone); } void Arm64JitBackend::CompIR_FCvt(IRInst inst) { From 97d9a7f07fa07fd472755f644d7c5f9ca4b509a9 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 5 Sep 2023 23:48:33 -0700 Subject: [PATCH 2/4] arm64jit: Implement FCmp. --- Core/MIPS/ARM64/Arm64IRCompFPU.cpp | 42 +++++++++++++++++++++++++++++- Core/MIPS/RiscV/RiscVCompFPU.cpp | 3 +++ Core/MIPS/x86/X64IRCompFPU.cpp | 3 +++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp index 74d7bfe8df07..c46d41ccac06 100644 --- a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp @@ -116,15 +116,55 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) { case IROp::FCmp: switch (inst.dest) { case IRFpCompareMode::False: + regs_.SetGPRImm(IRREG_FPCOND, 0); + break; + case IRFpCompareMode::EitherUnordered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_VS); + break; + case IRFpCompareMode::EqualOrdered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_EQ); + break; + case IRFpCompareMode::EqualUnordered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_EQ); + // If ordered, use the above result. If unordered, use ZR+1 (being 1.) + CSINC(regs_.R(IRREG_FPCOND), regs_.R(IRREG_FPCOND), WZR, CC_VC); + break; + case IRFpCompareMode::LessEqualOrdered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_LS); + break; + case IRFpCompareMode::LessEqualUnordered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_LE); + break; + case IRFpCompareMode::LessOrdered: + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_LO); + break; + case IRFpCompareMode::LessUnordered: - CompIR_Generic(inst); + regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } }); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2)); + CSET(regs_.R(IRREG_FPCOND), CC_LT); break; + + default: + _assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest); } break; diff --git a/Core/MIPS/RiscV/RiscVCompFPU.cpp b/Core/MIPS/RiscV/RiscVCompFPU.cpp index 00c4ff866108..a0c3445ba65a 100644 --- a/Core/MIPS/RiscV/RiscVCompFPU.cpp +++ b/Core/MIPS/RiscV/RiscVCompFPU.cpp @@ -405,6 +405,9 @@ void RiscVJitBackend::CompIR_FCompare(IRInst inst) { SEQZ(regs_.R(IRREG_FPCOND), regs_.R(IRREG_FPCOND)); regs_.MarkGPRDirty(IRREG_FPCOND, true); break; + + default: + _assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest); } break; diff --git a/Core/MIPS/x86/X64IRCompFPU.cpp b/Core/MIPS/x86/X64IRCompFPU.cpp index aae5900a3ac8..644dff7138ba 100644 --- a/Core/MIPS/x86/X64IRCompFPU.cpp +++ b/Core/MIPS/x86/X64IRCompFPU.cpp @@ -322,6 +322,9 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) { // B/CF = LESS THAN or UNORDERED. 
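		// (Unordered operands set ZF/PF/CF all to 1 on x86, so CF alone covers both the ordered
		// less-than case and the unordered case needed here.)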
ccToFpcond(inst.src1, inst.src2, CC_B); break; + + default: + _assert_msg_(false, "Unexpected IRFpCompareMode %d", inst.dest); } break; From 89a9584c3860bcf74f534fe7e5e4547a04b32552 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 5 Sep 2023 23:49:35 -0700 Subject: [PATCH 3/4] arm64jit: Implement FRound/similar. --- Core/MIPS/ARM64/Arm64IRAsm.cpp | 7 +++---- Core/MIPS/ARM64/Arm64IRCompFPU.cpp | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/ARM64/Arm64IRAsm.cpp b/Core/MIPS/ARM64/Arm64IRAsm.cpp index a7c97293c874..42bee863a22a 100644 --- a/Core/MIPS/ARM64/Arm64IRAsm.cpp +++ b/Core/MIPS/ARM64/Arm64IRAsm.cpp @@ -240,12 +240,11 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) { for (size_t i = 0; i < ARRAY_SIZE(roundModes); ++i) { convertS0ToSCRATCH1_[i] = AlignCode16(); + // Invert 0x80000000 -> 0x7FFFFFFF for the NAN result. + fp_.MVNI(32, EncodeRegToDouble(SCRATCHF2), 0x80, 24); fp_.FCMP(S0, S0); // Detect NaN fp_.FCVTS(S0, S0, roundModes[i]); - FixupBranch skip = B(CC_VC); - MOVI2R(SCRATCH2, 0x7FFFFFFF); - fp_.FMOV(S0, SCRATCH2); - SetJumpTarget(skip); + fp_.FCSEL(S0, S0, SCRATCHF2, CC_VC); RET(); } diff --git a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp index c46d41ccac06..5e38e9a24bca 100644 --- a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp @@ -252,18 +252,36 @@ void Arm64JitBackend::CompIR_FCvt(IRInst inst) { void Arm64JitBackend::CompIR_FRound(IRInst inst) { CONDITIONAL_DISABLE; + regs_.Map(inst); + // Invert 0x80000000 -> 0x7FFFFFFF for the NAN result. + fp_.MVNI(32, EncodeRegToDouble(SCRATCHF1), 0x80, 24); + fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src1)); + + // Luckily, these already saturate. switch (inst.op) { case IROp::FRound: + fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_N); + break; + case IROp::FTrunc: + fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_Z); + break; + case IROp::FCeil: + fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_P); + break; + case IROp::FFloor: - CompIR_Generic(inst); + fp_.FCVTS(regs_.F(inst.dest), regs_.F(inst.src1), ROUND_M); break; default: INVALIDOP; break; } + + // Switch to INT_MAX if it was NAN. + fp_.FCSEL(regs_.F(inst.dest), regs_.F(inst.dest), SCRATCHF1, CC_VC); } void Arm64JitBackend::CompIR_FSat(IRInst inst) { From 01ed48a3d0b967d9dad547f28d8b8232791ced2b Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 5 Sep 2023 23:54:08 -0700 Subject: [PATCH 4/4] arm64jit: Implement FCvtSW. --- Core/MIPS/ARM64/Arm64IRCompFPU.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp index 5e38e9a24bca..5e5b171747b0 100644 --- a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp @@ -237,7 +237,14 @@ void Arm64JitBackend::CompIR_FCvt(IRInst inst) { switch (inst.op) { case IROp::FCvtWS: + CompIR_Generic(inst); + break; + case IROp::FCvtSW: + regs_.Map(inst); + fp_.SCVTF(regs_.F(inst.dest), regs_.F(inst.src1)); + break; + case IROp::FCvtScaledWS: case IROp::FCvtScaledSW: CompIR_Generic(inst);
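
For reference, a minimal scalar sketch of the conversion behavior the FRound/FTrunc/FCeil/FFloor lowering above aims for. The helper name is hypothetical and std::nearbyint stands in for whichever rounding mode the emitted FCVT*S uses; this is only a model of the patch, not code from it.

#include <cmath>
#include <cstdint>

// AArch64 FCVTNS/FCVTZS/FCVTPS/FCVTMS already saturate to INT32_MIN/INT32_MAX, but convert NaN
// to 0, so the emitted FCMP + FCSEL swaps in 0x7FFFFFFF (built via MVNI) whenever the input was NaN.
static int32_t RoundToIntModel(float f) {
	if (std::isnan(f))
		return 0x7FFFFFFF;
	double r = std::nearbyint((double)f);  // placeholder for the selected rounding mode
	if (r >= 2147483647.0)
		return INT32_MAX;
	if (r < -2147483648.0)
		return INT32_MIN;
	return (int32_t)r;
}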