diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index 4f66b9b12995..7cace77e899e 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -3684,6 +3684,14 @@ void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
 {
 	USHLL(src_size, Rd, Rn, shift, true);
 }
+// SHLL: widens the src_size-bit elements in the low half of Rn, shifting each left by src_size.
+void ARM64FloatEmitter::SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) {
+	SHLL(src_size, Rd, Rn, false);
+}
+// SHLL2: same operation as SHLL, but on the elements in the upper half of Rn.
+void ARM64FloatEmitter::SHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) {
+	SHLL(src_size, Rd, Rn, true);
+}
 void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
 {
 	SXTL(src_size, Rd, Rn, false);
@@ -3723,6 +3731,13 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
 	EmitShiftImm(upper, 1, imm >> 3, imm & 7, 0x14, Rd, Rn);
 }
 
+// Shared encoder for SHLL (upper == false) and SHLL2 (upper == true).
+void ARM64FloatEmitter::SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) {
+	_assert_msg_(src_size == 8 || src_size == 16 || src_size == 32, "%s source size must be 8, 16, or 32", __FUNCTION__);
+	// src_size >> 4 maps 8/16/32 to 0/1/2; any other value would encode the wrong element size.
+	Emit2RegMisc(upper, 1, src_size >> 4, 0b10011, Rd, Rn);
+}
+
 void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
 {
 	_assert_msg_(shift > 0, "%s shift amount must be greater than zero!", __FUNCTION__);
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index c8672583b1e1..3173714f4c27 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -976,6 +976,9 @@ class ARM64FloatEmitter
 	void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
 	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
 	void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+	// Shift == src_size for these, so there is no shift parameter. src_size must be 8, 16, or 32.
+	void SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+	void SHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
 	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
 	void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
 	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
@@ -1034,6 +1037,7 @@ class ARM64FloatEmitter
 
 	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
 	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+	void SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
 	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
 	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
 	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
diff --git a/Core/MIPS/ARM64/Arm64IRCompVec.cpp b/Core/MIPS/ARM64/Arm64IRCompVec.cpp
index 8f80af9af7df..9b6325a3d086 100644
--- a/Core/MIPS/ARM64/Arm64IRCompVec.cpp
+++ b/Core/MIPS/ARM64/Arm64IRCompVec.cpp
@@ -713,9 +713,57 @@ void Arm64JitBackend::CompIR_VecPack(IRInst inst) {
 		break;
 
 	case IROp::Vec2Unpack16To31:
-	case IROp::Vec4Unpack8To32:
+		// Viewed as 16-bit: ABxx -> 0A0B, then shift a zero into the sign place.
+		if (Overlap(inst.dest, 2, inst.src1, 1)) {
+			regs_.MapVec2(inst.dest, MIPSMap::DIRTY);
+		} else {
+			regs_.Map(inst);
+		}
+		// NOTE(review): USHLL2 reads the upper 64 bits of its source; confirm src1 == dest + 1 lands there and not in bits 32-63.
+		if (inst.src1 == inst.dest + 1) {
+			fp_.USHLL2(16, regs_.FQ(inst.dest), regs_.FD(inst.src1), 15);
+		} else {
+			fp_.USHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.src1), 15);
+		}
+		break;
+
 	case IROp::Vec2Unpack16To32:
-		CompIR_Generic(inst);
+		// Same widening as Vec2Unpack16To31, but without shifting a zero into the sign place.
+		if (Overlap(inst.dest, 2, inst.src1, 1)) {
+			regs_.MapVec2(inst.dest, MIPSMap::DIRTY);
+		} else {
+			regs_.Map(inst);
+		}
+		// NOTE(review): SHLL2 reads the upper 64 bits of its source; confirm src1 == dest + 1 lands there and not in bits 32-63.
+		if (inst.src1 == inst.dest + 1) {
+			fp_.SHLL2(16, regs_.FQ(inst.dest), regs_.FD(inst.src1));
+		} else {
+			fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.src1));
+		}
+		break;
+
+	case IROp::Vec4Unpack8To32:
+		// Viewed as 8-bit: ABCD -> 000A000B000C000D.
+		if (Overlap(inst.dest, 4, inst.src1, 1)) {
+			regs_.MapVec4(inst.dest, MIPSMap::DIRTY);
+			// Assuming Overlap() means src1 is within dest..dest + 3, "dest == src1 + 2" could never match.
+			if (inst.src1 == inst.dest + 2) {
+				// Third lane: the source already starts the upper half, which is what SHLL2 reads.
+				fp_.SHLL2(8, regs_.FQ(inst.dest), regs_.FD(inst.src1 & ~3));
+			} else if (inst.dest != inst.src1) {
+				// Any other non-first lane: copy the source lane across dest so it sits in the low half.
+				fp_.DUP(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), inst.src1 & 3);
+				fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.dest));
+			} else {
+				fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.src1));
+			}
+			fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.dest));
+		} else {
+			regs_.Map(inst);
+			// Two steps: ABCD -> 0A0B0C0D, then to 000A000B000C000D.
+			fp_.SHLL(8, regs_.FQ(inst.dest), regs_.FD(inst.src1));
+			fp_.SHLL(16, regs_.FQ(inst.dest), regs_.FD(inst.dest));
+		}
 		break;
 
 	default: