Permalink
Browse files

Add some more SIMD support to IR interpreter. Mostly just because, bu…

…t also serves as implementation reference for later code generation backends.
  • Loading branch information...
hrydgard committed Jan 4, 2018
1 parent a128624 commit e0cc126d09390522ae30b5b8413e3490f00a4258
Showing with 46 additions and 4 deletions.
  1. +46 −4 Core/MIPS/IR/IRInterpreter.cpp
@@ -1,11 +1,16 @@
#include <algorithm>
#include <cmath>
#include "ppsspp_config.h"
#include "math/math_util.h"
#include "Common/Common.h"
#ifdef _M_SSE
#include <emmintrin.h>
#include <nmmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#include <arm_neon.h>
#endif
#include "Core/Core.h"
@@ -40,6 +45,10 @@ alignas(16) static const uint32_t noSignMask[4] = {
0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
};
alignas(16) static const uint32_t lowBytesMask[4] = {
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
};
u32 RunBreakpoint(u32 pc) {
// Should we skip this breakpoint?
if (CBreakPoints::CheckSkipFirst() == pc)
@@ -58,6 +67,7 @@ u32 RunMemCheck(u32 pc, u32 addr) {
return coreState != CORE_RUNNING ? 1 : 0;
}
// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64.
u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
const IRInst *end = inst + count;
while (inst != end) {
@@ -185,8 +195,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::Vec4Shuffle:
{
// Can't use the SSE shuffle here because it takes an immediate.
// Backends with SSE support could use that though.
// Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though,
// or a big switch - there are only 256 shuffles possible (4^4)
for (int i = 0; i < 4; i++)
mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)];
break;
@@ -195,6 +205,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::Vec4Mov:
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
#elif PPSSPP_CONFIG(ARM64)
float32x4_t c = vld1q_f32(&mips->f[inst->src1]);
vst1q_f32(&mips->f[inst->dest], c);
#else
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
#endif
@@ -274,10 +287,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
break;
case IROp::Vec4Unpack8To32:
#if defined(_M_SSE)
__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);
src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
src = _mm_unpacklo_epi32(src, _mm_setzero_si128());
_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));
#else
mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);
mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000;
mips->fi[inst->dest + 2] = (mips->fi[inst->src1] << 8) & 0xFF000000;
mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000;
#endif
break;
case IROp::Vec2Pack32To16:
@@ -297,21 +317,36 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::Vec4Pack32To8:
{
#if defined(_M_SSE)
// Packs the upper bits, so we need to shift down. Then we can just use SSE packing.
__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 24);
val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
#else
u32 val = mips->fi[inst->src1] >> 24;
val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00;
val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000;
val |= (mips->fi[inst->src1 + 3]) & 0xFF000000;
mips->fi[inst->dest] = val;
break;
#endif
}
case IROp::Vec4Pack31To8:
{
#if defined(_M_SSE)
// Packs the upper bits (offset by 1), so we need to shift down and mask. Then we can just use SSE packing.
__m128i val = _mm_srli_epi32(_mm_load_si128((const __m128i *)&mips->fi[inst->src1]), 23);
val = _mm_and_si128(val, _mm_load_si128((const __m128i *)&lowBytesMask));
val = _mm_packs_epi16(_mm_packs_epi32(val, _mm_setzero_si128()), _mm_setzero_si128());
mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
#else
u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
mips->fi[inst->dest] = val;
#endif
break;
}
@@ -326,14 +361,21 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::Vec4ClampToZero:
{
#if 0 && defined(_M_SSE)
// This is SSE4 only unfortunately, so only suitable for JIT, hence disabled above.
__m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]);
val = _mm_max_epi32(val, _mm_setzero_si128());
mips->fi[inst->dest] = _mm_cvtsi128_si32(val);
#else
for (int i = 0; i < 4; i++) {
u32 val = mips->fi[inst->src1 + i];
mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0;
}
break;
#endif
}
case IROp::Vec4DuplicateUpperBitsAndShift1:
case IROp::Vec4DuplicateUpperBitsAndShift1: // For vuc2i, the weird one.
for (int i = 0; i < 4; i++) {
u32 val = mips->fi[inst->src1 + i];
val = val | (val >> 8);

0 comments on commit e0cc126

Please sign in to comment.