Merge pull request #18450 from hrydgard/neon-arm32
Enable some NEON optimizations on ARM32 that we only had on ARM64 before
hrydgard committed Nov 27, 2023
2 parents d58f826 + 4ec2d76 commit 8ad0ef6
Showing 4 changed files with 77 additions and 29 deletions.
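A note on the mechanics: PPSSPP_ARCH(X) is a function-like macro that token-pastes its argument onto PPSSPP_ARCH_, so each guard below tests the matching configuration define. A minimal sketch of the mechanism (the real definition sits near the top of ppsspp_config.h):

    // PPSSPP_ARCH(ARM_NEON) expands to (PPSSPP_ARCH_ARM_NEON); an undefined
    // PPSSPP_ARCH_* name evaluates to 0 inside #if/#elif.
    #define PPSSPP_ARCH(PPSSPP_FEATURE) (PPSSPP_ARCH_##PPSSPP_FEATURE)

Re-guarding a path from PPSSPP_ARCH(ARM64_NEON) to PPSSPP_ARCH(ARM_NEON) therefore widens it to every target that defines PPSSPP_ARCH_ARM_NEON, which this PR makes include 32-bit ARM.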
12 changes: 6 additions & 6 deletions Core/MIPS/IR/IRInterpreter.cpp
@@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
#else
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
@@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)
@@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)
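All of the interpreter fast paths above use only intrinsics that ARMv7 NEON and AArch64 share (vld1q_f32, vst1q_f32, vaddq_f32, vsubq_f32, vmulq_f32, vnegq_f32, vabsq_f32), so widening the guard needs no source changes. A standalone smoke test, hypothetical and not part of the commit, that builds on both:

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
        float a[4] = {1.0f, -2.0f, 3.0f, -4.0f};
        float out[4];
        // Same intrinsics the IRInterpreter Vec4 ops rely on.
        vst1q_f32(out, vabsq_f32(vld1q_f32(a)));
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
        return 0;
    }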
2 changes: 1 addition & 1 deletion GPU/GPUCommon.cpp
@@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
}

void GPUCommon::UpdateUVScaleOffset() {
-#ifdef _M_SSE
+#if defined(_M_SSE)
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_ARCH(ARM_NEON)
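The GPUCommon.cpp change is purely stylistic: for a single macro, #ifdef _M_SSE and #if defined(_M_SSE) are equivalent, but only the defined() form keeps the chain uniform with the #elif below it and composes into compound conditions, e.g. (FORCE_SCALAR_PATH is a made-up flag for illustration):

    #if defined(_M_SSE) && !defined(FORCE_SCALAR_PATH)
        // SSE path
    #elif PPSSPP_ARCH(ARM_NEON)
        // NEON path
    #endif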
90 changes: 69 additions & 21 deletions GPU/Math3D.h
@@ -219,7 +219,7 @@ class Vec3
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
@@ -238,7 +238,7 @@ class Vec3
Vec3(const Vec3Packed<T> &_xyz) {
vec = _mm_loadu_ps(_xyz.AsArray());
}
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
@@ -578,7 +578,7 @@ class Vec4
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
@@ -595,7 +595,7 @@ class Vec4
#if defined(_M_SSE)
Vec4(const __m128 &_vec) : vec(_vec) {}
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
@@ -607,14 +607,14 @@ class Vec4
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
#if defined(_M_SSE)
return _mm_cvtps_epi32(SAFE_M128(vec));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_s32_f32(vec);
#endif
}
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
#if defined(_M_SSE)
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_f32_s32(ivec);
#endif
}
@@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
_mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum;
}
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3);
@@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
}
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
+float32x4_t col0 = vld1q_f32(m);
+float32x4_t col1 = vld1q_f32(m + 3);
+float32x4_t col2 = vld1q_f32(m + 6);
+float32x4_t col3 = vld1q_f32(m + 9);
+float32x4_t sum = vaddq_f32(
+vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
+return sum;
+}
#endif
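This split is the heart of the PR: vmulq_laneq_f32, which multiplies by a chosen lane of a full 128-bit vector, exists only on AArch64. ARMv7 NEON offers vmulq_lane_f32, which selects lanes from a 64-bit float32x2_t, so the 32-bit path splits the vector with vget_low_f32/vget_high_f32 first. The two spellings compute the same product; a sketch of the correspondence (hypothetical helper name):

    #include <arm_neon.h>

    // Multiply every lane of col by lane 2 of v.
    static inline float32x4_t MulByLane2(float32x4_t col, float32x4_t v) {
    #if defined(__aarch64__)
        return vmulq_laneq_f32(col, v, 2);                // lane 0..3 of a q-register
    #else
        return vmulq_lane_f32(col, vget_high_f32(v), 0);  // lane 0..1 of a d-register
    #endif
    }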

// v and vecOut must point to different memory.
@@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
@@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;
@@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
}
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
+float32x4_t col0 = vld1q_f32(m);
+float32x4_t col1 = vld1q_f32(m + 4);
+float32x4_t col2 = vld1q_f32(m + 8);
+float32x4_t col3 = vld1q_f32(m + 12);
+float32x4_t sum = vaddq_f32(
+vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
+return sum;
+}
#endif
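A quick sanity check of the convention (hypothetical snippet, not from the commit): the matrix is column-major and the input vector is implicitly extended with w = 1, so the translation column is added exactly once.

    const float m[16] = {1, 0, 0, 0,   // col0
                         0, 1, 0, 0,   // col1
                         0, 0, 1, 0,   // col2
                         5, 6, 7, 1};  // col3: translation
    const float v[3] = {1.0f, 2.0f, 3.0f};
    float out[4];
    Vec3ByMatrix44(out, v, m);  // out == {6, 8, 10, 1}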

inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
@@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
__m128 z = _mm_set1_ps(v[2]);
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
_mm_storeu_ps(vecOut, sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
vst1q_f32(vecOut, sum);
@@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix44Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix44Internal(v.vec, m);
#else
Vec4f vecOut;
@@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vmulq_laneq_f32(col2, vec, 2));
return sum;
}
+#elif PPSSPP_ARCH(ARM_NEON)
+inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
+float32x4_t col0 = vld1q_f32(m);
+float32x4_t col1 = vld1q_f32(m + 3);
+float32x4_t col2 = vld1q_f32(m + 6);
+float32x4_t sum = vaddq_f32(
+vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
+vmulq_lane_f32(col2, vget_high_f32(vec), 0));
+return sum;
+}
#endif
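Note that Norm3ByMatrix43Internal has no col3 term: normals transform with w = 0, so the translation part of the 4x3 matrix must not contribute. The scalar equivalent of the path above:

    for (int i = 0; i < 3; i++)
        vecOut[i] = v[0] * m[i] + v[1] * m[i + 3] + v[2] * m[i + 6];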

inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
Expand All @@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
vecOut[1] = vgetq_lane_f32(sum, 1);
@@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Norm3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
return Norm3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;
@@ -1120,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
}

inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
+#if PPSSPP_ARCH(ARM_NEON)
+// vld3q is a perfect match here!
+float32x4x3_t packed = vld3q_f32(m4x3);
+vst1q_f32(m4x4, packed.val[0]);
+vst1q_f32(m4x4 + 4, packed.val[1]);
+vst1q_f32(m4x4 + 8, packed.val[2]);
+#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
@@ -1132,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
+#endif
m4x4[12] = 0.0f;
m4x4[13] = 0.0f;
m4x4[14] = 0.0f;
@@ -1147,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
// 89AB
// Don't see a way to SIMD that. Should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
+#if PPSSPP_ARCH(ARM_NEON)
+// vld3q is a perfect match here!
+float32x4x3_t packed = vld3q_f32(m4x3);
+vst1q_f32(m4x4, packed.val[0]);
+vst1q_f32(m4x4 + 4, packed.val[1]);
+vst1q_f32(m4x4 + 8, packed.val[2]);
+#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
@@ -1159,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
+#endif
}
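The vld3q_f32 trick used by both conversion functions, despite the older "Don't see a way to SIMD that" comment, performs a 3-way deinterleave over 12 consecutive floats, which is exactly the gather the scalar fallback spells out. A small verification sketch (hypothetical, assumes a NEON target):

    #include <arm_neon.h>
    #include <cassert>

    void CheckVld3q() {
        float m[12];
        for (int i = 0; i < 12; i++) m[i] = (float)i;
        float32x4x3_t p = vld3q_f32(m);
        // p.val[0] == {m[0], m[3], m[6], m[9]}, and so on for val[1], val[2].
        assert(vgetq_lane_f32(p.val[0], 1) == 3.0f);
        assert(vgetq_lane_f32(p.val[2], 3) == 11.0f);
    }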

inline void Transpose4x4(float out[16], const float in[16]) {
@@ -1209,7 +1257,7 @@ inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
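The color converters all share one widening ladder, identical on ARM32 and ARM64: duplicate the packed pixel, reinterpret as bytes, zero-extend twice, convert to float, scale by 1/255. A standalone demo (hypothetical; byte positions assume a little-endian target):

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
        unsigned int rgb = 0x00FF8040u;  // bytes in memory: 0x40, 0x80, 0xFF, 0x00
        uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
        uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
        float32x4_t f = vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f));
        // Prints 0.251 0.502 1.000
        printf("%.3f %.3f %.3f\n", vgetq_lane_f32(f, 0), vgetq_lane_f32(f, 1), vgetq_lane_f32(f, 2));
        return 0;
    }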
@@ -1228,7 +1276,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<int>(c);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<int>(vreinterpretq_s32_u32(u));
@@ -1244,7 +1292,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
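Packing back down relies on saturating narrows instead of SSE's pack-and-mask: vqmovun_s32 clamps each signed 32-bit lane to [0, 65535] and vqmovn_u16 clamps to [0, 255], so out-of-range floats cannot wrap. Note the NEON path also zeroes the alpha lane up front (vsetq_lane_f32(0.0f, vec, 3)) where the SSE path masks with 0x00FFFFFF afterwards. A helper mirroring the ladder (hypothetical name):

    #include <arm_neon.h>

    unsigned int PackColor(float32x4_t rgb01) {
        float32x4_t scaled = vmulq_f32(vsetq_lane_f32(0.0f, rgb01, 3), vdupq_n_f32(255.0f));
        uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(scaled));  // clamp to [0, 65535]
        uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));    // clamp to [0, 255]
        return vget_lane_u32(vreinterpret_u32_u8(c8), 0);     // alpha byte already 0
    }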
@@ -1261,7 +1309,7 @@ __forceinline unsigned int Vec3<int>::ToRGB() const
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@@ -1278,7 +1326,7 @@ inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
@@ -1304,7 +1352,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<int>(c);
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<int>(vreinterpretq_s32_u32(u));
@@ -1320,7 +1368,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
@@ -1338,7 +1386,7 @@ __forceinline unsigned int Vec4<int>::ToRGBA() const
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
-#elif PPSSPP_ARCH(ARM64_NEON)
+#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(ivec);
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
2 changes: 1 addition & 1 deletion ppsspp_config.h
@@ -57,7 +57,7 @@
#if defined(__aarch64__) || defined(_M_ARM64)
#define PPSSPP_ARCH_ARM64 1
#define PPSSPP_ARCH_64BIT 1
-#define PPSSPP_ARCH_ARM_NEON 1
+#define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
#define PPSSPP_ARCH_ARM64_NEON 1
#endif
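The new comment only holds together with the 32-bit ARM branch of this header, which is not shown in this hunk; presumably it sets the same flag when the compiler advertises NEON, roughly along these lines (a guess at its shape, keyed off the standard __ARM_NEON predefine):

    #elif defined(__arm__) || defined(_M_ARM)
    #define PPSSPP_ARCH_ARM 1
    #define PPSSPP_ARCH_32BIT 1
    #if defined(__ARM_NEON) || defined(__ARM_NEON__)
    #define PPSSPP_ARCH_ARM_NEON 1
    #endif
    #endif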

