Merge pull request #17634 from fp64/macro-x86-loadu
Streamline x86 SSE workaround
unknownbrackets committed Jun 28, 2023
2 parents c66182a + 436b49c commit dfe113e
Showing 3 changed files with 46 additions and 92 deletions.
125 changes: 42 additions & 83 deletions GPU/Math3D.h
@@ -43,6 +43,29 @@
#define MATH3D_CALL
#endif

+// There's probably a better place to define these macros.
+#if PPSSPP_ARCH(X86)
+// On 32-bit x86, MSVC does not guarantee alignment for
+// SSE arguments passed on the stack (Compiler Error C2719), see e.g.:
+// https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
+// https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
+// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
+// Compilers are decently good at eliminating these extra loads, at least
+// in trivial cases.
+// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
+// http://www.peterstock.co.uk/games/mingw_sse/
+// https://github.com/nothings/stb/issues/81
+// which is probably worse, since it breaks alignment of locals and/or
+// spills. Hopefully that does not affect PPSSPP (modern GCC+Linux
+// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).
+// NOTE: the weird double-casts add a bit of type safety.
+#define SAFE_M128(v)  _mm_loadu_ps   (reinterpret_cast<const float*>  (static_cast<const __m128*> (&(v))))
+#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))
+#else // x64; FWIW, this also works for non-x86.
+#define SAFE_M128(v)  (v)
+#define SAFE_M128I(v) (v)
+#endif
+
namespace Math3D {

// Helper for Vec classes to clamp values.
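To make the new macros concrete, here is a minimal standalone sketch (an editor's illustration, not code from this commit; the struct and function names are hypothetical, and plain compiler macros stand in for PPSSPP_ARCH to keep it self-contained):

// Hypothetical, self-contained sketch of the SAFE_M128 pattern.
#include <xmmintrin.h>

#if defined(_M_IX86) || defined(__i386__)
// 32-bit x86: the referenced vector may sit at an unaligned stack slot,
// so read it with an unaligned load instead of trusting 16-byte alignment.
#define SAFE_M128(v) _mm_loadu_ps(reinterpret_cast<const float*>(static_cast<const __m128*>(&(v))))
#else
#define SAFE_M128(v) (v)
#endif

struct ExampleVec {
	__m128 vec;  // On 32-bit MSVC, this may end up under-aligned on the stack.
};

// Passing by reference avoids MSVC error C2719 on by-value aligned parameters;
// SAFE_M128 then makes the actual read alignment-agnostic.
inline __m128 AddExample(const ExampleVec &a, const ExampleVec &b) {
	return _mm_add_ps(SAFE_M128(a.vec), SAFE_M128(b.vec));
}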
@@ -582,19 +605,15 @@ class Vec4
template<typename T2>
Vec4<T2> Cast() const {
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-			return _mm_cvtps_epi32(vec);
-#elif defined(_M_SSE)
-			return _mm_cvtps_epi32(_mm_loadu_ps(&x));
+#if defined(_M_SSE)
+			return _mm_cvtps_epi32(SAFE_M128(vec));
#elif PPSSPP_ARCH(ARM64_NEON)
return vcvtq_s32_f32(vec);
#endif
}
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-			return _mm_cvtepi32_ps(ivec);
-#elif defined(_M_SSE)
-			return _mm_cvtepi32_ps(_mm_loadu_si128(&ivec));
+#if defined(_M_SSE)
+			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
#elif PPSSPP_ARCH(ARM64_NEON)
return vcvtq_f32_s32(ivec);
#endif
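A side note on the conversions above: _mm_cvtps_epi32 honors the current MXCSR rounding mode, which defaults to round-to-nearest-even. A small hedged sketch (hypothetical values, default rounding assumed):

// Hypothetical demonstration of _mm_cvtps_epi32 rounding (default MXCSR).
#include <emmintrin.h>

void CastRoundingDemo() {
	__m128 f = _mm_setr_ps(1.5f, 2.5f, -1.5f, 0.75f);
	__m128i i = _mm_cvtps_epi32(f);   // round-to-nearest-even: 2, 2, -2, 1
	__m128 back = _mm_cvtepi32_ps(i); // exact for these small integers
	(void)back;
}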
@@ -929,11 +948,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])

inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
-#if PPSSPP_ARCH(X86)
-	const __m128 vv = _mm_loadu_ps(&v.x);
-#else
-	const __m128 vv = v.vec;
-#endif
+	const __m128 vv = SAFE_M128(v.vec);
__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
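The splat-then-accumulate shuffles above implement the usual row-vector transform. As a point of reference, a hedged scalar sketch of what Vec3ByMatrix43 computes (assuming PPSSPP's 4x3 layout: rows m[0..2], m[3..5], m[6..8] plus translation m[9..11]):

// Scalar reference sketch (assumes row-vector convention and 4x3 layout).
inline void Vec3ByMatrix43_Scalar(float out[3], const float v[3], const float m[12]) {
	for (int i = 0; i < 3; i++)
		out[i] = v[0] * m[i] + v[1] * m[3 + i] + v[2] * m[6 + i] + m[9 + i];
}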
@@ -993,11 +1008,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])

inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
-#if PPSSPP_ARCH(X86)
-	const __m128 vv = _mm_loadu_ps(&v.x);
-#else
-	const __m128 vv = v.vec;
-#endif
+	const __m128 vv = SAFE_M128(v.vec);
__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
@@ -1057,11 +1068,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]

inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
-#if PPSSPP_ARCH(X86)
-	const __m128 vv = _mm_loadu_ps(&v.x);
-#else
-	const __m128 vv = v.vec;
-#endif
+	const __m128 vv = SAFE_M128(v.vec);
__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
@@ -1220,11 +1227,7 @@ template<>
__forceinline unsigned int Vec3<float>::ToRGB() const
{
#if defined(_M_SSE)
-#if PPSSPP_ARCH(64BIT)
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
-#else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
-#endif
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON)
@@ -1242,11 +1245,7 @@ template<>
__forceinline unsigned int Vec3<int>::ToRGB() const
{
#if defined(_M_SSE)
-#if PPSSPP_ARCH(64BIT)
-	__m128i c16 = _mm_packs_epi32(ivec, ivec);
-#else
-	__m128i c16 = _mm_packs_epi32(_mm_loadu_si128(&ivec), _mm_setzero_si128());
-#endif
+	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON)
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
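Both ToRGB specializations above lean on the saturating packs to clamp each channel into a byte. A hedged scalar sketch of what the float path computes (hypothetical helper; components nominally in [0, 1], default rounding mode assumed):

// Hypothetical scalar equivalent of the SSE float ToRGB path above.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline uint32_t ToRGB_Scalar(float r, float g, float b) {
	auto to8 = [](float c) -> uint32_t {
		int i = (int)std::lrintf(c * 255.0f);    // round to nearest, like _mm_cvtps_epi32
		return (uint32_t)std::clamp(i, 0, 255);  // saturate, like packs/packus
	};
	return to8(r) | (to8(g) << 8) | (to8(b) << 16);  // low 24 bits; alpha masked off
}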
@@ -1304,11 +1303,7 @@ template<>
__forceinline unsigned int Vec4<float>::ToRGBA() const
{
#if defined(_M_SSE)
-#if PPSSPP_ARCH(64BIT)
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
-#else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
-#endif
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON)
@@ -1327,11 +1322,7 @@ template<>
__forceinline unsigned int Vec4<int>::ToRGBA() const
{
#if defined(_M_SSE)
-#if PPSSPP_ARCH(64BIT)
-	__m128i c16 = _mm_packs_epi32(ivec, ivec);
-#else
-	__m128i c16 = _mm_packs_epi32(_mm_loadu_si128(&ivec), _mm_setzero_si128());
-#endif
+	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON)
uint16x4_t c16 = vqmovun_s32(ivec);
@@ -1354,75 +1345,43 @@ __forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
// Vec3<float> operation
template<>
inline void Vec3<float>::operator += (const Vec3<float> &other) {
-#if PPSSPP_ARCH(X86)
-	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
-#else
-	vec = _mm_add_ps(vec, other.vec);
-#endif
+	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec3<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
-#else
-	return Vec3<float>(_mm_add_ps(vec, other.vec));
-#endif
+	return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
-#else
-	return Vec3<float>(_mm_mul_ps(vec, other.vec));
-#endif
+	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
-#else
-	return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
-#endif
+	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {
-#if PPSSPP_ARCH(X86)
-	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
-#else
-	vec = _mm_add_ps(vec, other.vec);
-#endif
+	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec4<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
-#else
-	return Vec4<float>(_mm_add_ps(vec, other.vec));
-#endif
+	return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
-#else
-	return Vec4<float>(_mm_mul_ps(vec, other.vec));
-#endif
+	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
-#if PPSSPP_ARCH(X86)
-	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
-#else
-	return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
-#endif
+	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec3<float> cross product
5 changes: 2 additions & 3 deletions GPU/Software/Lighting.cpp
@@ -256,9 +256,8 @@ static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
}

static inline float Dot33(const Vec3f &a, const Vec3f &b) {
-	// NOTE: We can't guarantee aligned stack/parameter on 32-bit x86, so we avoid this path there.
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-	__m128 v = _mm_mul_ps(a.vec, b.vec); // [X, Y, Z, W]
+#if defined(_M_SSE)
+	__m128 v = _mm_mul_ps(SAFE_M128(a.vec), SAFE_M128(b.vec)); // [X, Y, Z, W]
__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)); // [Y, X, Z, W]
__m128 sums = _mm_add_ps(v, shuf); // [X + Y, X + Y, Z + Z, W + W]
shuf = _mm_movehl_ps(shuf, shuf); // [Z, W, Z, W]
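The shuffle/add dance above is a standard SSE horizontal add; the tail of the function is elided here, but the value it produces is the 3-component dot product. In scalar form:

// Scalar statement of what Dot33 computes (the W lane drops out).
static inline float Dot33_Scalar(const float a[3], const float b[3]) {
	return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}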
8 changes: 2 additions & 6 deletions GPU/Software/Rasterizer.cpp
@@ -690,11 +690,7 @@ static inline Vec4<int> SOFTRAST_CALL CheckDepthTestPassed4(const Vec4<int> &mask, GEComparison func, int x, int y, int stride, Vec4<int> z) {
static inline Vec4<int> SOFTRAST_CALL CheckDepthTestPassed4(const Vec4<int> &mask, GEComparison func, int x, int y, int stride, Vec4<int> z) {
// Skip the depth buffer read if we're masked already.
#if defined(_M_SSE)
-#if PPSSPP_ARCH(64BIT)
-	__m128i result = mask.ivec;
-#else
-	__m128i result = _mm_loadu_si128(&mask.ivec);
-#endif
+	__m128i result = SAFE_M128I(mask.ivec);
int maskbits = _mm_movemask_epi8(result);
if (maskbits >= 0xFFFF)
return mask;
@@ -952,7 +948,7 @@ static inline bool AnyMask(const Vec4<int> &mask) {
}

// Source: https://fgiesen.wordpress.com/2013/02/10/optimizing-the-basic-rasterizer/#comment-6676
-	return _mm_movemask_ps(_mm_castsi128_ps(mask.ivec))!=15;
+	return _mm_movemask_ps(_mm_castsi128_ps(mask.ivec)) != 15;
#elif PPSSPP_ARCH(ARM64_NEON)
int64x2_t sig = vreinterpretq_s64_s32(vshrq_n_s32(mask.ivec, 31));
return vgetq_lane_s64(sig, 0) != -1 || vgetq_lane_s64(sig, 1) != -1;
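The movemask idiom in the SSE path packs the four per-lane sign bits into one integer so a single compare covers all lanes at once. A hedged scalar sketch (hypothetical helper; assumes each lane is 0 or -1, as SIMD comparison results are):

// Hypothetical scalar equivalent of the sign-bit movemask test.
#include <cstdint>

static inline bool AnyMask_Scalar(const int32_t mask[4]) {
	int bits = 0;
	for (int i = 0; i < 4; i++)
		bits |= (int)(((uint32_t)mask[i] >> 31) << i);  // gather sign bits, like _mm_movemask_ps
	return bits != 15;  // true unless all four lanes are fully set
}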
