Skip to content

Commit

Permalink
Merge pull request #17648 from fp64/div-less
Browse files Browse the repository at this point in the history
Replace some signed division in SoftGPU
  • Loading branch information
unknownbrackets committed Jul 1, 2023
2 parents 6315fae + cd9f01c commit 9c08e27
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Common/MemoryUtil.cpp
Expand Up @@ -258,7 +258,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) {
#endif
#endif

_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %lu", size);
_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %llu", (unsigned long long)size);
return ptr;
}

Expand Down
65 changes: 65 additions & 0 deletions GPU/Math3D.h
Expand Up @@ -665,6 +665,20 @@ class Vec4
{
return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
}
Vec4 operator & (const Vec4 &other) const
{
	// Componentwise bitwise AND.
	const auto ax = x & other.x;
	const auto ay = y & other.y;
	const auto az = z & other.z;
	const auto aw = w & other.w;
	return Vec4(ax, ay, az, aw);
}
Vec4 operator << (const int amount) const
{
	// Componentwise left shift, expressed as multiplication by a power of
	// two: left-shifting a negative signed value is UB before C++20, while
	// signed multiplication is well-defined whenever the result is
	// representable.
	// NOTE(review): assumes amount is small enough that (1 << amount) is
	// itself valid (amount < 31) — holds for the shift amounts visible in
	// SoftGPU (4, 10), but confirm no caller passes larger values.
	return Vec4(x * (1 << amount), y * (1 << amount), z * (1 << amount), w * (1 << amount));
}
Vec4 operator >> (const int amount) const
{
	// Componentwise right shift (arithmetic on signed element types on all
	// supported compilers).
	const auto sx = x >> amount;
	const auto sy = y >> amount;
	const auto sz = z >> amount;
	const auto sw = w >> amount;
	return Vec4(sx, sy, sz, sw);
}
template<typename V>
Vec4 operator * (const V& f) const
{
Expand Down Expand Up @@ -1363,6 +1377,57 @@ inline Vec3<float> Vec3<float>::operator * (const float &other) const {
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec4<int> operation
template<>
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
	// Componentwise 32-bit integer add via SSE2.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_add_epi32(lhs, rhs));
}

template<>
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
	// Componentwise 32-bit multiply, keeping the low 32 bits of each product.
	__m128i a = SAFE_M128I(ivec);
	__m128i b = SAFE_M128I(other.ivec);
	// Intel in its immense wisdom decided that
	// SSE2 does not get _mm_mullo_epi32(),
	// so we do it this way. This is what clang does,
	// which seems about as good as it gets.
	// _mm_mul_epu32 multiplies lanes 0 and 2 (32x32 -> 64-bit). The unsigned
	// multiply is fine even for signed inputs because the low 32 bits of the
	// product are identical for signed and unsigned operands.
	__m128i m02 = _mm_mul_epu32(a, b);
	// Move lanes 1 and 3 into positions 0 and 2, then multiply those too.
	__m128i m13 = _mm_mul_epu32(
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
	// Compact the low 32 bits of the four 64-bit products back into one
	// vector: interleave {m02.lo0, m02.lo2} with {m13.lo0, m13.lo2}.
	__m128i ret = _mm_unpacklo_epi32(
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
	return Vec4<int>(ret);
}

template<> template<>
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
	// Broadcast the scalar into all four lanes and reuse the
	// vector-by-vector multiply.
	const Vec4<int> splat(_mm_set1_epi32(other));
	return (*this) * splat;
}

template<>
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
	// Componentwise bitwise OR.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_or_si128(lhs, rhs));
}

template<>
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
	// Componentwise bitwise AND.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_and_si128(lhs, rhs));
}

// NOTE: modern GCC, clang, and MSVC are all ok with
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
template<>
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
	// Shift all four 32-bit lanes left by the same (runtime) amount.
	const __m128i value = SAFE_M128I(ivec);
	return Vec4<int>(_mm_slli_epi32(value, amount));
}

template<>
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
	// Use an ARITHMETIC right shift (_mm_srai_epi32) so negative lanes are
	// sign-extended, matching the scalar Vec4<int>::operator>> which does
	// `x >> amount` on signed int. The previous _mm_srli_epi32 was a LOGICAL
	// shift: it would turn negative components into huge positive values,
	// diverging from the non-SSE code path.
	return Vec4<int>(_mm_srai_epi32(SAFE_M128I(ivec), amount));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {
Expand Down
2 changes: 1 addition & 1 deletion GPU/Software/Lighting.cpp
Expand Up @@ -283,7 +283,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());

Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) / 1024;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;

Vec4<int> final_color = mec + ambient;
Vec4<int> specular_color = Vec4<int>::AssignToAll(0);
Expand Down
30 changes: 15 additions & 15 deletions GPU/Software/Sampler.cpp
Expand Up @@ -276,13 +276,13 @@ static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint
if (!swizzled)
return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);

const int tile_size_bits = 32;
const int tiles_in_block_horizontal = 4;
const int tiles_in_block_vertical = 8;
const uint32_t tile_size_bits = 32;
const uint32_t tiles_in_block_horizontal = 4;
const uint32_t tiles_in_block_vertical = 8;

constexpr int texels_per_tile = tile_size_bits / texel_size_bits;
int tile_u = u / texels_per_tile;
int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;
uint32_t tile_u = u / texels_per_tile;
uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
// TODO: not sure if the *texel_size_bits/8 factor is correct
(v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
(tile_u % tiles_in_block_horizontal) +
Expand Down Expand Up @@ -408,22 +408,22 @@ inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N

case GE_TFMT_DXT1:
for (int i = 0; i < N; ++i) {
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
}
return res;

case GE_TFMT_DXT3:
for (int i = 0; i < N; ++i) {
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
}
return res;

case GE_TFMT_DXT5:
for (int i = 0; i < N; ++i) {
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
}
return res;

Expand Down Expand Up @@ -613,7 +613,7 @@ static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg pr
GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);

c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
}

return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
Expand Down Expand Up @@ -748,15 +748,15 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));
#endif
}

static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
	// Bilinearly sample the base mip level.
	Vec4<int> color = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
	if (levelFrac != 0) {
		// Trilinear: sample the next mip and blend as levelFrac/16
		// (the >> 4 divides out the fixed-point blend weight).
		const Vec4<int> next = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
		color = (next * levelFrac + color * (16 - levelFrac)) >> 4;
	}
	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(color), samplerID);
}
Expand Down

0 comments on commit 9c08e27

Please sign in to comment.