Skip to content

Commit

Permalink
Merge pull request #17648 from fp64/div-less
Browse files Browse the repository at this point in the history
Replace some signed division in SoftGPU
  • Loading branch information
unknownbrackets committed Jul 1, 2023
2 parents 6315fae + cd9f01c commit 9c08e27
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Common/MemoryUtil.cpp
Expand Up @@ -258,7 +258,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) {
#endif
#endif

_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %lu", size);
_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %llu", (unsigned long long)size);
return ptr;
}

Expand Down
65 changes: 65 additions & 0 deletions GPU/Math3D.h
Expand Up @@ -665,6 +665,20 @@ class Vec4
{
return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
}
Vec4 operator & (const Vec4 &other) const
{
	// Componentwise bitwise AND.
	const auto ax = x & other.x;
	const auto ay = y & other.y;
	const auto az = z & other.z;
	const auto aw = w & other.w;
	return Vec4(ax, ay, az, aw);
}
Vec4 operator << (const int amount) const
{
	// Componentwise left shift, expressed as multiplication by a power of
	// two: left-shifting a negative signed value is UB before C++20, while
	// signed multiplication is well-defined whenever the result is
	// representable.
	// NOTE(review): assumes amount is small enough that (1 << amount) is
	// itself valid (amount < 31) — holds for the shift amounts visible in
	// SoftGPU (4, 10), but confirm no caller passes larger values.
	return Vec4(x * (1 << amount), y * (1 << amount), z * (1 << amount), w * (1 << amount));
}
Vec4 operator >> (const int amount) const
{
	// Componentwise right shift (arithmetic on signed element types on all
	// supported compilers).
	const auto sx = x >> amount;
	const auto sy = y >> amount;
	const auto sz = z >> amount;
	const auto sw = w >> amount;
	return Vec4(sx, sy, sz, sw);
}
template<typename V>
Vec4 operator * (const V& f) const
{
Expand Down Expand Up @@ -1363,6 +1377,57 @@ inline Vec3<float> Vec3<float>::operator * (const float &other) const {
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec4<int> operation
template<>
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
	// Componentwise 32-bit integer add via SSE2.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_add_epi32(lhs, rhs));
}

template<>
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
	// Componentwise 32-bit multiply, keeping the low 32 bits of each product.
	__m128i a = SAFE_M128I(ivec);
	__m128i b = SAFE_M128I(other.ivec);
	// Intel in its immense wisdom decided that
	// SSE2 does not get _mm_mullo_epi32(),
	// so we do it this way. This is what clang does,
	// which seems about as good as it gets.
	// _mm_mul_epu32 multiplies lanes 0 and 2 (32x32 -> 64-bit). The unsigned
	// multiply is fine even for signed inputs because the low 32 bits of the
	// product are identical for signed and unsigned operands.
	__m128i m02 = _mm_mul_epu32(a, b);
	// Move lanes 1 and 3 into positions 0 and 2, then multiply those too.
	__m128i m13 = _mm_mul_epu32(
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
	// Compact the low 32 bits of the four 64-bit products back into one
	// vector: interleave {m02.lo0, m02.lo2} with {m13.lo0, m13.lo2}.
	__m128i ret = _mm_unpacklo_epi32(
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
	return Vec4<int>(ret);
}

template<> template<>
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
	// Broadcast the scalar into all four lanes and reuse the
	// vector-by-vector multiply.
	const Vec4<int> splat(_mm_set1_epi32(other));
	return (*this) * splat;
}

template<>
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
	// Componentwise bitwise OR.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_or_si128(lhs, rhs));
}

template<>
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
	// Componentwise bitwise AND.
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_and_si128(lhs, rhs));
}

// NOTE: modern GCC, clang, and MSVC are all ok with
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
template<>
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
	// Shift all four 32-bit lanes left by the same (runtime) amount.
	const __m128i value = SAFE_M128I(ivec);
	return Vec4<int>(_mm_slli_epi32(value, amount));
}

template<>
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
	// Use an ARITHMETIC right shift (_mm_srai_epi32) so negative lanes are
	// sign-extended, matching the scalar Vec4<int>::operator>> which does
	// `x >> amount` on signed int. The previous _mm_srli_epi32 was a LOGICAL
	// shift: it would turn negative components into huge positive values,
	// diverging from the non-SSE code path.
	return Vec4<int>(_mm_srai_epi32(SAFE_M128I(ivec), amount));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {
Expand Down
2 changes: 1 addition & 1 deletion GPU/Software/Lighting.cpp
Expand Up @@ -283,7 +283,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());

Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) / 1024;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;

Vec4<int> final_color = mec + ambient;
Vec4<int> specular_color = Vec4<int>::AssignToAll(0);
Expand Down
30 changes: 15 additions & 15 deletions GPU/Software/Sampler.cpp
Expand Up @@ -276,13 +276,13 @@ static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint
if (!swizzled)
return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);

const int tile_size_bits = 32;
const int tiles_in_block_horizontal = 4;
const int tiles_in_block_vertical = 8;
const uint32_t tile_size_bits = 32;
const uint32_t tiles_in_block_horizontal = 4;
const uint32_t tiles_in_block_vertical = 8;

constexpr int texels_per_tile = tile_size_bits / texel_size_bits;
int tile_u = u / texels_per_tile;
int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;
uint32_t tile_u = u / texels_per_tile;
uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
// TODO: not sure if the *texel_size_bits/8 factor is correct
(v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
(tile_u % tiles_in_block_horizontal) +
Expand Down Expand Up @@ -408,22 +408,22 @@ inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N

case GE_TFMT_DXT1:
for (int i = 0; i < N; ++i) {
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
}
return res;

case GE_TFMT_DXT3:
for (int i = 0; i < N; ++i) {
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
}
return res;

case GE_TFMT_DXT5:
for (int i = 0; i < N; ++i) {
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
}
return res;

Expand Down Expand Up @@ -613,7 +613,7 @@ static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg pr
GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);

c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
}

return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
Expand Down Expand Up @@ -748,15 +748,15 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));
#endif
}

static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
	// Bilinearly sample the base mip level.
	Vec4<int> color = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
	if (levelFrac != 0) {
		// Trilinear: sample the next mip and blend as levelFrac/16
		// (the >> 4 divides out the fixed-point blend weight).
		const Vec4<int> next = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
		color = (next * levelFrac + color * (16 - levelFrac)) >> 4;
	}
	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(color), samplerID);
}
Expand Down

0 comments on commit 9c08e27

Please sign in to comment.