Skip to content

Commit

Permalink
softgpu: Keep arguments in vectors for sampling.
Browse files Browse the repository at this point in the history
  • Loading branch information
unknownbrackets committed Dec 4, 2021
1 parent d7c25b3 commit 823c4ad
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 35 deletions.
22 changes: 12 additions & 10 deletions GPU/Software/Rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,10 @@ static inline bool IsRightSideOrFlatBottomLine(const Vec2<int>& vertex, const Ve
}
}

Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor)
{
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in) {
const Vec4<int> prim_color = prim_color_in;
const Vec4<int> texcolor = texcolor_in;

Vec3<int> out_rgb;
int out_a;

Expand All @@ -314,7 +316,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
}

if (rgba) {
return Vec4<int>(out_rgb.ivec);
return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
} else {
out_a = prim_color.a();
}
Expand Down Expand Up @@ -366,7 +368,7 @@ Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>&
out_a = 0;
}

return Vec4<int>(out_rgb.r(), out_rgb.g(), out_rgb.b(), out_a);
return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
}

static inline Vec3<int> GetSourceFactor(GEBlendSrcFactor factor, const Vec4<int> &source, const Vec4<int> &dst) {
Expand Down Expand Up @@ -577,26 +579,26 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> &prim_color,
GetTexelCoordinates(texlevel + 1, s, t, u[1], v[1]);
}

texcolor0 = Vec4<int>::FromRGBA(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
texcolor0 = Vec4<int>(sampler.nearest(u[0], v[0], tptr0, bufw0, texlevel));
if (frac_texlevel) {
texcolor1 = Vec4<int>::FromRGBA(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
texcolor1 = Vec4<int>(sampler.nearest(u[1], v[1], tptr1, bufw1, texlevel + 1));
}
} else {
GetTexelCoordinatesQuad(texlevel, s, t, u, v, frac_u[0], frac_v[0]);
if (frac_texlevel) {
GetTexelCoordinatesQuad(texlevel + 1, s, t, u + 4, v + 4, frac_u[1], frac_v[1]);
}

texcolor0 = Vec4<int>::FromRGBA(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
texcolor0 = Vec4<int>(sampler.linear(u, v, frac_u[0], frac_v[0], tptr0, bufw0, texlevel));
if (frac_texlevel) {
texcolor1 = Vec4<int>::FromRGBA(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
texcolor1 = Vec4<int>(sampler.linear(u + 4, v + 4, frac_u[1], frac_v[1], tptr1, bufw1, texlevel + 1));
}
}

if (frac_texlevel) {
texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (256 - frac_texlevel)) / 256;
}
prim_color = GetTextureFunctionOutput(prim_color, texcolor0);
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(texcolor0));
}

// Produces a signed 1.23.8 value.
Expand Down Expand Up @@ -1392,7 +1394,7 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)
u32 *row = (u32 *)buffer.GetData();
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
row[x] = sampler.nearest(x, y, texptr, texbufw, level);
row[x] = Vec4<int>(sampler.nearest(x, y, texptr, texbufw, level)).ToRGBA();
}
row += w;
}
Expand Down
3 changes: 2 additions & 1 deletion GPU/Software/Rasterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include "GPU/Software/FuncId.h"
#include "GPU/Software/RasterizerRegCache.h"
#include "GPU/Software/TransformUnit.h" // for DrawingCoords

struct GPUDebugBuffer;
Expand All @@ -35,6 +36,6 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level);

// Shared functions with RasterizerRectangle.cpp
Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst);
Vec4<int> GetTextureFunctionOutput(const Vec4<int>& prim_color, const Vec4<int>& texcolor);
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color, Vec4IntArg texcolor);

} // namespace Rasterizer
8 changes: 4 additions & 4 deletions GPU/Software/RasterizerRectangle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
int s = s_start;
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
for (int x = pos0.x; x < pos1.x; x++) {
u32 tex_color = nearestFunc(s, t, texptr, texbufw, 0);
u32 tex_color = Vec4<int>(nearestFunc(s, t, texptr, texbufw, 0)).ToRGBA();
if (tex_color & 0xFF000000) {
DrawSinglePixel5551(pixel, tex_color, pixelID);
}
Expand All @@ -171,7 +171,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride());
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = v1.color0;
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
prim_color = ModulateRGBA(prim_color, tex_color);
if (prim_color.a() > 0) {
DrawSinglePixel5551(pixel, prim_color.ToRGBA(), pixelID);
Expand All @@ -191,8 +191,8 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) {
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = v1.color0;
Vec4<int> tex_color = Vec4<int>::FromRGBA(nearestFunc(s, t, texptr, texbufw, 0));
prim_color = GetTextureFunctionOutput(prim_color, tex_color);
Vec4<int> tex_color = nearestFunc(s, t, texptr, texbufw, 0);
prim_color = GetTextureFunctionOutput(ToVec4IntArg(prim_color), ToVec4IntArg(tex_color));
drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID);
s += ds;
}
Expand Down
13 changes: 13 additions & 0 deletions GPU/Software/RasterizerRegCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,25 @@ typedef FakeGen::FakeXCodeBlock CodeBlock;
// We also have the types of things that end up in regs.
#if PPSSPP_ARCH(ARM64)
typedef int32x4_t Vec4IntArg;
typedef int32x4_t Vec4IntResult;
typedef float32x4_t Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return vld1q_s32(a.AsArray()); }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return vld1q_f32(a.AsArray()); }
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
typedef __m128i Vec4IntArg;
typedef __m128i Vec4IntResult;
typedef __m128 Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a.ivec; }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a.ivec; }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a.vec; }
#else
typedef const Math3D::Vec4<int> &Vec4IntArg;
typedef Math3D::Vec4<int> Vec4IntResult;
typedef const Math3D::Vec4<float> &Vec4FloatArg;
static inline Vec4IntArg ToVec4IntArg(const Math3D::Vec4<int> &a) { return a; }
static inline Vec4IntResult ToVec4IntResult(const Math3D::Vec4<int> &a) { return a; }
static inline Vec4FloatArg ToVec4FloatArg(const Math3D::Vec4<float> &a) { return a; }
#endif

#if PPSSPP_ARCH(AMD64) && PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
Expand All @@ -85,6 +97,7 @@ struct RegCache {
FLAG_TEMP = 0x1000,

VEC_ZERO = 0x0000,
VEC_RESULT = 0x0001,

GEN_SRC_ALPHA = 0x0100,
GEN_GSTATE = 0x0101,
Expand Down
18 changes: 10 additions & 8 deletions GPU/Software/Sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,22 @@
#include "Core/Reporting.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/GPUState.h"
#include "GPU/Software/RasterizerRegCache.h"
#include "GPU/Software/Sampler.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#endif

using namespace Math3D;
using namespace Rasterizer;

extern u32 clut[4096];

namespace Sampler {

static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level);
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);

std::mutex jitCacheLock;
SamplerJitCache *jitCache = nullptr;
Expand Down Expand Up @@ -305,8 +307,7 @@ struct Nearest4 {
};

template <int N>
inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level)
{
inline static Nearest4 SOFTRAST_CALL SampleNearest(int u[N], int v[N], const u8 *srcptr, int texbufw, int level) {
Nearest4 res;
if (!srcptr) {
memset(res.v, 0, sizeof(res.v));
Expand Down Expand Up @@ -407,11 +408,12 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t
}
}

static u32 SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
return SampleNearest<1>(&u, &v, tptr, bufw, level);
static Vec4IntResult SOFTRAST_CALL SampleNearest(int u, int v, const u8 *tptr, int bufw, int level) {
Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level);
return ToVec4IntResult(Vec4<int>::FromRGBA(c.v[0]));
}

static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
static Vec4IntResult SOFTRAST_CALL SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int texlevel) {
Nearest4 c = SampleNearest<4>(u, v, tptr, bufw, texlevel);

Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
Expand All @@ -420,7 +422,7 @@ static u32 SampleLinear(int u[4], int v[4], int frac_u, int frac_v, const u8 *tp
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
Vec4<int> t = texcolor_tl * (0x100 - frac_u) + texcolor_tr * frac_u;
Vec4<int> b = texcolor_bl * (0x100 - frac_u) + texcolor_br * frac_u;
return ((t * (0x100 - frac_v) + b * frac_v) / (256 * 256)).ToRGBA();
return ToVec4IntResult((t * (0x100 - frac_v) + b * frac_v) / (256 * 256));
}

};
4 changes: 2 additions & 2 deletions GPU/Software/Sampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@

namespace Sampler {

typedef u32 (*NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(int u, int v, const u8 *tptr, int bufw, int level);
NearestFunc GetNearestFunc();

typedef u32 (*LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(int u[4], int v[4], int frac_u, int frac_v, const u8 *tptr, int bufw, int level);
LinearFunc GetLinearFunc();

struct Funcs {
Expand Down
33 changes: 23 additions & 10 deletions GPU/Software/SamplerX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
RegCache::GEN_ARG_LEVEL,
});
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);

BeginWrite();
const u8 *start = AlignCode16();
Expand All @@ -74,9 +75,9 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);

FixupBranch nonZeroSrc = J_CC(CC_NZ);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
XOR(32, R(resultReg), R(resultReg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
PXOR(vecResultReg, R(vecResultReg));
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
}
Expand All @@ -89,6 +90,23 @@ NearestFunc SamplerJitCache::Compile(const SamplerID &id) {
return nullptr;
}

X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);

X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOVD_xmm(vecResultReg, R(resultReg));
regCache_.Release(resultReg, RegCache::GEN_RESULT);

if (cpu_info.bSSE4_1) {
PMOVZXBD(vecResultReg, R(vecResultReg));
} else {
X64Reg vecTempReg = regCache_.Find(RegCache::VEC_TEMP0);
PXOR(vecTempReg, R(vecTempReg));
PUNPCKLBW(vecResultReg, R(vecTempReg));
PUNPCKLWD(vecResultReg, R(vecTempReg));
regCache_.Unlock(vecTempReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);

if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
}
Expand Down Expand Up @@ -171,7 +189,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
if (id.hasInvalidPtr) {
CMP(PTRBITS, R(R14), Imm8(0));
FixupBranch nonZeroSrc = J_CC(CC_NZ);
XOR(32, R(RAX), R(RAX));
PXOR(XMM0, R(XMM0));
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
}
Expand Down Expand Up @@ -289,12 +307,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
ADDPS(fpScratchReg1, R(fpScratchReg3));

// Time to convert back to a single 32 bit value.
CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
PACKSSDW(fpScratchReg1, R(fpScratchReg1));
PACKUSWB(fpScratchReg1, R(fpScratchReg1));

const X64Reg resultReg = RAX;
MOVD_xmm(R(resultReg), fpScratchReg1);
CVTPS2DQ(XMM0, R(fpScratchReg1));

if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
Expand Down

0 comments on commit 823c4ad

Please sign in to comment.