Permalink
Browse files

Merge pull request #11374 from unknownbrackets/texcache

TexCache: Fix texture alignment in GLES
  • Loading branch information...
hrydgard committed Sep 9, 2018
2 parents 959a999 + b05f75f commit 7a5ec714c33e0a4a2fa8b56b4cd3564a0082ed64
@@ -203,59 +203,62 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
// ysrcp is in 32-bits, so this is convenient.
const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
__m128i *dest = (__m128i *)texptr;
// The pitch parameter is in bytes, so shift down for 128-bit.
// Note: it's always aligned to 16 bytes, so this is safe.
const u32 pitchBy128 = pitch >> 4;
for (int by = 0; by < byc; by++) {
const __m128i *xsrc = (const __m128i *)ysrcp;
for (int bx = 0; bx < bxc; bx++) {
const __m128i *src = xsrc;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp2 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp3 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp4 = _mm_load_si128(src);
src += pitchBy128;
_mm_store_si128(dest, temp1);
_mm_store_si128(dest + 1, temp2);
_mm_store_si128(dest + 2, temp3);
_mm_store_si128(dest + 3, temp4);
dest += 4;
if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
__m128i *dest = (__m128i *)texptr;
// The pitch parameter is in bytes, so shift down for 128-bit.
// Note: it's always aligned to 16 bytes, so this is safe.
const u32 pitchBy128 = pitch >> 4;
for (int by = 0; by < byc; by++) {
const __m128i *xsrc = (const __m128i *)ysrcp;
for (int bx = 0; bx < bxc; bx++) {
const __m128i *src = xsrc;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp2 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp3 = _mm_load_si128(src);
src += pitchBy128;
__m128i temp4 = _mm_load_si128(src);
src += pitchBy128;
_mm_store_si128(dest, temp1);
_mm_store_si128(dest + 1, temp2);
_mm_store_si128(dest + 2, temp3);
_mm_store_si128(dest + 3, temp4);
dest += 4;
}
xsrc++;
}
xsrc++;
ysrcp += pitchBy32 * 8;
}
ysrcp += pitchBy32 * 8;
}
#else
u32 *dest = (u32 *)texptr;
for (int by = 0; by < byc; by++) {
const u32 *xsrc = ysrcp;
for (int bx = 0; bx < bxc; bx++) {
const u32 *src = xsrc;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
src += pitchBy32;
dest += 4;
} else
#endif
{
u32 *dest = (u32 *)texptr;
for (int by = 0; by < byc; by++) {
const u32 *xsrc = ysrcp;
for (int bx = 0; bx < bxc; bx++) {
const u32 *src = xsrc;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
src += pitchBy32;
dest += 4;
}
xsrc += 4;
}
xsrc += 4;
ysrcp += pitchBy32 * 8;
}
ysrcp += pitchBy32 * 8;
}
#endif
}
void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
// ydestp is in 32-bits, so this is convenient.
const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
if (((uintptr_t)ydestp & 0xF) == 0) {
if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
const __m128i *src = (const __m128i *)texptr;
// The pitch parameter is in bytes, so shift down for 128-bit.
// Note: it's always aligned to 16 bytes, so this is safe.
@@ -41,7 +41,7 @@ u32 QuickTexHashSSE2(const void *checkp, u32 size);
#define DoQuickTexHash QuickTexHashSSE2
#define StableQuickTexHash QuickTexHashSSE2
// Pitch must be aligned to 16 bits (as is the case on a PSP)
// Pitch must be aligned to 16 bytes (as is the case on a PSP)
void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch);
#define DoUnswizzleTex16 DoUnswizzleTex16Basic
@@ -699,7 +699,7 @@ void TextureCacheD3D11::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &
mapData = (u32 *)AllocateAlignedMemory(sizeof(u32) * (w * scaleFactor) * (h * scaleFactor), 16);
mapRowPitch = w * scaleFactor * 4;
} else {
mapRowPitch = std::max(bufw, w) * bpp;
mapRowPitch = std::max(w * bpp, 16);
size_t bufSize = sizeof(u32) * (mapRowPitch / bpp) * h;
mapData = (u32 *)AllocateAlignedMemory(bufSize, 16);
if (!mapData) {
@@ -294,6 +294,7 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
if (lang == HLSL_D3D11) {
WRITE(p, " if ((roundAndScaleTo255i(v.a) & u_alphacolormask.a) %s u_alphacolorref.a) discard;\n", alphaTestFuncs[alphaTestFunc]);
} else {
// TODO: Use a texture to lookup bitwise ops?
WRITE(p, " if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) clip(-1);\n", alphaTestFuncs[alphaTestFunc]);
}
} else {
@@ -319,13 +320,17 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
} else {
const char *colorTestFuncs[] = { "#", "#", " != ", " == " }; // never/always don't make sense
if (colorTestFuncs[colorTestFunc][0] != '#') {
const char * test = colorTestFuncs[colorTestFunc];
const char *test = colorTestFuncs[colorTestFunc];
if (lang == HLSL_D3D11) {
WRITE(p, " uint3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
WRITE(p, " if ((v_scaled & u_alphacolormask.rgb) %s (u_alphacolorref.rgb & u_alphacolormask.rgb)) discard;\n", colorTestFuncs[colorTestFunc]);
WRITE(p, " uint3 v_masked = v_scaled & u_alphacolormask.rgb;\n");
WRITE(p, " uint3 colorTestRef = u_alphacolorref.rgb & u_alphacolormask.rgb;\n");
// We have to test the components separately, or we get incorrect results. See #10629.
WRITE(p, " if (v_masked.r %s colorTestRef.r && v_masked.g %s colorTestRef.g && v_masked.b %s colorTestRef.b) discard;\n", test, test, test);
} else {
// TODO: Use a texture to lookup bitwise ops instead?
WRITE(p, " float3 colortest = roundAndScaleTo255v(v.rgb);\n");
WRITE(p, " if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b )) clip(-1);\n", test, test, test);
WRITE(p, " if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b)) clip(-1);\n", test, test, test);
}
}
else {
@@ -58,7 +58,7 @@ PSShader::PSShader(LPDIRECT3DDEVICE9 device, FShaderID id, const char *code) : i
ERROR_LOG(G3D, "Error in shader compilation!");
}
ERROR_LOG(G3D, "Messages: %s", errorMessage.c_str());
ERROR_LOG(G3D, "Shader source:\n%s", code);
ERROR_LOG(G3D, "Shader source:\n%s", LineNumberString(code).c_str());
OutputDebugStringUTF8("Messages:\n");
OutputDebugStringUTF8(errorMessage.c_str());
Reporting::ReportMessage("D3D error in shader compilation: info: %s / code: %s", errorMessage.c_str(), code);
@@ -265,10 +265,10 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) {
PSSetColorUniform3(CONST_PS_TEXENV, gstate.texenvcolor);
}
if (dirtyUniforms & DIRTY_ALPHACOLORREF) {
PSSetColorUniform3Alpha255(CONST_PS_ALPHACOLORREF, gstate.getColorTestRef(), gstate.getAlphaTestRef());
PSSetColorUniform3Alpha255(CONST_PS_ALPHACOLORREF, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
}
if (dirtyUniforms & DIRTY_ALPHACOLORMASK) {
PSSetColorUniform3(CONST_PS_ALPHACOLORMASK, gstate.colortestmask);
PSSetColorUniform3Alpha255(CONST_PS_ALPHACOLORMASK, gstate.colortestmask, gstate.getAlphaTestMask());
}
if (dirtyUniforms & DIRTY_FOGCOLOR) {
PSSetColorUniform3(CONST_PS_FOGCOLOR, gstate.fogcolor);
@@ -744,25 +744,21 @@ TexCacheEntry::TexStatus TextureCacheGLES::CheckAlpha(const uint8_t *pixelData,
void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int scaleFactor, GLenum dstFmt) {
int w = gstate.getTextureWidth(level);
int h = gstate.getTextureHeight(level);
bool useUnpack = false;
uint8_t *pixelData;
// TODO: only do this once
u32 texByteAlign = 1;
int decPitch = 0;
gpuStats.numTexturesDecoded++;
if (replaced.GetSize(level, w, h)) {
PROFILE_THIS_SCOPE("replacetex");
int bpp = replaced.Format(level) == ReplacedTextureFormat::F_8888 ? 4 : 2;
uint8_t *rearrange = (uint8_t *)AllocateAlignedMemory(w * h * bpp, 16);
replaced.Load(level, rearrange, bpp * w);
decPitch = w * bpp;
uint8_t *rearrange = (uint8_t *)AllocateAlignedMemory(decPitch * h, 16);
replaced.Load(level, rearrange, decPitch);
pixelData = rearrange;
dstFmt = ToGLESFormat(replaced.Format(level));
texByteAlign = bpp;
} else {
PROFILE_THIS_SCOPE("decodetex");
@@ -771,14 +767,15 @@ void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &r
int bufw = GetTextureBufw(level, texaddr, GETextureFormat(entry.format));
int pixelSize = dstFmt == GL_UNSIGNED_BYTE ? 4 : 2;
int decPitch = w * pixelSize;
// We leave GL_UNPACK_ALIGNMENT at 4, so this must be at least 4.
decPitch = std::max(w * pixelSize, 4);
pixelData = (uint8_t *)AllocateAlignedMemory(decPitch * h * pixelSize, 16);
DecodeTextureLevel(pixelData, decPitch, GETextureFormat(entry.format), clutformat, texaddr, level, bufw, true, false, false);
// We check before scaling since scaling shouldn't invent alpha from a full alpha texture.
if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, useUnpack ? bufw : w, w, h);
TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / pixelSize, w, h);
entry.SetAlphaStatus(alphaStatus, level);
} else {
entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN);
@@ -789,11 +786,9 @@ void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &r
scaler.ScaleAlways((u32 *)rearrange, (u32 *)pixelData, dstFmt, w, h, scaleFactor);
FreeAlignedMemory(pixelData);
pixelData = rearrange;
decPitch = w * 4;
}
// Textures are always aligned to 16 bytes bufw, so this could safely be 4 always.
texByteAlign = dstFmt == GL_UNSIGNED_BYTE ? 4 : 2;
if (replacer_.Enabled()) {
ReplacedTextureDecodeInfo replacedInfo;
replacedInfo.cachekey = entry.CacheKey();
@@ -804,8 +799,7 @@ void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &r
replacedInfo.scaleFactor = scaleFactor;
replacedInfo.fmt = FromGLESFormat(dstFmt);
int bpp = dstFmt == GL_UNSIGNED_BYTE ? 4 : 2;
replacer_.NotifyTextureDecoded(replacedInfo, pixelData, (useUnpack ? bufw : w) * bpp, level, w, h);
replacer_.NotifyTextureDecoded(replacedInfo, pixelData, decPitch, level, w, h);
}
}

0 comments on commit 7a5ec71

Please sign in to comment.