Merge pull request #10108 from unknownbrackets/vulkan-scaling
Remove texture checks for simple alpha
hrydgard committed Nov 13, 2017
2 parents 68c7b7d + 65e71f5 commit a1b3a43
Showing 13 changed files with 175 additions and 239 deletions.
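Note: the change drops the intermediate "simple alpha" classification (and the CHECKALPHA_ZERO result that fed it). A texture's alpha is now either known to be fully opaque or treated as arbitrary, and every per-format checker below reduces to the same pattern: AND pixel words together and test the alpha bits against a full-alpha mask. A minimal standalone sketch of that pattern, condensed from the RGBA8888 scalar path in this diff (the enum here is illustrative, not the project's real declaration):

    #include <cstdint>

    enum CheckAlphaResult { CHECKALPHA_FULL, CHECKALPHA_ANY };  // illustrative values

    // AND every RGBA8888 pixel of a row together; if the alpha byte of the
    // accumulated value is no longer 0xFF, some pixel had non-full alpha.
    CheckAlphaResult CheckAlphaSketch(const uint32_t *pixels, int stride, int w, int h) {
        for (int y = 0; y < h; ++y) {
            uint32_t bits = 0xFF000000;
            for (int x = 0; x < w; ++x)
                bits &= pixels[x];
            if (bits != 0xFF000000)
                return CHECKALPHA_ANY;   // early out: the answer cannot improve
            pixels += stride;            // stride is in pixels
        }
        return CHECKALPHA_FULL;
    }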
2 changes: 0 additions & 2 deletions Core/TextureReplacer.cpp
@@ -561,8 +561,6 @@ void ReplacedTexture::Load(int level, void *out, int rowPitch) {
 	CheckAlphaResult res = CheckAlphaRGBA8888Basic((u32 *)out, rowPitch / sizeof(u32), png.width, png.height);
 	if (res == CHECKALPHA_ANY || level == 0) {
 		alphaStatus_ = ReplacedTextureAlpha(res);
-	} else if (res == CHECKALPHA_ZERO && alphaStatus_ == ReplacedTextureAlpha::FULL) {
-		alphaStatus_ = ReplacedTextureAlpha(res);
 	}
 }

1 change: 0 additions & 1 deletion GPU/Common/TextureCacheCommon.cpp
@@ -1467,7 +1467,6 @@ void TextureCacheCommon::ApplyTexture() {
 	} else {
 		BindTexture(entry);
 		gstate_c.SetTextureFullAlpha(entry->GetAlphaStatus() == TexCacheEntry::STATUS_ALPHA_FULL);
-		gstate_c.SetTextureSimpleAlpha(entry->GetAlphaStatus() != TexCacheEntry::STATUS_ALPHA_UNKNOWN);
 	}
 }

7 changes: 3 additions & 4 deletions GPU/Common/TextureCacheCommon.h
@@ -99,8 +99,9 @@ struct TexCacheEntry {
 		STATUS_ALPHA_UNKNOWN = 0x04,
 		STATUS_ALPHA_FULL = 0x00,   // Has no alpha channel, or always full alpha.
-		STATUS_ALPHA_SIMPLE = 0x08, // Like above, but also has 0 alpha (e.g. 5551.)
-		STATUS_ALPHA_MASK = 0x0c,
+		STATUS_ALPHA_MASK = 0x04,
+
+		// 0x08 free.
 
 		STATUS_CHANGE_FREQUENT = 0x10, // Changes often (less than 6 frames in between.)
 		STATUS_CLUT_RECHECK = 0x20,    // Another texture with same addr had a hashfail.
@@ -166,8 +167,6 @@ struct TexCacheEntry {
 		// For non-level zero, only set more restrictive.
 		if (newStatus == STATUS_ALPHA_UNKNOWN || level == 0) {
 			SetAlphaStatus(newStatus);
-		} else if (newStatus == STATUS_ALPHA_SIMPLE && GetAlphaStatus() == STATUS_ALPHA_FULL) {
-			SetAlphaStatus(STATUS_ALPHA_SIMPLE);
 		}
 	}

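With STATUS_ALPHA_SIMPLE gone, only FULL (0x00) and UNKNOWN (0x04) remain, so one bit is enough and 0x08 is freed for other use. A hedged sketch of how such a mask is typically consumed; the enum values come from the diff above, but the enum name, the status field and the accessors below are assumptions made for illustration:

    // Illustrative only; not the real TexCacheEntry declaration.
    enum TexStatus {
        STATUS_ALPHA_FULL = 0x00,     // Has no alpha channel, or always full alpha.
        STATUS_ALPHA_UNKNOWN = 0x04,
        STATUS_ALPHA_MASK = 0x04,     // One bit now covers the alpha state; 0x08 is free.
    };

    struct TexCacheEntrySketch {
        unsigned status;
        TexStatus GetAlphaStatus() const {
            return TexStatus(status & STATUS_ALPHA_MASK);   // hypothetical accessor
        }
        bool IsFullAlpha() const {
            return GetAlphaStatus() == STATUS_ALPHA_FULL;
        }
    };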
205 changes: 75 additions & 130 deletions GPU/Common/TextureDecoder.cpp
@@ -438,92 +438,57 @@ static inline u32 CombineSSEBitsToDWORD(const __m128i &v) {
 }
 
 CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
-	const __m128i full = _mm_set1_epi32(0xFF);
+	const __m128i mask = _mm_set1_epi32(0xFF000000);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w4 = w / 4;
 	const int stride4 = stride / 4;
 
-	// Have alpha values == 0 been seen?
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
-		// Have alpha values > 0 and < 0xFF been seen?
-		__m128i hasAnyCursor = _mm_setzero_si128();
-
 		for (int i = 0; i < w4; ++i) {
-			const __m128i a = _mm_srli_epi32(_mm_load_si128(&p[i]), 24);
-
-			const __m128i isZero = _mm_cmpeq_epi32(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
-
-			// If a = FF, isNotFull will be 0 -> hasAny will be 0.
-			// If a = 00, a & isNotFull will be 0 -> hasAny will be 0.
-			// In any other case, hasAny will have some bits set.
-			const __m128i isNotFull = _mm_cmplt_epi32(a, full);
-			hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
-		p += stride4;
 
-		// We check any early, in case we can skip the rest of the rows.
-		if (CombineSSEBitsToDWORD(hasAnyCursor) != 0) {
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
 			return CHECKALPHA_ANY;
 		}
-	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
+		p += stride4;
 	}
+
+	return CHECKALPHA_FULL;
 }
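A small assumed test harness (not part of the commit) showing how this checker is exercised; the include path and the visibility of the SSE2 entry point in that header are assumptions, and w and stride are given in pixels:

    #include <cstdio>
    #include "GPU/Common/TextureDecoder.h"   // assumed path

    int main() {
        // 4x4 RGBA8888 texture, 16-byte aligned so the SSE2 path is legal.
        alignas(16) u32 tex[4 * 4];
        for (int i = 0; i < 16; ++i)
            tex[i] = 0xFF000000 | i;   // full alpha everywhere...
        tex[5] = 0x80123456;           // ...except one pixel with alpha 0x80

        // w and stride must be multiples of 4 pixels for this path.
        CheckAlphaResult r = CheckAlphaRGBA8888SSE2(tex, 4, 4, 4);
        printf("%s\n", r == CHECKALPHA_ANY ? "CHECKALPHA_ANY" : "CHECKALPHA_FULL");
    }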


 CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
-	const __m128i full = _mm_set1_epi16((short)0xF000);
+	const __m128i mask = _mm_set1_epi16((short)0x000F);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
-		__m128i hasAnyCursor = _mm_setzero_si128();
-
 		for (int i = 0; i < w8; ++i) {
-			// This moves XXXA to A000.
-			const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 12);
-
-			// At least one bit in isZero, and therefore hasZeroCursor, will get set if there's a zero.
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
-
-			// If a = F, isFull will be 1 -> hasAny will be 0.
-			// If a = 0, a & !isFull will be 0 -> hasAny will be 0.
-			// In any other case, hasAny will have some bits set.
-			const __m128i isFull = _mm_cmpeq_epi32(a, full);
-			const __m128i hasAny = _mm_andnot_si128(isFull, a);
-			hasAnyCursor = _mm_or_si128(hasAnyCursor, hasAny);
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
-		p += stride8;
 
-		// We check any early, in case we can skip the rest of the rows.
-		if (CombineSSEBitsToDWORD(hasAnyCursor) != 0) {
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
 			return CHECKALPHA_ANY;
 		}
-	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
+		p += stride8;
 	}
+
+	return CHECKALPHA_FULL;
 }


 CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i mask = _mm_set1_epi16(1);
+	const __m128i mask = _mm_set1_epi16((short)0x0001);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
@@ -538,7 +503,7 @@ CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
 
 		__m128i result = _mm_xor_si128(bits, mask);
 		if (CombineSSEBitsToDWORD(result) != 0) {
-			return CHECKALPHA_ZERO;
+			return CHECKALPHA_ANY;
 		}
 
 		p += stride8;
@@ -548,43 +513,28 @@ CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
 }
 
 CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
-	const __m128i full = _mm_set1_epi16(0x000F);
+	const __m128i mask = _mm_set1_epi16((short)0xF000);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
-		__m128i hasAnyCursor = _mm_setzero_si128();
-
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 12);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
-
-			// If a = F, isNotFull will be 0 -> hasAny will be 0.
-			// If a = 0, a & isNotFull will be 0 -> hasAny will be 0.
-			// In any other case, hasAny will have some bits set.
-			const __m128i isNotFull = _mm_cmplt_epi32(a, full);
-			hasAnyCursor = _mm_or_si128(hasAnyCursor, _mm_and_si128(a, isNotFull));
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
-		p += stride8;
 
-		// We check any early, in case we can skip the rest of the rows.
-		if (CombineSSEBitsToDWORD(hasAnyCursor) != 0) {
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
 			return CHECKALPHA_ANY;
 		}
-	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
+		p += stride8;
 	}
+
+	return CHECKALPHA_FULL;
 }
 
@@ -603,7 +553,7 @@ CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
 
 		__m128i result = _mm_xor_si128(bits, mask);
 		if (CombineSSEBitsToDWORD(result) != 0) {
-			return CHECKALPHA_ZERO;
+			return CHECKALPHA_ANY;
 		}
@@ -625,26 +575,22 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
 #endif
 	}
 
-	u32 hitZeroAlpha = 0;
-
 	const u32 *p = pixelData;
 	for (int y = 0; y < h; ++y) {
+		u32 bits = 0xFF000000;
 		for (int i = 0; i < w; ++i) {
-			u32 a = p[i] & 0xFF000000;
-			hitZeroAlpha |= a ^ 0xFF000000;
-			if (a != 0xFF000000 && a != 0) {
-				// We're done, we hit non-zero, non-full alpha.
-				return CHECKALPHA_ANY;
-			}
+			bits &= p[i];
 		}
+
+		if (bits != 0xFF000000) {
+			// We're done, we hit non-full alpha.
+			return CHECKALPHA_ANY;
+		}
+
 		p += stride;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
@@ -659,29 +605,25 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
 #endif
 	}
 
-	u32 hitZeroAlpha = 0;
-
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
 	for (int y = 0; y < h; ++y) {
+		u32 bits = 0x000F000F;
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x000F000F;
-			hitZeroAlpha |= a ^ 0x000F000F;
-			if (a != 0x000F000F && a != 0x0000000F && a != 0x000F0000 && a != 0) {
-				// We're done, we hit non-zero, non-full alpha.
-				return CHECKALPHA_ANY;
-			}
+			bits &= p[i];
 		}
+
+		if (bits != 0x000F000F) {
+			// We're done, we hit non-full alpha.
+			return CHECKALPHA_ANY;
+		}
+
 		p += stride2;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
@@ -700,14 +642,14 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
-	u32 bits = 0x00010001;
 	for (int y = 0; y < h; ++y) {
+		u32 bits = 0x00010001;
 		for (int i = 0; i < w2; ++i) {
 			bits &= p[i];
 		}
 
-		if ((bits ^ 0x00010001) != 0) {
-			return CHECKALPHA_ZERO;
+		if (bits != 0x00010001) {
+			return CHECKALPHA_ANY;
 		}
 
 		p += stride2;
 	}
@@ ... @@
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
 	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
-	}
+#elif PPSSPP_ARCH(ARMV7) || PPSSPP_ARCH(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
+		}
 #endif
-
-	u32 hitZeroAlpha = 0;
+	}
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
 	for (int y = 0; y < h; ++y) {
+		u32 bits = 0xF000F000;
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0xF000F000;
-			hitZeroAlpha |= a ^ 0xF000F000;
-			if (a != 0xF000F000 && a != 0xF0000000 && a != 0x0000F000 && a != 0) {
-				// We're done, we hit non-zero, non-full alpha.
-				return CHECKALPHA_ANY;
-			}
+			bits &= p[i];
 		}
+
+		if (bits != 0xF000F000) {
+			// We're done, we hit non-full alpha.
+			return CHECKALPHA_ANY;
+		}
+
 		p += stride2;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }

@@ ... @@
 CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
 	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
-	}
+#elif PPSSPP_ARCH(ARMV7) || PPSSPP_ARCH(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
+		}
 #endif
-
-	u32 bits = 0x80008000;
+	}
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
 	for (int y = 0; y < h; ++y) {
+		u32 bits = 0x80008000;
 		for (int i = 0; i < w2; ++i) {
 			bits &= p[i];
 		}
 
-		if ((bits ^ 0x80008000) != 0) {
-			return CHECKALPHA_ZERO;
+		if (bits != 0x80008000) {
+			return CHECKALPHA_ANY;
 		}
 
 		p += stride2;
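The NEON counterparts referenced by the new #elif branches (CheckAlphaRGBA4444NEON, CheckAlphaRGBA5551NEON, and friends) live elsewhere in the tree and are not part of this commit. Purely as a hedged sketch of what an AND-accumulating NEON check of the RGBA8888 case could look like, not the project's actual NEON code; the include path is assumed:

    #include <arm_neon.h>
    #include "GPU/Common/TextureDecoder.h"   // assumed path for CheckAlphaResult / u32

    // Sketch only: AND all pixels of a row together, then test whether the
    // accumulated alpha byte is still 0xFF. Assumes 16-byte alignment and
    // w, stride being multiples of 4 pixels, like the SSE2 paths above.
    CheckAlphaResult CheckAlphaRGBA8888NEONSketch(const u32 *pixelData, int stride, int w, int h) {
        for (int y = 0; y < h; ++y) {
            uint32x4_t bits = vdupq_n_u32(0xFF000000);
            for (int i = 0; i < w; i += 4) {
                bits = vandq_u32(bits, vld1q_u32(pixelData + i));
            }
            // Fold the four lanes down to one word.
            u32 folded = vgetq_lane_u32(bits, 0) & vgetq_lane_u32(bits, 1) &
                         vgetq_lane_u32(bits, 2) & vgetq_lane_u32(bits, 3);
            if ((folded & 0xFF000000) != 0xFF000000)
                return CHECKALPHA_ANY;
            pixelData += stride;
        }
        return CHECKALPHA_FULL;
    }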