Skip to content

Commit

Permalink
Merge pull request #5696 from unknownbrackets/texcache
Browse files Browse the repository at this point in the history
Use NEON for unswizzling, minor tweak to texcache
  • Loading branch information
hrydgard committed Mar 23, 2014
2 parents 8b92dce + 2482b2a commit 941b8b4
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 68 deletions.
55 changes: 52 additions & 3 deletions GPU/Common/TextureDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,16 +105,65 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
return check;
}

void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
#ifdef _M_SSE
const __m128i *src = (const __m128i *)texptr;
for (int by = 0; by < byc; by++) {
__m128i *xdest = (__m128i *)ydestp;
for (int bx = 0; bx < bxc; bx++) {
__m128i *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
__m128i temp2 = _mm_load_si128(src + 1);
__m128i temp3 = _mm_load_si128(src + 2);
__m128i temp4 = _mm_load_si128(src + 3);
_mm_store_si128(dest, temp1);
dest += pitch >> 2;
_mm_store_si128(dest, temp2);
dest += pitch >> 2;
_mm_store_si128(dest, temp3);
dest += pitch >> 2;
_mm_store_si128(dest, temp4);
dest += pitch >> 2;
src += 4;
}
xdest ++;
}
ydestp += (rowWidth * 8) / 4;
}
#else
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
#endif
}

QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;

// This has to be done after CPUDetect has done its magic.
void SetupQuickTexHash() {
void SetupTextureDecoder() {
#ifdef ARMV7
if (cpu_info.bNEON)
if (cpu_info.bNEON) {
DoQuickTexHash = &QuickTexHashNEON;
DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
}
#elif _M_SSE
if (cpu_info.bSSE2)
if (cpu_info.bSSE2) {
DoQuickTexHash = &QuickTexHashSSE2;
}
#endif
}

Expand Down
6 changes: 5 additions & 1 deletion GPU/Common/TextureDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@
#include "GPU/ge_constants.h"
#include "GPU/GPUState.h"

void SetupQuickTexHash();
void SetupTextureDecoder();

typedef u32 (*QuickTexHashFunc)(const void *checkp, u32 size);
extern QuickTexHashFunc DoQuickTexHash;

typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
extern UnswizzleTex16Func DoUnswizzleTex16;

// All these DXT structs are in the reverse order, as compared to PC.
// On PC, alpha comes before color, and interpolants are before the tile data.

Expand Down
34 changes: 33 additions & 1 deletion GPU/Common/TextureDecoderNEON.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
// Okay, do the memory hashing.
"QuickTexHashNEON_next:\n"
"pld [%2, #0xc0]\n"
"vldmia %2!, {d16-d23}\n"
"vld1.32 {d16, d17, d18, d19}, [%2, :128]!\n"
"vld1.32 {d20, d21, d22, d23}, [%2, :128]!\n"
"vmla.i32 q0, q1, q8\n"
"vmul.i32 q11, q11, q1\n"
"veor.i32 q0, q0, q9\n"
Expand Down Expand Up @@ -118,3 +119,34 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {

return check;
}

void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
__builtin_prefetch(texptr, 0, 0);
__builtin_prefetch(ydestp, 1, 1);

const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
uint32x4_t temp1 = vld1q_u32(src);
uint32x4_t temp2 = vld1q_u32(src + 4);
uint32x4_t temp3 = vld1q_u32(src + 8);
uint32x4_t temp4 = vld1q_u32(src + 12);
vst1q_u32(dest, temp1);
dest += pitch;
vst1q_u32(dest, temp2);
dest += pitch;
vst1q_u32(dest, temp3);
dest += pitch;
vst1q_u32(dest, temp4);
dest += pitch;
src += 16;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
}
3 changes: 2 additions & 1 deletion GPU/Common/TextureDecoderNEON.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@

#include "GPU/Common/TextureDecoder.h"

u32 QuickTexHashNEON(const void *checkp, u32 size);
u32 QuickTexHashNEON(const void *checkp, u32 size);
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
18 changes: 3 additions & 15 deletions GPU/Directx9/TextureCacheDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ TextureCacheDX9::TextureCacheDX9() : clearCacheNextFrame_(false), lowMemoryMode_
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
// glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
maxAnisotropyLevel = 16;
SetupQuickTexHash();
SetupTextureDecoder();
#ifdef _XBOX
// TODO: Maybe not? This decimates more often, but it may be speed harmful if unnecessary.
lowMemoryMode_ = true;
Expand Down Expand Up @@ -286,21 +286,9 @@ void *TextureCacheDX9::UnswizzleFromMem(u32 texaddr, u32 bufw, u32 bytesPerPixel

u32 ydest = 0;
if (rowWidth >= 16) {
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
u32 *ydestp = tmpTexBuf32.data();
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
// The most common one, so it gets an optimized implementation.
DoUnswizzleTex16(Memory::GetPointer(texaddr), ydestp, bxc, byc, pitch, rowWidth);
} else if (rowWidth == 8) {
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
for (int by = 0; by < byc; by++) {
Expand Down
49 changes: 2 additions & 47 deletions GPU/GLES/TextureCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false
clutBufConverted_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
SetupQuickTexHash();
SetupTextureDecoder();
}

TextureCache::~TextureCache() {
Expand Down Expand Up @@ -285,51 +285,6 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
}
}

static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
#ifdef _M_SSE
const __m128i *src = (const __m128i *)texptr;
for (int by = 0; by < byc; by++) {
__m128i *xdest = (__m128i *)ydestp;
for (int bx = 0; bx < bxc; bx++) {
__m128i *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
__m128i temp2 = _mm_load_si128(src + 1);
__m128i temp3 = _mm_load_si128(src + 2);
__m128i temp4 = _mm_load_si128(src + 3);
_mm_store_si128(dest, temp1);
dest += pitch >> 2;
_mm_store_si128(dest, temp2);
dest += pitch >> 2;
_mm_store_si128(dest, temp3);
dest += pitch >> 2;
_mm_store_si128(dest, temp4);
dest += pitch >> 2;
src += 4;
}
xdest ++;
}
ydestp += (rowWidth * 8) / 4;
}
#else
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
#endif
}

void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
const u32 pitch = rowWidth / 4;
Expand All @@ -342,7 +297,7 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
if (rowWidth >= 16) {
u32 *ydestp = tmpTexBuf32.data();
// The most common one, so it gets an optimized implementation.
Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
} else if (rowWidth == 8) {
const u32 *src = (const u32 *) texptr;
for (int by = 0; by < byc; by++) {
Expand Down

0 comments on commit 941b8b4

Please sign in to comment.