Merge pull request #5696 from unknownbrackets/texcache

Use NEON for unswizzling, minor tweak to texcache
hrydgard · Mar 23, 2014 · 941b8b4 · 941b8b4
2 parents 8b92dce + 2482b2a
commit 941b8b4
Show file tree

Hide file tree

Showing 6 changed files with 97 additions and 68 deletions.
diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
@@ -105,16 +105,65 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
 	return check;
 }
 
+void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
+#ifdef _M_SSE
+	const __m128i *src = (const __m128i *)texptr;
+	for (int by = 0; by < byc; by++) {
+		__m128i *xdest = (__m128i *)ydestp;
+		for (int bx = 0; bx < bxc; bx++) {
+			__m128i *dest = xdest;
+			for (int n = 0; n < 2; n++) {
+				// Textures are always 16-byte aligned so this is fine.
+				__m128i temp1 = _mm_load_si128(src);
+				__m128i temp2 = _mm_load_si128(src + 1);
+				__m128i temp3 = _mm_load_si128(src + 2);
+				__m128i temp4 = _mm_load_si128(src + 3);
+				_mm_store_si128(dest, temp1);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp2);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp3);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp4);
+				dest += pitch >> 2;
+				src += 4;
+			}
+			xdest ++;
+		}
+		ydestp += (rowWidth * 8) / 4;
+	}
+#else
+	const u32 *src = (const u32 *)texptr;
+	for (int by = 0; by < byc; by++) {
+		u32 *xdest = ydestp;
+		for (int bx = 0; bx < bxc; bx++) {
+			u32 *dest = xdest;
+			for (int n = 0; n < 8; n++) {
+				memcpy(dest, src, 16);
+				dest += pitch;
+				src += 4;
+			}
+			xdest += 4;
+		}
+		ydestp += (rowWidth * 8) / 4;
+	}
+#endif
+}
+
 QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
+UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
 
 // This has to be done after CPUDetect has done its magic.
-void SetupQuickTexHash() {
+void SetupTextureDecoder() {
 #ifdef ARMV7
-	if (cpu_info.bNEON)
+	if (cpu_info.bNEON) {
 		DoQuickTexHash = &QuickTexHashNEON;
+		DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
+	}
 #elif _M_SSE
-	if (cpu_info.bSSE2)
+	if (cpu_info.bSSE2) {
 		DoQuickTexHash = &QuickTexHashSSE2;
+	}
 #endif
 }
 

diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
@@ -22,10 +22,14 @@
 #include "GPU/ge_constants.h"
 #include "GPU/GPUState.h"
 
-void SetupQuickTexHash();
+void SetupTextureDecoder();
+
 typedef u32 (*QuickTexHashFunc)(const void *checkp, u32 size);
 extern QuickTexHashFunc DoQuickTexHash;
 
+typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
+extern UnswizzleTex16Func DoUnswizzleTex16;
+
 // All these DXT structs are in the reverse order, as compared to PC.
 // On PC, alpha comes before color, and interpolants are before the tile data.
 

diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp
@@ -84,7 +84,8 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
 			// Okay, do the memory hashing.
 			"QuickTexHashNEON_next:\n"
 			"pld [%2, #0xc0]\n"
-			"vldmia %2!, {d16-d23}\n"
+			"vld1.32 {d16, d17, d18, d19}, [%2, :128]!\n"
+			"vld1.32 {d20, d21, d22, d23}, [%2, :128]!\n"
 			"vmla.i32 q0, q1, q8\n"
 			"vmul.i32 q11, q11, q1\n"
 			"veor.i32 q0, q0, q9\n"
@@ -118,3 +119,34 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
 
 	return check;
 }
+
+void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
+	__builtin_prefetch(texptr, 0, 0);
+	__builtin_prefetch(ydestp, 1, 1);
+
+	const u32 *src = (const u32 *)texptr;
+	for (int by = 0; by < byc; by++) {
+		u32 *xdest = ydestp;
+		for (int bx = 0; bx < bxc; bx++) {
+			u32 *dest = xdest;
+			for (int n = 0; n < 2; n++) {
+				// Textures are always 16-byte aligned so this is fine.
+				uint32x4_t temp1 = vld1q_u32(src);
+				uint32x4_t temp2 = vld1q_u32(src + 4);
+				uint32x4_t temp3 = vld1q_u32(src + 8);
+				uint32x4_t temp4 = vld1q_u32(src + 12);
+				vst1q_u32(dest, temp1);
+				dest += pitch;
+				vst1q_u32(dest, temp2);
+				dest += pitch;
+				vst1q_u32(dest, temp3);
+				dest += pitch;
+				vst1q_u32(dest, temp4);
+				dest += pitch;
+				src += 16;
+			}
+			xdest += 4;
+		}
+		ydestp += (rowWidth * 8) / 4;
+	}
+}
diff --git a/GPU/Common/TextureDecoderNEON.h b/GPU/Common/TextureDecoderNEON.h
@@ -17,4 +17,5 @@
 
 #include "GPU/Common/TextureDecoder.h"
 
-u32 QuickTexHashNEON(const void *checkp, u32 size);
+u32 QuickTexHashNEON(const void *checkp, u32 size);
+void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp
@@ -58,7 +58,7 @@ TextureCacheDX9::TextureCacheDX9() : clearCacheNextFrame_(false), lowMemoryMode_
 	clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16);  // 16KB
 	// glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
 	maxAnisotropyLevel = 16;
-	SetupQuickTexHash();
+	SetupTextureDecoder();
 #ifdef _XBOX
 	// TODO: Maybe not?  This decimates more often, but it may be speed harmful if unnecessary.
 	lowMemoryMode_ = true;
@@ -286,21 +286,9 @@ void *TextureCacheDX9::UnswizzleFromMem(u32 texaddr, u32 bufw, u32 bytesPerPixel
 
 	u32 ydest = 0;
 	if (rowWidth >= 16) {
-		const u32 *src = (u32 *) Memory::GetPointer(texaddr);
 		u32 *ydestp = tmpTexBuf32.data();
-		for (int by = 0; by < byc; by++) {
-			u32 *xdest = ydestp;
-			for (int bx = 0; bx < bxc; bx++) {
-				u32 *dest = xdest;
-				for (int n = 0; n < 8; n++) {
-					memcpy(dest, src, 16);
-					dest += pitch;
-					src += 4;
-				}
-				xdest += 4;
-			}
-			ydestp += (rowWidth * 8) / 4;
-		}
+		// The most common one, so it gets an optimized implementation.
+		DoUnswizzleTex16(Memory::GetPointer(texaddr), ydestp, bxc, byc, pitch, rowWidth);
 	} else if (rowWidth == 8) {
 		const u32 *src = (u32 *) Memory::GetPointer(texaddr);
 		for (int by = 0; by < byc; by++) {

diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
@@ -66,7 +66,7 @@ TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false
 	clutBufConverted_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16);  // 16KB
 	clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16);  // 16KB
 	glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
-	SetupQuickTexHash();
+	SetupTextureDecoder();
 }
 
 TextureCache::~TextureCache() {
@@ -285,51 +285,6 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
 	}
 }
 
-static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
-#ifdef _M_SSE
-	const __m128i *src = (const __m128i *)texptr;
-	for (int by = 0; by < byc; by++) {
-		__m128i *xdest = (__m128i *)ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			__m128i *dest = xdest;
-			for (int n = 0; n < 2; n++) {
-				// Textures are always 16-byte aligned so this is fine.
-				__m128i temp1 = _mm_load_si128(src);
-				__m128i temp2 = _mm_load_si128(src + 1);
-				__m128i temp3 = _mm_load_si128(src + 2);
-				__m128i temp4 = _mm_load_si128(src + 3);
-				_mm_store_si128(dest, temp1);
-				dest += pitch >> 2;
-				_mm_store_si128(dest, temp2);
-				dest += pitch >> 2;
-				_mm_store_si128(dest, temp3);
-				dest += pitch >> 2;
-				_mm_store_si128(dest, temp4);
-				dest += pitch >> 2;
-				src += 4;
-			}
-			xdest ++;
-		}
-		ydestp += (rowWidth * 8) / 4;
-	}
-#else
-	const u32 *src = (const u32 *)texptr;
-	for (int by = 0; by < byc; by++) {
-		u32 *xdest = ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			u32 *dest = xdest;
-			for (int n = 0; n < 8; n++) {
-				memcpy(dest, src, 16);
-				dest += pitch;
-				src += 4;
-			}
-			xdest += 4;
-		}
-		ydestp += (rowWidth * 8) / 4;
-	}
-#endif
-}
-
 void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
 	const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
 	const u32 pitch = rowWidth / 4;
@@ -342,7 +297,7 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
 	if (rowWidth >= 16) {
 		u32 *ydestp = tmpTexBuf32.data();
 		// The most common one, so it gets an optimized implementation.
-		Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
+		DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
 	} else if (rowWidth == 8) {
 		const u32 *src = (const u32 *) texptr;
 		for (int by = 0; by < byc; by++) {