Skip to content

Commit

Permalink
Merge pull request #9585 from unknownbrackets/gpu-clear
Browse files Browse the repository at this point in the history
Skip VRAM clears when no download has been made
  • Loading branch information
hrydgard committed Apr 9, 2017
2 parents d703886 + 4a8c661 commit a85b76e
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 73 deletions.
66 changes: 0 additions & 66 deletions GPU/Common/DrawEngineCommon.cpp
Expand Up @@ -131,72 +131,6 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
return DrawEngineCommon::NormalizeVertices(outPtr, bufPtr, inPtr, dec, lowerBound, upperBound, vertType);
}

void DrawEngineCommon::ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor) {
u8 *addr = Memory::GetPointer(gstate.getFrameBufAddress());
const bool singleByteClear = (clearColor >> 16) == (clearColor & 0xFFFF) && (clearColor >> 24) == (clearColor & 0xFF);
const int bpp = gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2;
const int stride = gstate.FrameBufStride();
const int width = x2 - x1;

// Can use memset for simple cases. Often alpha is different and gums up the works.
// The check for bpp==4 etc is because we don't properly convert the clear color to the correct
// 16-bit format before computing the singleByteClear value. That could be done, but it was easier
// to just fall back to the generic case.
if (singleByteClear && (bpp == 4 || clearColor == 0)) {
const int byteStride = stride * bpp;
const int byteWidth = width * bpp;
addr += x1 * bpp;
for (int y = y1; y < y2; ++y) {
memset(addr + y * byteStride, clearColor, byteWidth);
}
} else {
u16 clear16 = 0;
switch (gstate.FrameBufFormat()) {
case GE_FORMAT_565: ConvertRGBA8888ToRGB565(&clear16, &clearColor, 1); break;
case GE_FORMAT_5551: ConvertRGBA8888ToRGBA5551(&clear16, &clearColor, 1); break;
case GE_FORMAT_4444: ConvertRGBA8888ToRGBA4444(&clear16, &clearColor, 1); break;
}

// This will most often be true - rarely is the width not aligned.
// TODO: We should really use non-temporal stores here to avoid the cache,
// as it's unlikely that these bytes will be read.
if ((width & 3) == 0 && (x1 & 3) == 0) {
u64 val64 = clearColor | ((u64)clearColor << 32);
int xstride = 2;
if (bpp == 2) {
// Spread to all eight bytes.
u64 c2 = clear16 | (clear16 << 16);
val64 = c2 | (c2 << 32);
xstride = 4;
}

u64 *addr64 = (u64 *)addr;
const int stride64 = stride / xstride;
const int x1_64 = x1 / xstride;
const int x2_64 = x2 / xstride;
for (int y = y1; y < y2; ++y) {
for (int x = x1_64; x < x2_64; ++x) {
addr64[y * stride64 + x] = val64;
}
}
} else if (bpp == 4) {
u32 *addr32 = (u32 *)addr;
for (int y = y1; y < y2; ++y) {
for (int x = x1; x < x2; ++x) {
addr32[y * stride + x] = clearColor;
}
}
} else if (bpp == 2) {
u16 *addr16 = (u16 *)addr;
for (int y = y1; y < y2; ++y) {
for (int x = x1; x < x2; ++x) {
addr16[y * stride + x] = clear16;
}
}
}
}
}

// This code is HIGHLY unoptimized!
//
// It does the simplest and safest test possible: If all points of a bbox are outside a single one of
Expand Down
1 change: 0 additions & 1 deletion GPU/Common/DrawEngineCommon.h
Expand Up @@ -66,7 +66,6 @@ class DrawEngineCommon {
// Preprocessing for spline/bezier
u32 NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, int lowerBound, int upperBound, u32 vertType);

void ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor);
bool ApplyShaderBlending();

VertexDecoder *GetVertexDecoder(u32 vtype);
Expand Down
89 changes: 87 additions & 2 deletions GPU/Common/FramebufferCommon.cpp
Expand Up @@ -23,6 +23,7 @@
#include "gfx_es2/gpu_features.h"

#include "i18n/i18n.h"
#include "Common/ColorConv.h"
#include "Common/Common.h"
#include "Core/Config.h"
#include "Core/CoreParameter.h"
Expand Down Expand Up @@ -564,6 +565,7 @@ void FramebufferManagerCommon::NotifyRenderFramebufferUpdated(VirtualFramebuffer
void FramebufferManagerCommon::NotifyRenderFramebufferSwitched(VirtualFramebuffer *prevVfb, VirtualFramebuffer *vfb, bool isClearingDepth) {
if (ShouldDownloadFramebuffer(vfb) && !vfb->memoryUpdated) {
ReadFramebufferToMemory(vfb, true, 0, 0, vfb->width, vfb->height);
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
} else {
DownloadFramebufferOnSwitch(prevVfb);
}
Expand Down Expand Up @@ -796,12 +798,13 @@ void FramebufferManagerCommon::DrawFramebufferToOutput(const u8 *srcPixels, GEBu
}

void FramebufferManagerCommon::DownloadFramebufferOnSwitch(VirtualFramebuffer *vfb) {
if (vfb && vfb->safeWidth > 0 && vfb->safeHeight > 0 && !vfb->firstFrameSaved) {
if (vfb && vfb->safeWidth > 0 && vfb->safeHeight > 0 && !vfb->firstFrameSaved && !vfb->memoryUpdated) {
// Some games will draw to some memory once, and use it as a render-to-texture later.
// To support this, we save the first frame to memory when we have a save w/h.
// To support this, we save the first frame to memory when we have a safe w/h.
// Saving each frame would be slow.
if (!g_Config.bDisableSlowFramebufEffects) {
ReadFramebufferToMemory(vfb, true, 0, 0, vfb->safeWidth, vfb->safeHeight);
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
vfb->firstFrameSaved = true;
vfb->safeWidth = 0;
vfb->safeHeight = 0;
Expand Down Expand Up @@ -1047,6 +1050,7 @@ void FramebufferManagerCommon::DecimateFBOs() {
if (ShouldDownloadFramebuffer(vfb) && age == 0 && !vfb->memoryUpdated) {
bool sync = gl_extensions.IsGLES;
ReadFramebufferToMemory(vfb, sync, 0, 0, vfb->width, vfb->height);
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
}

// Let's also "decimate" the usageFlags.
Expand Down Expand Up @@ -1265,6 +1269,7 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size,
WARN_LOG_REPORT_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, srcY, srcH, srcBuffer->bufferHeight);
} else if (g_Config.bBlockTransferGPU && !srcBuffer->memoryUpdated) {
ReadFramebufferToMemory(srcBuffer, true, 0, srcY, srcBuffer->width, srcH);
srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
}
return false;
} else {
Expand Down Expand Up @@ -1419,6 +1424,84 @@ VirtualFramebuffer *FramebufferManagerCommon::FindDownloadTempBuffer(VirtualFram
return nvfb;
}

void FramebufferManagerCommon::ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor) {
if (currentRenderVfb_) {
if ((currentRenderVfb_->usageFlags & FB_USAGE_DOWNLOAD_CLEAR) != 0) {
// Already zeroed in memory.
return;
}
}

u8 *addr = Memory::GetPointer(gstate.getFrameBufAddress());
const int bpp = gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2;

u32 clearBits = clearColor;
if (bpp == 2) {
u16 clear16 = 0;
switch (gstate.FrameBufFormat()) {
case GE_FORMAT_565: ConvertRGBA8888ToRGB565(&clear16, &clearColor, 1); break;
case GE_FORMAT_5551: ConvertRGBA8888ToRGBA5551(&clear16, &clearColor, 1); break;
case GE_FORMAT_4444: ConvertRGBA8888ToRGBA4444(&clear16, &clearColor, 1); break;
default: _dbg_assert_(G3D, 0); break;
}
clearBits = clear16 | (clear16 << 16);
}

const bool singleByteClear = (clearBits >> 16) == (clearBits & 0xFFFF) && (clearBits >> 24) == (clearBits & 0xFF);
const int stride = gstate.FrameBufStride();
const int width = x2 - x1;

// Can use memset for simple cases. Often alpha is different and gums up the works.
if (singleByteClear) {
const int byteStride = stride * bpp;
const int byteWidth = width * bpp;
addr += x1 * bpp;
for (int y = y1; y < y2; ++y) {
memset(addr + y * byteStride, clearBits, byteWidth);
}
} else {
// This will most often be true - rarely is the width not aligned.
// TODO: We should really use non-temporal stores here to avoid the cache,
// as it's unlikely that these bytes will be read.
if ((width & 3) == 0 && (x1 & 3) == 0) {
u64 val64 = clearBits | ((u64)clearBits << 32);
int xstride = 8 / bpp;

u64 *addr64 = (u64 *)addr;
const int stride64 = stride / xstride;
const int x1_64 = x1 / xstride;
const int x2_64 = x2 / xstride;
for (int y = y1; y < y2; ++y) {
for (int x = x1_64; x < x2_64; ++x) {
addr64[y * stride64 + x] = val64;
}
}
} else if (bpp == 4) {
u32 *addr32 = (u32 *)addr;
for (int y = y1; y < y2; ++y) {
for (int x = x1; x < x2; ++x) {
addr32[y * stride + x] = clearBits;
}
}
} else if (bpp == 2) {
u16 *addr16 = (u16 *)addr;
for (int y = y1; y < y2; ++y) {
for (int x = x1; x < x2; ++x) {
addr16[y * stride + x] = (u16)clearBits;
}
}
}
}

if (currentRenderVfb_) {
// The current content is in memory now, so update the flag.
if (x1 == 0 && y1 == 0 && x2 >= currentRenderVfb_->width && y2 >= currentRenderVfb_->height) {
currentRenderVfb_->usageFlags |= FB_USAGE_DOWNLOAD_CLEAR;
currentRenderVfb_->memoryUpdated = true;
}
}
}

void FramebufferManagerCommon::OptimizeDownloadRange(VirtualFramebuffer * vfb, int & x, int & y, int & w, int & h) {
if (gameUsesSequentialCopies_) {
// Ignore the x/y/etc., read the entire thing.
Expand All @@ -1430,6 +1513,7 @@ void FramebufferManagerCommon::OptimizeDownloadRange(VirtualFramebuffer * vfb, i
if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) {
// Mark it as fully downloaded until next render to it.
vfb->memoryUpdated = true;
vfb->usageFlags |= FB_USAGE_DOWNLOAD;
} else {
// Let's try to set the flag eventually, if the game copies a lot.
// Some games copy subranges very frequently.
Expand Down Expand Up @@ -1511,6 +1595,7 @@ bool FramebufferManagerCommon::NotifyBlockTransferBefore(u32 dstBasePtr, int dst
if (tooTall)
WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight);
ReadFramebufferToMemory(srcBuffer, true, static_cast<int>(srcX * srcXFactor), srcY, static_cast<int>(srcWidth * srcXFactor), srcHeight);
srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
}
}
return false; // Let the bit copy happen
Expand Down
3 changes: 3 additions & 0 deletions GPU/Common/FramebufferCommon.h
Expand Up @@ -31,6 +31,8 @@ enum {
FB_USAGE_RENDERTARGET = 2,
FB_USAGE_TEXTURE = 4,
FB_USAGE_CLUT = 8,
FB_USAGE_DOWNLOAD = 16,
FB_USAGE_DOWNLOAD_CLEAR = 32,
};

enum {
Expand Down Expand Up @@ -191,6 +193,7 @@ class FramebufferManagerCommon {
bool NotifyFramebufferCopy(u32 src, u32 dest, int size, bool isMemset, u32 skipDrawReason);
void NotifyVideoUpload(u32 addr, int size, int width, GEBufferFormat fmt);
void UpdateFromMemory(u32 addr, int size, bool safe);
void ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor);
virtual bool NotifyStencilUpload(u32 addr, int size, bool skipZero = false) = 0;
// Returns true if it's sure this is a direct FBO->FBO transfer and it has already handled it.
// In that case we hardly need to actually copy the bytes in VRAM, they will be wrong anyway (unless
Expand Down
2 changes: 1 addition & 1 deletion GPU/D3D11/DrawEngineD3D11.cpp
Expand Up @@ -935,7 +935,7 @@ void DrawEngineD3D11::DoFlush() {
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
int scissorX1 = gstate.getScissorX1();
int scissorY1 = gstate.getScissorY1();
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion GPU/Directx9/DrawEngineDX9.cpp
Expand Up @@ -874,7 +874,7 @@ void DrawEngineDX9::DoFlush() {
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
int scissorX1 = gstate.getScissorX1();
int scissorY1 = gstate.getScissorY1();
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion GPU/GLES/DrawEngineGLES.cpp
Expand Up @@ -976,7 +976,7 @@ void DrawEngineGLES::DoFlush() {
framebufferManager_->SetSafeSize(scissorX2, scissorY2);

if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && colorMask && (alphaMask || gstate.FrameBufFormat() == GE_FORMAT_565)) {
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion GPU/Vulkan/DrawEngineVulkan.cpp
Expand Up @@ -892,7 +892,7 @@ void DrawEngineVulkan::DoFlush(VkCommandBuffer cmd) {
framebufferManager_->SetSafeSize(scissorX2, scissorY2);

if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, result.color);
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, result.color);
}
}
}
Expand Down

0 comments on commit a85b76e

Please sign in to comment.