Permalink
Browse files

Merge pull request #9906 from hrydgard/fastrunloop-optimize

GPU FastRunLoop optimizations
  • Loading branch information...
hrydgard committed Aug 17, 2017
2 parents e50cb4f + 319df97 commit d60bb27cd981588bc9d7be5e7cc7bc1014272b86
Showing with 156 additions and 88 deletions.
  1. +0 −2 Common/Vulkan/VulkanContext.h
  2. +33 −15 GPU/D3D11/GPU_D3D11.cpp
  3. +34 −16 GPU/Directx9/GPU_DX9.cpp
  4. +30 −17 GPU/GLES/GPU_GLES.cpp
  5. +12 −20 GPU/GPUCommon.cpp
  6. +18 −1 GPU/GPUCommon.h
  7. +29 −17 GPU/Vulkan/GPU_Vulkan.cpp
@@ -314,7 +314,6 @@ class VulkanContext {
const VkPhysicalDeviceFeatures &GetFeaturesEnabled() const { return featuresEnabled_; }
const VulkanPhysicalDeviceInfo &GetDeviceInfo() const { return deviceInfo_; }
private:
VkSemaphore acquireSemaphore;
VkSemaphore renderingCompleteSemaphore;
@@ -381,7 +380,6 @@ class VulkanContext {
VulkanDeleteList deleteList;
};
FrameData frame_[2];
int curFrame_;
View
@@ -73,6 +73,7 @@ struct D3D11CommandTableEntry {
GPU_D3D11::CmdFunc func;
};
// This table gets crunched into a faster form by init.
static const D3D11CommandTableEntry commandTable[] = {
// Changes that dirty the current texture.
{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_D3D11::Execute_TexSize0 },
@@ -88,7 +89,6 @@ static const D3D11CommandTableEntry commandTable[] = {
// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_D3D11::Execute_LoadClut },
{ GE_CMD_TRANSFERSTART, FLAG_FLUSHBEFORE | FLAG_EXECUTE | FLAG_READS_PC, 0, &GPUCommon::Execute_BlockTransferStart },
};
GPU_D3D11::CommandInfo GPU_D3D11::cmdInfo_[256]{};
@@ -389,27 +389,38 @@ void GPU_D3D11::CopyDisplayToOutputInternal() {
void GPU_D3D11::FastRunLoop(DisplayList &list) {
PROFILE_THIS_SCOPE("gpuloop");
const CommandInfo *cmdInfo = cmdInfo_;
for (; downcount > 0; --downcount) {
int dc = downcount;
for (; dc > 0; --dc) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if ((cmdFlags & FLAG_FLUSHBEFORE) || (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE))) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
(this->*info.func)(op, diff);
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op;
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
downcount = 0;
}
void GPU_D3D11::FinishDeferred() {
@@ -533,6 +544,7 @@ void GPU_D3D11::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
int vertexCost = EstimatePerVertexCost() * count;
@@ -546,6 +558,8 @@ void GPU_D3D11::Execute_Prim(u32 op, u32 diff) {
}
void GPU_D3D11::Execute_Bezier(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -596,6 +610,7 @@ void GPU_D3D11::Execute_Bezier(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
if (gstate_c.bezier)
@@ -608,6 +623,8 @@ void GPU_D3D11::Execute_Bezier(u32 op, u32 diff) {
}
void GPU_D3D11::Execute_Spline(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -664,6 +681,7 @@ void GPU_D3D11::Execute_Spline(u32 op, u32 diff) {
}
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
if (gstate_c.spline)
View
@@ -57,6 +57,7 @@ struct D3D9CommandTableEntry {
GPU_DX9::CmdFunc func;
};
// This table gets crunched into a faster form by init.
static const D3D9CommandTableEntry commandTable[] = {
// Changes that dirty the current texture.
{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_DX9::Execute_TexSize0 },
@@ -72,7 +73,6 @@ static const D3D9CommandTableEntry commandTable[] = {
// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_DX9::Execute_LoadClut },
{ GE_CMD_TRANSFERSTART, FLAG_FLUSHBEFORE | FLAG_EXECUTE | FLAG_READS_PC, 0, &GPUCommon::Execute_BlockTransferStart },
};
GPU_DX9::CommandInfo GPU_DX9::cmdInfo_[256];
@@ -356,27 +356,38 @@ void GPU_DX9::CopyDisplayToOutputInternal() {
void GPU_DX9::FastRunLoop(DisplayList &list) {
PROFILE_THIS_SCOPE("gpuloop");
const CommandInfo *cmdInfo = cmdInfo_;
for (; downcount > 0; --downcount) {
int dc = downcount;
for (; dc > 0; --dc) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if ((cmdFlags & FLAG_FLUSHBEFORE) || (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE))) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
(this->*info.func)(op, diff);
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op;
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
downcount = 0;
}
void GPU_DX9::FinishDeferred() {
@@ -386,7 +397,7 @@ void GPU_DX9::FinishDeferred() {
inline void GPU_DX9::CheckFlushOp(int cmd, u32 diff) {
const u8 cmdFlags = cmdInfo_[cmd].flags;
if ((cmdFlags & FLAG_FLUSHBEFORE) || (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE))) {
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
if (dumpThisFrame_) {
NOTICE_LOG(G3D, "================ FLUSH ================");
}
@@ -499,6 +510,7 @@ void GPU_DX9::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
int vertexCost = EstimatePerVertexCost() * count;
@@ -512,6 +524,8 @@ void GPU_DX9::Execute_Prim(u32 op, u32 diff) {
}
void GPU_DX9::Execute_Bezier(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -551,6 +565,7 @@ void GPU_DX9::Execute_Bezier(u32 op, u32 diff) {
bool computeNormals = gstate.isLightingEnabled();
bool patchFacing = gstate.patchfacing & 1;
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
// After drawing, we advance pointers - see SubmitPrim which does the same.
@@ -559,6 +574,8 @@ void GPU_DX9::Execute_Bezier(u32 op, u32 diff) {
}
void GPU_DX9::Execute_Spline(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -601,6 +618,7 @@ void GPU_DX9::Execute_Spline(u32 op, u32 diff) {
bool patchFacing = gstate.patchfacing & 1;
u32 vertType = gstate.vertType;
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
// After drawing, we advance pointers - see SubmitPrim which does the same.
View
@@ -76,7 +76,6 @@ static const GLESCommandTableEntry commandTable[] = {
// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_GLES::Execute_LoadClut },
{ GE_CMD_TRANSFERSTART, FLAG_FLUSHBEFORE | FLAG_EXECUTE | FLAG_READS_PC, 0, &GPUCommon::Execute_BlockTransferStart },
};
GPU_GLES::CommandInfo GPU_GLES::cmdInfo_[256];
@@ -567,22 +566,29 @@ void GPU_GLES::FastRunLoop(DisplayList &list) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if ((cmdFlags & FLAG_FLUSHBEFORE) || (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE))) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op;
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
@@ -596,7 +602,7 @@ void GPU_GLES::FinishDeferred() {
inline void GPU_GLES::CheckFlushOp(int cmd, u32 diff) {
const u8 cmdFlags = cmdInfo_[cmd].flags;
if ((cmdFlags & FLAG_FLUSHBEFORE) || (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE))) {
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
if (dumpThisFrame_) {
NOTICE_LOG(G3D, "================ FLUSH ================");
}
@@ -676,6 +682,7 @@ void GPU_GLES::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, gstate.vertType, &bytesRead);
int vertexCost = EstimatePerVertexCost();
@@ -720,6 +727,8 @@ void GPU_GLES::Execute_VertexTypeSkinning(u32 op, u32 diff) {
}
void GPU_GLES::Execute_Bezier(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -770,6 +779,7 @@ void GPU_GLES::Execute_Bezier(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
if (gstate_c.bezier)
@@ -782,6 +792,8 @@ void GPU_GLES::Execute_Bezier(u32 op, u32 diff) {
}
void GPU_GLES::Execute_Spline(u32 op, u32 diff) {
Flush();
// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
@@ -839,6 +851,7 @@ void GPU_GLES::Execute_Spline(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
if (gstate_c.spline)
Oops, something went wrong.

0 comments on commit d60bb27

Please sign in to comment.