Permalink
Browse files

Fastpath in fastrunloop when diff=0. Remove need for Execute for UV s…

…cale/offset.
  • Loading branch information...
hrydgard committed Aug 17, 2017
1 parent 71baeca commit 2c4e5e2303fc6f0fd1cc880cbed1154013f540b4
Showing with 127 additions and 80 deletions.
  1. +0 −2 Common/Vulkan/VulkanContext.h
  2. +28 −14 GPU/D3D11/GPU_D3D11.cpp
  3. +28 −14 GPU/Directx9/GPU_DX9.cpp
  4. +25 −15 GPU/GLES/GPU_GLES.cpp
  5. +6 −20 GPU/GPUCommon.cpp
  6. +15 −0 GPU/GPUCommon.h
  7. +25 −15 GPU/Vulkan/GPU_Vulkan.cpp
@@ -314,7 +314,6 @@ class VulkanContext {
const VkPhysicalDeviceFeatures &GetFeaturesEnabled() const { return featuresEnabled_; }
const VulkanPhysicalDeviceInfo &GetDeviceInfo() const { return deviceInfo_; }
private:
VkSemaphore acquireSemaphore;
VkSemaphore renderingCompleteSemaphore;
@@ -381,7 +380,6 @@ class VulkanContext {
VulkanDeleteList deleteList;
};
FrameData frame_[2];
int curFrame_;
View
@@ -389,27 +389,38 @@ void GPU_D3D11::CopyDisplayToOutputInternal() {
void GPU_D3D11::FastRunLoop(DisplayList &list) {
PROFILE_THIS_SCOPE("gpuloop");
const CommandInfo *cmdInfo = cmdInfo_;
for (; downcount > 0; --downcount) {
int dc = downcount;
for (; dc > 0; --dc) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
(this->*info.func)(op, diff);
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
downcount = 0;
}
void GPU_D3D11::FinishDeferred() {
@@ -533,6 +544,7 @@ void GPU_D3D11::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
int vertexCost = EstimatePerVertexCost() * count;
@@ -598,6 +610,7 @@ void GPU_D3D11::Execute_Bezier(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
if (gstate_c.bezier)
@@ -668,6 +681,7 @@ void GPU_D3D11::Execute_Spline(u32 op, u32 diff) {
}
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
if (gstate_c.spline)
View
@@ -356,27 +356,38 @@ void GPU_DX9::CopyDisplayToOutputInternal() {
void GPU_DX9::FastRunLoop(DisplayList &list) {
PROFILE_THIS_SCOPE("gpuloop");
const CommandInfo *cmdInfo = cmdInfo_;
for (; downcount > 0; --downcount) {
int dc = downcount;
for (; dc > 0; --dc) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
(this->*info.func)(op, diff);
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
downcount = 0;
}
void GPU_DX9::FinishDeferred() {
@@ -499,6 +510,7 @@ void GPU_DX9::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
int vertexCost = EstimatePerVertexCost() * count;
@@ -553,6 +565,7 @@ void GPU_DX9::Execute_Bezier(u32 op, u32 diff) {
bool computeNormals = gstate.isLightingEnabled();
bool patchFacing = gstate.patchfacing & 1;
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
// After drawing, we advance pointers - see SubmitPrim which does the same.
@@ -605,6 +618,7 @@ void GPU_DX9::Execute_Spline(u32 op, u32 diff) {
bool patchFacing = gstate.patchfacing & 1;
u32 vertType = gstate.vertType;
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
// After drawing, we advance pointers - see SubmitPrim which does the same.
View
@@ -567,22 +567,29 @@ void GPU_GLES::FastRunLoop(DisplayList &list) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
@@ -676,6 +683,7 @@ void GPU_GLES::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, gstate.vertType, &bytesRead);
int vertexCost = EstimatePerVertexCost();
@@ -772,6 +780,7 @@ void GPU_GLES::Execute_Bezier(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
if (gstate_c.bezier)
@@ -843,6 +852,7 @@ void GPU_GLES::Execute_Spline(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
if (gstate_c.spline)
View
@@ -111,10 +111,12 @@ const CommonCommandTableEntry commonCommandTable[] = {
{ GE_CMD_LOGICOPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_BLEND_STATE | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_TEXMAPMODE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_TEXSCALEU, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_TexScaleU },
{ GE_CMD_TEXSCALEV, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_TexScaleV },
{ GE_CMD_TEXOFFSETU, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_TexOffsetU },
{ GE_CMD_TEXOFFSETV, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_TexOffsetV },
// These are read on every SubmitPrim, no need for dirtying or flushing.
{ GE_CMD_TEXSCALEU },
{ GE_CMD_TEXSCALEV },
{ GE_CMD_TEXOFFSETU },
{ GE_CMD_TEXOFFSETV },
// TEXSIZE0 is handled by each backend.
{ GE_CMD_TEXSIZE1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS },
@@ -1425,22 +1427,6 @@ void GPUCommon::Execute_End(u32 op, u32 diff) {
}
}
void GPUCommon::Execute_TexScaleU(u32 op, u32 diff) {
gstate_c.uv.uScale = getFloat24(op);
}
void GPUCommon::Execute_TexScaleV(u32 op, u32 diff) {
gstate_c.uv.vScale = getFloat24(op);
}
void GPUCommon::Execute_TexOffsetU(u32 op, u32 diff) {
gstate_c.uv.uOff = getFloat24(op);
}
void GPUCommon::Execute_TexOffsetV(u32 op, u32 diff) {
gstate_c.uv.vOff = getFloat24(op);
}
void GPUCommon::Execute_TexLevel(u32 op, u32 diff) {
if (diff == 0xFFFFFFFF) return;
View
@@ -196,6 +196,21 @@ class GPUCommon : public GPUThreadEventQueue, public GPUDebugInterface {
GPUgstate GetGState() override;
void SetCmdValue(u32 op) override;
void UpdateUVScaleOffset() {
#ifdef _M_SSE
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_PLATFORM(ARM_NEON)
const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8);
vst1q_u32(&gstate_c.uv, values);
#else
gstate_c.uv.uScale = getFloat24(gstate.texscaleu);
gstate_c.uv.vScale = getFloat24(gstate.texscalev);
gstate_c.uv.uOff = getFloat24(gstate.texoffsetu);
gstate_c.uv.vOff = getFloat24(gstate.texoffsetv);
#endif
}
DisplayList* getList(int listid) override {
return &dls[listid];
}
View
@@ -398,22 +398,29 @@ void GPU_Vulkan::FastRunLoop(DisplayList &list) {
// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
const u32 op = *(const u32 *)(Memory::base + list.pc);
const u32 cmd = op >> 24;
const CommandInfo info = cmdInfo[cmd];
const u8 cmdFlags = info.flags; // If we stashed the cmdFlags in the top bits of the cmdmem, we could get away with one table lookup instead of two
const CommandInfo &info = cmdInfo[cmd];
const u32 diff = op ^ gstate.cmdmem[cmd];
// Inlined CheckFlushOp here to get rid of the dumpThisFrame_ check.
if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else if (diff) {
uint64_t dirty = info.flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
if (diff == 0) {
if (info.flags & FLAG_EXECUTE) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
}
} else {
uint64_t flags = info.flags;
if (flags & FLAG_FLUSHBEFOREONCHANGE) {
drawEngine_.Flush();
}
gstate.cmdmem[cmd] = op; // TODO: no need to write if diff==0...
if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
downcount = dc;
(this->*info.func)(op, diff);
dc = downcount;
} else {
uint64_t dirty = flags >> 8;
if (dirty)
gstate_c.Dirty(dirty);
}
}
list.pc += 4;
}
@@ -507,6 +514,7 @@ void GPU_Vulkan::Execute_Prim(u32 op, u32 diff) {
#endif
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitPrim(verts, inds, prim, count, gstate.vertType, &bytesRead);
int vertexCost = EstimatePerVertexCost() * count;
@@ -603,6 +611,7 @@ void GPU_Vulkan::Execute_Bezier(u32 op, u32 diff) {
}
}
UpdateUVScaleOffset();
drawEngine_.SubmitBezier(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), bz_ucount, bz_vcount, patchPrim, computeNormals, patchFacing, gstate.vertType, &bytesRead);
if (gstate_c.bezier)
@@ -674,6 +683,7 @@ void GPU_Vulkan::Execute_Spline(u32 op, u32 diff) {
}
int bytesRead = 0;
UpdateUVScaleOffset();
drawEngine_.SubmitSpline(control_points, indices, gstate.getPatchDivisionU(), gstate.getPatchDivisionV(), sp_ucount, sp_vcount, sp_utype, sp_vtype, patchPrim, computeNormals, patchFacing, vertType, &bytesRead);
if (gstate_c.spline)

0 comments on commit 2c4e5e2

Please sign in to comment.