Skip to content

Commit

Permalink
Merge pull request #10454 from unknownbrackets/gpu-minor
Browse files Browse the repository at this point in the history
Vulkan: Use depth clamping, where available
  • Loading branch information
hrydgard committed Dec 27, 2017
2 parents 8ebbb82 + 6bdf39c commit ea50561
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 42 deletions.
4 changes: 4 additions & 0 deletions GPU/Common/GPUStateUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,9 @@ float DepthSliceFactor() {
if (gstate_c.Supports(GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT)) {
return DEPTH_SLICE_FACTOR_16BIT;
}
if (gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP)) {
return 1.0f;
}
return DEPTH_SLICE_FACTOR_HIGH;
}

Expand Down Expand Up @@ -681,6 +684,7 @@ void ConvertViewportAndScissor(bool useBufferedRendering, float renderWidth, flo
// So, we apply the depth range as minz/maxz, and transform for the viewport.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
// TODO: This clip the entire draw if minz > maxz.
float minz = gstate.getDepthRangeMin();
float maxz = gstate.getDepthRangeMax();

Expand Down
3 changes: 1 addition & 2 deletions GPU/GPUCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ const CommonCommandTableEntry commonCommandTable[] = {
{ GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_VIEWPORTZSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_VIEWPORTZCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_CLIPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE },
{ GE_CMD_CLIPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE },

// Z clip
{ GE_CMD_MINZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
Expand Down Expand Up @@ -1739,7 +1739,6 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
const int end = 12 * 8 - (op & 0x7F);
int i = 0;

// TODO: Validate what should happen when explicitly setting num to 96 or higher.
bool fastLoad = !debugRecording_ && end > 0;
if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
fastLoad = false;
Expand Down
124 changes: 92 additions & 32 deletions GPU/GPUState.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ alignas(16) GPUgstate gstate;
// Let's align this one too for good measure.
alignas(16) GPUStateCache gstate_c;

// For save state compatibility.
static int savedContextVersion = 1;

struct CmdRange {
u8 start;
u8 end;
Expand Down Expand Up @@ -77,6 +80,25 @@ static const CmdRange contextCmdRanges[] = {
// Skip: {0xFA, 0xFF},
};

static u32_le *SaveMatrix(u32_le *cmds, const float *mtx, int sz, int numcmd, int datacmd) {
*cmds++ = numcmd << 24;
for (int i = 0; i < sz; ++i) {
*cmds++ = (datacmd << 24) | toFloat24(mtx[i]);
}

return cmds;
}

static const u32_le *LoadMatrix(const u32_le *cmds, float *mtx, int sz) {
// Skip the reset.
cmds++;
for (int i = 0; i < sz; ++i) {
mtx[i] = getFloat24(*cmds++);
}

return cmds;
}

void GPUgstate::Reset() {
memset(gstate.cmdmem, 0, sizeof(gstate.cmdmem));
for (int i = 0; i < 256; i++) {
Expand All @@ -89,6 +111,8 @@ void GPUgstate::Reset() {
memset(gstate.projMatrix, 0, sizeof(gstate.projMatrix));
memset(gstate.tgenMatrix, 0, sizeof(gstate.tgenMatrix));
memset(gstate.boneMatrix, 0, sizeof(gstate.boneMatrix));

savedContextVersion = 1;
}

void GPUgstate::Save(u32_le *ptr) {
Expand All @@ -105,22 +129,37 @@ void GPUgstate::Save(u32_le *ptr) {
}
}

if (Memory::IsValidAddress(getClutAddress()))
*cmds++ = loadclut;

// Seems like it actually writes commands to load the matrices and then reset the counts.
*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;

u8 *matrices = (u8 *)cmds;
memcpy(matrices, boneMatrix, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(matrices, worldMatrix, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(matrices, viewMatrix, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(matrices, projMatrix, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
if (savedContextVersion == 0) {
if (Memory::IsValidAddress(getClutAddress()))
*cmds++ = loadclut;

// Seems like it actually writes commands to load the matrices and then reset the counts.
*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;

u8 *matrices = (u8 *)cmds;
memcpy(matrices, boneMatrix, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(matrices, worldMatrix, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(matrices, viewMatrix, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(matrices, projMatrix, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
} else {
cmds = SaveMatrix(cmds, boneMatrix, ARRAY_SIZE(boneMatrix), GE_CMD_BONEMATRIXNUMBER, GE_CMD_BONEMATRIXDATA);
cmds = SaveMatrix(cmds, worldMatrix, ARRAY_SIZE(worldMatrix), GE_CMD_WORLDMATRIXNUMBER, GE_CMD_WORLDMATRIXDATA);
cmds = SaveMatrix(cmds, viewMatrix, ARRAY_SIZE(viewMatrix), GE_CMD_VIEWMATRIXNUMBER, GE_CMD_VIEWMATRIXDATA);
cmds = SaveMatrix(cmds, projMatrix, ARRAY_SIZE(projMatrix), GE_CMD_PROJMATRIXNUMBER, GE_CMD_PROJMATRIXDATA);
cmds = SaveMatrix(cmds, tgenMatrix, ARRAY_SIZE(tgenMatrix), GE_CMD_TGENMATRIXNUMBER, GE_CMD_TGENMATRIXDATA);

*cmds++ = boneMatrixNumber;
*cmds++ = worldmtxnum;
*cmds++ = viewmtxnum;
*cmds++ = projmtxnum;
*cmds++ = texmtxnum;
*cmds++ = GE_CMD_END << 24;
}
}

void GPUgstate::FastLoadBoneMatrix(u32 addr) {
Expand Down Expand Up @@ -165,27 +204,41 @@ void GPUgstate::Restore(u32_le *ptr) {
gstate_c.offsetAddr = ptr[7];

// Command values start 17 ints in.
u32_le *cmds = ptr + 17;
const u32_le *cmds = ptr + 17;
for (size_t i = 0; i < ARRAY_SIZE(contextCmdRanges); ++i) {
for (int n = contextCmdRanges[i].start; n <= contextCmdRanges[i].end; ++n) {
cmdmem[n] = *cmds++;
}
}

if (Memory::IsValidAddress(getClutAddress()))
loadclut = *cmds++;
boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;

u8 *matrices = (u8 *)cmds;
memcpy(boneMatrix, matrices, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(worldMatrix, matrices, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(viewMatrix, matrices, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
if (savedContextVersion == 0) {
if (Memory::IsValidAddress(getClutAddress()))
loadclut = *cmds++;
boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;

u8 *matrices = (u8 *)cmds;
memcpy(boneMatrix, matrices, sizeof(boneMatrix)); matrices += sizeof(boneMatrix);
memcpy(worldMatrix, matrices, sizeof(worldMatrix)); matrices += sizeof(worldMatrix);
memcpy(viewMatrix, matrices, sizeof(viewMatrix)); matrices += sizeof(viewMatrix);
memcpy(projMatrix, matrices, sizeof(projMatrix)); matrices += sizeof(projMatrix);
memcpy(tgenMatrix, matrices, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
} else {
cmds = LoadMatrix(cmds, boneMatrix, ARRAY_SIZE(boneMatrix));
cmds = LoadMatrix(cmds, worldMatrix, ARRAY_SIZE(worldMatrix));
cmds = LoadMatrix(cmds, viewMatrix, ARRAY_SIZE(viewMatrix));
cmds = LoadMatrix(cmds, projMatrix, ARRAY_SIZE(projMatrix));
cmds = LoadMatrix(cmds, tgenMatrix, ARRAY_SIZE(tgenMatrix));

boneMatrixNumber = *cmds++;
worldmtxnum = *cmds++;
viewmtxnum = *cmds++;
projmtxnum = *cmds++;
texmtxnum = *cmds++;
}
}

bool vertTypeIsSkinningEnabled(u32 vertType) {
Expand Down Expand Up @@ -217,7 +270,7 @@ void GPUStateCache::Reset() {
}

void GPUStateCache::DoState(PointerWrap &p) {
auto s = p.Section("GPUStateCache", 0, 4);
auto s = p.Section("GPUStateCache", 0, 5);
if (!s) {
// Old state, this was not versioned.
GPUStateCache_v0 old;
Expand All @@ -231,6 +284,8 @@ void GPUStateCache::DoState(PointerWrap &p) {
vertexFullAlpha = old.vertexFullAlpha;
skipDrawReason = old.skipDrawReason;
uv = old.uv;

savedContextVersion = 0;
} else {
p.Do(vertexAddr);
p.Do(indexAddr);
Expand Down Expand Up @@ -290,4 +345,9 @@ void GPUStateCache::DoState(PointerWrap &p) {
p.Do(curRTHeight);

// curRTBufferWidth, curRTBufferHeight, and cutRTOffsetX don't need to be saved.
if (s < 5) {
savedContextVersion = 0;
} else {
p.Do(savedContextVersion);
}
}
1 change: 1 addition & 0 deletions GPU/GPUState.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,7 @@ enum {
GPU_SUPPORTS_VERTEX_TEXTURE_FETCH = FLAG_BIT(11),
GPU_SUPPORTS_TEXTURE_FLOAT = FLAG_BIT(12),
GPU_SUPPORTS_16BIT_FORMATS = FLAG_BIT(13),
GPU_SUPPORTS_DEPTH_CLAMP = FLAG_BIT(14),
GPU_SUPPORTS_LARGE_VIEWPORTS = FLAG_BIT(16),
GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
GPU_SUPPORTS_VAO = FLAG_BIT(18),
Expand Down
4 changes: 2 additions & 2 deletions GPU/Software/Clipper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ void ProcessLine(VertexData& v0, VertexData& v1)
return;
}

if (mask && (gstate.clipEnable & 0x1)) {
if (mask && gstate.isClippingEnabled()) {
// discard if any vertex is outside the near clipping plane
if (mask & CLIP_NEG_Z_BIT)
return;
Expand Down Expand Up @@ -303,7 +303,7 @@ void ProcessTriangle(VertexData& v0, VertexData& v1, VertexData& v2)
mask |= CalcClipMask(v1.clippos);
mask |= CalcClipMask(v2.clippos);

if (mask && (gstate.clipEnable & 0x1)) {
if (mask && gstate.isClippingEnabled()) {
// discard if any vertex is outside the near clipping plane
if (mask & CLIP_NEG_Z_BIT)
return;
Expand Down
4 changes: 2 additions & 2 deletions GPU/Software/TransformUnit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ static inline ScreenCoords ClipToScreenInternal(const ClipCoords& coords, bool *
float y = coords.y * yScale / coords.w + yCenter;
float z = coords.z * zScale / coords.w + zCenter;

// Is this really right?
if (gstate.clipEnable & 0x1) {
// This matches hardware tests - depth is clamped when this flag is on.
if (gstate.isClippingEnabled()) {
if (z < 0.f)
z = 0.f;
if (z > 65535.f)
Expand Down
14 changes: 13 additions & 1 deletion GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,8 +456,20 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) {
}

if (gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
const double scale = DepthSliceFactor() * 65535.0;

WRITE(p, " highp float z = gl_FragCoord.z;\n");
WRITE(p, " z = (1.0/65535.0) * floor(z * 65535.0);\n");
if (gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH)) {
// We center the depth with an offset, but only its fraction matters.
// When (DepthSliceFactor() - 1) is odd, it will be 0.5, otherwise 0.
if (((int)(DepthSliceFactor() - 1.0f) & 1) == 1) {
WRITE(p, " z = (floor((z * %f) - (1.0 / 2.0)) + (1.0 / 2.0)) * (1.0 / %f);\n", scale, scale);
} else {
WRITE(p, " z = floor(z * %f) * (1.0 / %f);\n", scale, scale);
}
} else {
WRITE(p, " z = (1.0/65535.0) * floor(z * 65535.0);\n");
}
WRITE(p, " gl_FragDepth = z;\n");
}

Expand Down
3 changes: 3 additions & 0 deletions GPU/Vulkan/GPU_Vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,9 @@ void GPU_Vulkan::CheckGPUFeatures() {
if (vulkan_->GetFeaturesEnabled().wideLines) {
features |= GPU_SUPPORTS_WIDE_LINES;
}
if (vulkan_->GetFeaturesEnabled().depthClamp) {
features |= GPU_SUPPORTS_DEPTH_CLAMP;
}
if (vulkan_->GetFeaturesEnabled().dualSrcBlend) {
switch (vulkan_->GetPhysicalDeviceProperties().vendorID) {
case VULKAN_VENDOR_NVIDIA:
Expand Down
2 changes: 1 addition & 1 deletion GPU/Vulkan/PipelineManagerVulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ static VulkanPipeline *CreateVulkanPipeline(VkDevice device, VkPipelineCache pip
rs.lineWidth = lineWidth;
rs.rasterizerDiscardEnable = false;
rs.polygonMode = VK_POLYGON_MODE_FILL;
rs.depthClampEnable = false;
rs.depthClampEnable = key.depthClampEnable;

VkPipelineMultisampleStateCreateInfo ms = { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO };
ms.pSampleMask = nullptr;
Expand Down
3 changes: 3 additions & 0 deletions GPU/Vulkan/StateMappingVulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,13 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag
if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
if (gstate.isModeClear()) {
key.cullMode = VK_CULL_MODE_NONE;
// TODO: Or does it always clamp?
key.depthClampEnable = false;
} else {
// Set cull
bool wantCull = !gstate.isModeThrough() && prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled();
key.cullMode = wantCull ? (gstate.getCullMode() ? VK_CULL_MODE_FRONT_BIT : VK_CULL_MODE_BACK_BIT) : VK_CULL_MODE_NONE;
key.depthClampEnable = gstate.isClippingEnabled() && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP);
}
}

Expand Down
5 changes: 3 additions & 2 deletions GPU/Vulkan/StateMappingVulkan.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct VulkanDynamicState {
// Let's pack this tight using bitfields.
// If an enable flag is set to 0, all the data fields for that section should
// also be set to 0.
// ~54 bits.
// ~64 bits.
// Can't use enums unfortunately, they end up signed and breaking values above half their ranges.
struct VulkanPipelineRasterStateKey {
// Blend
Expand All @@ -37,6 +37,7 @@ struct VulkanPipelineRasterStateKey {
unsigned int colorWriteMask : 4;

// Depth/Stencil
unsigned int depthClampEnable : 1;
unsigned int depthTestEnable : 1;
unsigned int depthWriteEnable : 1;
unsigned int depthCompareOp : 3; // VkCompareOp
Expand All @@ -57,4 +58,4 @@ struct VulkanPipelineRasterStateKey {
size_t size = sizeof(VulkanPipelineRasterStateKey);
return memcmp(this, &other, size) < 0;
}
};
};
1 change: 1 addition & 0 deletions headless/Headless.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ int main(int argc, const char* argv[])
g_Config.bVertexDecoderJit = true;
g_Config.bBlockTransferGPU = true;
g_Config.iSplineBezierQuality = 2;
g_Config.bHighQualityDepth = true;

#ifdef _WIN32
InitSysDirectories();
Expand Down

3 comments on commit ea50561

@tausifj15
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sir android builds?

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I'll look into it.

@tausifj15
Copy link

@tausifj15 tausifj15 commented on ea50561 Dec 27, 2017 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.