Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
More vulkan microoptimizations. Add more profiler scopes.
  • Loading branch information
hrydgard committed Aug 18, 2017
1 parent ed776d8 commit b9b2656
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 53 deletions.
2 changes: 0 additions & 2 deletions Core/MIPS/ARM64/Arm64CompLoadStore.cpp
Expand Up @@ -129,8 +129,6 @@ namespace MIPSComp {
}
}

DISABLE;

u32 iaddr = gpr.IsImm(rs) ? offset + gpr.GetImm(rs) : 0xFFFFFFFF;
std::vector<FixupBranch> skips;

Expand Down
9 changes: 7 additions & 2 deletions GPU/GPUCommon.cpp
Expand Up @@ -3,6 +3,8 @@
#include <mutex>

#include "base/timeutil.h"
#include "profiler/profiler.h"

#include "Common/ColorConv.h"
#include "Core/Reporting.h"
#include "GPU/GeDisasm.h"
Expand Down Expand Up @@ -1211,6 +1213,7 @@ void GPUCommon::Execute_BJump(u32 op, u32 diff) {
}

void GPUCommon::Execute_Call(u32 op, u32 diff) {
PROFILE_THIS_SCOPE("gpu_call");
easy_guard guard(listLock);

// Saint Seiya needs correct support for relative calls.
Expand Down Expand Up @@ -1559,6 +1562,7 @@ void GPUCommon::Execute_BoundingBox(u32 op, u32 diff) {
}

void GPUCommon::Execute_BlockTransferStart(u32 op, u32 diff) {
PROFILE_THIS_SCOPE("block");
Flush();
// and take appropriate action. This is a block transfer between RAM and VRAM, or vice versa.
// Can we skip this on SkipDraw?
Expand All @@ -1574,6 +1578,7 @@ void GPUCommon::Execute_WorldMtxNum(u32 op, u32 diff) {

// We must record the individual data commands while debugRecording_.
bool fastLoad = !debugRecording_;
// The stall address can land inside matrix data: this was observed with bone matrix loads where a partially written matrix load was overwritten after the stall, so this check is needed.

This comment has been minimized.

Copy link
@unknownbrackets

unknownbrackets Aug 20, 2017

Collaborator

Maybe, but it did happen with bone matrix loads. The case was a partially overwritten matrix load: the stall address was on matrix data, but that data would be overwritten with something else after the stall.

Example, pass 1:

NOP
MATRIX LOAD 1
MATRIX LOAD 2
MATRIX LOAD 3
(stall) garbage data

Pass 2:

MATRIX LOAD 1
MATRIX LOAD 2
MATRIX LOAD 3
(stall) MATRIX LOAD 3 (but actually garbage)

That's why.

-[Unknown]

This comment has been minimized.

Copy link
@hrydgard

hrydgard Aug 20, 2017

Author Owner

Oh, I see. Thanks, makes sense.

if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
fastLoad = false;
}
Expand Down Expand Up @@ -1786,7 +1791,7 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
}

const int numPlusCount = (op & 0x7F) + i;
for (int num = op & 0x7F; num < numPlusCount; num += 12) {
for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) {
gstate_c.Dirty(DIRTY_BONEMATRIX0 << (num / 12));
}
} else {
Expand All @@ -1798,7 +1803,7 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
}

const int numPlusCount = (op & 0x7F) + i;
for (int num = op & 0x7F; num < numPlusCount; num += 12) {
for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) {
gstate_c.deferredVertTypeDirty |= DIRTY_BONEMATRIX0 << (num / 12);
}
}
Expand Down
62 changes: 40 additions & 22 deletions GPU/Vulkan/DrawEngineVulkan.cpp
Expand Up @@ -20,6 +20,7 @@
#include "base/logging.h"
#include "base/timeutil.h"
#include "math/dataconv.h"
#include "profiler/profiler.h"

#include "Common/MemoryUtil.h"
#include "Core/MemMap.h"
Expand Down Expand Up @@ -51,7 +52,7 @@


enum {
VERTEX_CACHE_SIZE = 4096 * 1024
VERTEX_CACHE_SIZE = 8192 * 1024
};

#define VERTEXCACHE_DECIMATION_INTERVAL 17
Expand Down Expand Up @@ -275,8 +276,6 @@ void DrawEngineVulkan::BeginFrame() {
lastPipeline_ = nullptr;

FrameData *frame = &frame_[curFrame_];
vkResetDescriptorPool(vulkan_->GetDevice(), frame->descPool, 0);
frame->descSets.clear();

// First reset all buffers, then begin. This is so that Reset can free memory and Begin can allocate it,
// if growing the buffer is needed. Doing it this way will reduce fragmentation if more than one buffer
Expand Down Expand Up @@ -323,6 +322,8 @@ void DrawEngineVulkan::BeginFrame() {
vertexCache_->BeginNoReset();

if (--decimationCounter_ <= 0) {
vkResetDescriptorPool(vulkan_->GetDevice(), frame->descPool, 0);
frame->descSets.clear();
decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL;

const int threshold = gpuStats.numFlips - VAI_KILL_AGE;
Expand Down Expand Up @@ -498,7 +499,7 @@ void DrawEngineVulkan::DecodeVerts(VulkanPushBuffer *push, uint32_t *bindOffset,
}


VkDescriptorSet DrawEngineVulkan::GetDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone) {
VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone) {
DescriptorSetKey key;
key.imageView_ = imageView;
key.sampler_ = sampler;
Expand All @@ -511,7 +512,7 @@ VkDescriptorSet DrawEngineVulkan::GetDescriptorSet(VkImageView imageView, VkSamp
assert(bone != VK_NULL_HANDLE);

FrameData *frame = &frame_[curFrame_];
if (!(gstate_c.bezier || gstate_c.spline)) { // Has no cache when HW tessellation.
if (!gstate_c.bezier && !gstate_c.spline) { // Has no cache when HW tessellation.
auto iter = frame->descSets.find(key);
if (iter != frame->descSets.end()) {
return iter->second;
Expand Down Expand Up @@ -629,8 +630,9 @@ void MarkUnreliable(VertexArrayInfoVulkan *vai) {
// For now we just leave it in the pushbuffer.
}

// The inline wrapper in the header checks for numDrawCalls == 0d
// The inline wrapper in the header checks for numDrawCalls == 0
void DrawEngineVulkan::DoFlush() {
PROFILE_THIS_SCOPE("Flush");
gpuStats.numFlushes++;
// TODO: Should be enough to update this once per frame?
gpuStats.numTrackedVertexArrays = (int)vai_.size();
Expand All @@ -640,7 +642,7 @@ void DrawEngineVulkan::DoFlush() {
lastPipeline_ = nullptr;
lastCmd_ = cmd;
// Since we have a new cmdbuf, dirty our dynamic state so it gets re-set.
gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE|DIRTY_DEPTHSTENCIL_STATE);
gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE|DIRTY_DEPTHSTENCIL_STATE|DIRTY_BLEND_STATE);
}

VkRenderPass rp = (VkRenderPass)draw_->GetNativeObject(Draw::NativeObject::CURRENT_RENDERPASS);
Expand Down Expand Up @@ -671,8 +673,6 @@ void DrawEngineVulkan::DoFlush() {
uint32_t ibOffset = 0;
uint32_t vbOffset = 0;

VkRenderPass renderPass = (VkRenderPass)draw_->GetNativeObject(Draw::NativeObject::COMPATIBLE_RENDERPASS);

if (useHWTransform) {
// We don't detect clears in this path, so here we can switch framebuffers if necessary.

Expand All @@ -690,6 +690,7 @@ void DrawEngineVulkan::DoFlush() {
}

if (useCache) {
PROFILE_THIS_SCOPE("vcache");
u32 id = dcid_ ^ gstate.getUVGenMode(); // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263
auto iter = vai_.find(id);
VertexArrayInfoVulkan *vai;
Expand Down Expand Up @@ -722,6 +723,7 @@ void DrawEngineVulkan::DoFlush() {
// But if we get this far it's likely to be worth uploading the data.
case VertexArrayInfoVulkan::VAI_HASHING:
{
PROFILE_THIS_SCOPE("vcachehash");
vai->numDraws++;
if (vai->lastFrame != gpuStats.numFlips) {
vai->numFrames++;
Expand Down Expand Up @@ -856,6 +858,7 @@ void DrawEngineVulkan::DoFlush() {
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
}

PROFILE_THIS_SCOPE("updatestate");
if (textureNeedsApply) {
textureCache_->ApplyTexture();
textureCache_->GetVulkanHandles(imageView, sampler);
Expand All @@ -870,6 +873,7 @@ void DrawEngineVulkan::DoFlush() {
if (prim != lastPrim_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE)) {
ConvertStateToVulkanKey(*framebufferManager_, shaderManager_, prim, pipelineKey_, dynState_);
}
VkRenderPass renderPass = (VkRenderPass)draw_->GetNativeObject(Draw::NativeObject::COMPATIBLE_RENDERPASS);
VulkanPipeline *pipeline = pipelineManager_->GetOrCreatePipeline(pipelineLayout_, renderPass, pipelineKey_, dec_, vshader, fshader, true);
if (!pipeline) {
// Already logged, let's bail out.
Expand All @@ -885,10 +889,12 @@ void DrawEngineVulkan::DoFlush() {
lastPrim_ = prim;

dirtyUniforms_ |= shaderManager_->UpdateUniforms();

UpdateUBOs(frame);

VkDescriptorSet ds = GetDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf);
VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf);

{
PROFILE_THIS_SCOPE("vkdraw");

const uint32_t dynamicUBOOffsets[3] = {
baseUBOOffset, lightUBOOffset, boneUBOOffset,
Expand All @@ -911,7 +917,9 @@ void DrawEngineVulkan::DoFlush() {
vkCmdBindVertexBuffers(cmd, 0, 1, &vbuf, offsets);
vkCmdDraw(cmd, vertexCount, 1, 0, 0);
}
}
} else {
PROFILE_THIS_SCOPE("soft");
// Decode to "decoded"
DecodeVerts(nullptr, nullptr, nullptr);
bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE;
Expand All @@ -928,7 +936,6 @@ void DrawEngineVulkan::DoFlush() {
prim = GE_PRIM_TRIANGLES;
VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());

lastPrim_ = prim;
int numTrans = 0;
bool drawIndexed = false;
u16 *inds = decIndex;
Expand Down Expand Up @@ -963,26 +970,37 @@ void DrawEngineVulkan::DoFlush() {
sampler = nullSampler_;
}

ConvertStateToVulkanKey(*framebufferManager_, shaderManager_, prim, pipelineKey_, dynState_);
ApplyDrawStateLate(cmd, result.setStencil, result.stencilValue);

shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, useHWTransform);
VulkanPipeline *pipeline = pipelineManager_->GetOrCreatePipeline(pipelineLayout_, renderPass, pipelineKey_, dec_, vshader, fshader, false);
if (!pipeline) {
// Already logged, let's bail out.
return;
if (!lastPipeline_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE) || prim != lastPrim_) {
shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, useHWTransform);
if (prim != lastPrim_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE)) {
ConvertStateToVulkanKey(*framebufferManager_, shaderManager_, prim, pipelineKey_, dynState_);
}
VkRenderPass renderPass = (VkRenderPass)draw_->GetNativeObject(Draw::NativeObject::COMPATIBLE_RENDERPASS);
VulkanPipeline *pipeline = pipelineManager_->GetOrCreatePipeline(pipelineLayout_, renderPass, pipelineKey_, dec_, vshader, fshader, false);
if (!pipeline) {
// Already logged, let's bail out.
return;
}
if (pipeline != lastPipeline_) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); // TODO: Avoid if same as last draw.
lastPipeline_ = pipeline;
}
ApplyDrawStateLate(cmd, false, 0);
gstate_c.Clean(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
}
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); // TODO: Avoid if same as last draw.
lastPrim_ = prim;

dirtyUniforms_ |= shaderManager_->UpdateUniforms();

// Even if the first draw is through-mode, make sure we at least have one copy of these uniforms buffered
UpdateUBOs(frame);

VkDescriptorSet ds = GetDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf);
VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf);
const uint32_t dynamicUBOOffsets[3] = {
baseUBOOffset, lightUBOOffset, boneUBOOffset,
};

PROFILE_THIS_SCOPE("vkdrawsoft");
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout_, 0, 1, &ds, 3, dynamicUBOOffsets);

if (drawIndexed) {
Expand Down
3 changes: 2 additions & 1 deletion GPU/Vulkan/DrawEngineVulkan.h
Expand Up @@ -183,7 +183,7 @@ class DrawEngineVulkan : public DrawEngineCommon {
void DoFlush();
void UpdateUBOs(FrameData *frame);

VkDescriptorSet GetDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone);
VkDescriptorSet GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone);

VulkanContext *vulkan_;
Draw::DrawContext *draw_;
Expand All @@ -193,6 +193,7 @@ class DrawEngineVulkan : public DrawEngineCommon {
VkPipelineLayout pipelineLayout_;
VkCommandBuffer lastCmd_ = VK_NULL_HANDLE;
VulkanPipeline *lastPipeline_;
VkDescriptorSet lastDs_ = VK_NULL_HANDLE;

std::unordered_map<u32, VertexArrayInfoVulkan *> vai_;
VulkanPushBuffer *vertexCache_;
Expand Down
2 changes: 2 additions & 0 deletions GPU/Vulkan/GPU_Vulkan.cpp
Expand Up @@ -459,6 +459,8 @@ void GPU_Vulkan::Execute_Prim(u32 op, u32 diff) {
// This drives all drawing. All other state we just buffer up, then we apply it only
// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.

PROFILE_THIS_SCOPE("execprim");

u32 data = op & 0xFFFFFF;
u32 count = data & 0xFFFF;
if (count == 0)
Expand Down
6 changes: 5 additions & 1 deletion GPU/Vulkan/PipelineManagerVulkan.cpp
@@ -1,5 +1,7 @@
#include <cstring>

#include "profiler/profiler.h"

#include "Common/Log.h"
#include "Common/StringUtils.h"
#include "Common/Vulkan/VulkanContext.h"
Expand Down Expand Up @@ -307,7 +309,9 @@ VulkanPipeline *PipelineManagerVulkan::GetOrCreatePipeline(VkPipelineLayout layo
if (iter != pipelines_.end()) {
return iter->second;
}


PROFILE_THIS_SCOPE("pipelinebuild");

VulkanPipeline *pipeline = CreateVulkanPipeline(
vulkan_->GetDevice(), pipelineCache_, layout, renderPass,
rasterKey, vtxDec, vs, fs, useHwTransform);
Expand Down
16 changes: 3 additions & 13 deletions GPU/Vulkan/ShaderManagerVulkan.cpp
Expand Up @@ -25,6 +25,7 @@
#include "math/lin/matrix4x4.h"
#include "math/math_util.h"
#include "math/dataconv.h"
#include "profiler/profiler.h"
#include "util/text/utf8.h"
#include "Common/Vulkan/VulkanContext.h"
#include "Common/Vulkan/VulkanMemory.h"
Expand All @@ -42,6 +43,7 @@

VulkanFragmentShader::VulkanFragmentShader(VulkanContext *vulkan, ShaderID id, const char *code, bool useHWTransform)
: vulkan_(vulkan), id_(id), failed_(false), useHWTransform_(useHWTransform), module_(0) {
PROFILE_THIS_SCOPE("shadercomp");
source_ = code;

std::string errorMessage;
Expand Down Expand Up @@ -99,6 +101,7 @@ std::string VulkanFragmentShader::GetShaderString(DebugShaderStringType type) co

VulkanVertexShader::VulkanVertexShader(VulkanContext *vulkan, ShaderID id, const char *code, int vertType, bool useHWTransform, bool usesLighting)
: vulkan_(vulkan), id_(id), failed_(false), useHWTransform_(useHWTransform), module_(VK_NULL_HANDLE), usesLighting_(usesLighting) {
PROFILE_THIS_SCOPE("shadercomp");
source_ = code;
std::string errorMessage;
std::vector<uint32_t> spirv;
Expand Down Expand Up @@ -168,19 +171,6 @@ ShaderManagerVulkan::~ShaderManagerVulkan() {
delete[] codeBuffer_;
}

// Pushes the contents of ub_base into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
uint32_t ShaderManagerVulkan::PushBaseBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_base, sizeof(ub_base), uboAlignment_, buf);
	return pushedAt;
}

// Pushes the contents of ub_lights into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
uint32_t ShaderManagerVulkan::PushLightBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_lights, sizeof(ub_lights), uboAlignment_, buf);
	return pushedAt;
}

// Pushes the contents of ub_bones into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
// TODO: Only push half the bone buffer if we only have four bones.
uint32_t ShaderManagerVulkan::PushBoneBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_bones, sizeof(ub_bones), uboAlignment_, buf);
	return pushedAt;
}

void ShaderManagerVulkan::DeviceRestore(VulkanContext *vulkan) {
vulkan_ = vulkan;
uboAlignment_ = vulkan_->GetPhysicalDeviceProperties().limits.minUniformBufferOffsetAlignment;
Expand Down
14 changes: 11 additions & 3 deletions GPU/Vulkan/ShaderManagerVulkan.h
Expand Up @@ -21,6 +21,7 @@

#include "base/basictypes.h"
#include "Globals.h"
#include "Common/Vulkan/VulkanMemory.h"
#include "GPU/Common/ShaderCommon.h"
#include "GPU/Common/ShaderId.h"
#include "GPU/Vulkan/VertexShaderGeneratorVulkan.h"
Expand Down Expand Up @@ -113,9 +114,16 @@ class ShaderManagerVulkan : public ShaderManagerCommon {
bool IsLightDirty() { return true; }
bool IsBoneDirty() { return true; }

uint32_t PushBaseBuffer(VulkanPushBuffer *dest, VkBuffer *buf);
uint32_t PushLightBuffer(VulkanPushBuffer *dest, VkBuffer *buf);
uint32_t PushBoneBuffer(VulkanPushBuffer *dest, VkBuffer *buf);
// Pushes the contents of ub_base into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
uint32_t PushBaseBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_base, sizeof(ub_base), uboAlignment_, buf);
	return pushedAt;
}
// Pushes the contents of ub_lights into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
uint32_t PushLightBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_lights, sizeof(ub_lights), uboAlignment_, buf);
	return pushedAt;
}
// Pushes the contents of ub_bones into `dest` through PushAligned, using
// uboAlignment_ as the alignment, and returns whatever PushAligned returns.
// `buf` is forwarded untouched to PushAligned.
// NOTE(review): presumably the result is the offset of the pushed data and
// `buf` receives the backing VkBuffer - confirm against VulkanPushBuffer.
// TODO: Only push half the bone buffer if we only have four bones.
uint32_t PushBoneBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
	const auto pushedAt = dest->PushAligned(&ub_bones, sizeof(ub_bones), uboAlignment_, buf);
	return pushedAt;
}

private:
void Clear();
Expand Down

0 comments on commit b9b2656

Please sign in to comment.