Skip to content
Permalink
Browse files

More spline/bezier optimization. Enable real splines. Add option "Low…

… quality spline/bezier curves".
  • Loading branch information...
hrydgard committed Sep 24, 2013
1 parent 216ccbb commit 2b66a850be6f30bcaa7746d70c9f6d2123fa021a
Showing with 88 additions and 43 deletions.
  1. +2 −0 Core/Config.cpp
  2. +1 −0 Core/Config.h
  3. +67 −42 GPU/GLES/Spline.cpp
  4. +14 −0 GPU/GLES/TransformPipeline.cpp
  5. +3 −0 GPU/GLES/TransformPipeline.h
  6. +1 −1 UI/GameSettingsScreen.cpp
@@ -177,6 +177,7 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename)
graphics->Get("VSyncInterval", &bVSync, false);
graphics->Get("DisableStencilTest", &bDisableStencilTest, false);
graphics->Get("AlwaysDepthWrite", &bAlwaysDepthWrite, false);
graphics->Get("LowQualitySplineBezier", &bLowQualitySplineBezier, false);

IniFile::Section *sound = iniFile.GetOrCreateSection("Sound");
sound->Get("Enable", &bEnableSound, true);
@@ -336,6 +337,7 @@ void Config::Save() {
graphics->Set("VSyncInterval", bVSync);
graphics->Set("DisableStencilTest", bDisableStencilTest);
graphics->Set("AlwaysDepthWrite", bAlwaysDepthWrite);
graphics->Set("LowQualitySplineBezier", bLowQualitySplineBezier);

IniFile::Section *sound = iniFile.GetOrCreateSection("Sound");
sound->Set("Enable", bEnableSound);
@@ -94,6 +94,7 @@ struct Config {
bool bReloadCheats;
bool bDisableStencilTest;
bool bAlwaysDepthWrite;
bool bLowQualitySplineBezier;

// Sound
bool bEnableSound;
@@ -16,13 +16,10 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "TransformPipeline.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "GPU/Math3D.h"

// Splines are too slow, need optimization.
const bool realTesselationBezier = true;
const bool realTesselationSpline = false;

// Here's how to evaluate them fast:
// http://and-what-happened.blogspot.se/2012/07/evaluating-b-splines-aka-basis-splines.html

@@ -267,15 +264,16 @@ struct SplinePatch {
}*/
};


static void CopyTriangle(u8 *&dest, SimpleVertex *v1, SimpleVertex *v2, SimpleVertex* v3) {
// Writes the four corner vertices of one quad to the output stream.
//
// Each vertex is copied verbatim (sizeof(SimpleVertex) bytes) in the order
// v1, v2, v3, v4, and `dest` is advanced past every vertex written, so on
// return it points just after the fourth vertex.
static void CopyQuad(u8 *&dest, const SimpleVertex *v1, const SimpleVertex *v2, const SimpleVertex* v3, const SimpleVertex *v4) {
	const int stride = sizeof(SimpleVertex);
	const SimpleVertex *const corners[4] = { v1, v2, v3, v4 };
	for (int corner = 0; corner < 4; ++corner) {
		memcpy(dest, corners[corner], stride);
		dest += stride;
	}
}

#undef b2
@@ -301,8 +299,7 @@ Vec3f Bernstein3DDerivative(const Vec3f p0, const Vec3f p1, const Vec3f p2, cons
return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
}

// A little faster, but not optimal
void spline_n_4(int i, float t, int *knot, float *splineVal) {
void spline_n_4(int i, float t, float *knot, float *splineVal) {
knot += i + 1;

float t0 = (t - knot[0]);
@@ -325,10 +322,10 @@ void spline_n_4(int i, float t, int *knot, float *splineVal) {
splineVal[3] = d*f32;
}

// knot should be an array of int, sized n + 5
void spline_knot(int n, int type, int *knot) {
memset(knot, 0, sizeof(int) * (n + 5));
for(int i = 0; i < n - 1; ++i)
// knot should be an array sized n + 5 (n + 1 + 1 + degree (cubic))
void spline_knot(int n, int type, float *knot) {
memset(knot, 0, sizeof(float) * (n + 5));
for (int i = 0; i < n - 1; ++i)
knot[i + 3] = i;

if ((type & 1) == 0) {
@@ -350,7 +347,7 @@ void spline_knot(int n, int type, int *knot) {
void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32 origVertType) {
const float third = 1.0f / 3.0f;

if (!realTesselationSpline) {
if (g_Config.bLowQualitySplineBezier) {
// Fast and easy way - just draw the control points, generate some very basic normal vector substitutes.
// Very inaccurate but okay for Loco Roco. Maybe should keep it as an option because it's fast.

@@ -395,8 +392,7 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
v3.nrm = norm;
}

CopyTriangle(dest, &v0, &v2, &v1);
CopyTriangle(dest, &v1, &v2, &v3);
CopyQuad(dest, &v0, &v1, &v2, &v3);
count += 6;
}
}
@@ -408,8 +404,8 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
int n = spatch.count_u - 1;
int m = spatch.count_v - 1;

int *knot_u = new int[n + 5];
int *knot_v = new int[m + 5];
float *knot_u = new float[n + 5];
float *knot_v = new float[m + 5];
spline_knot(n, spatch.type_u, knot_u);
spline_knot(m, spatch.type_v, knot_v);

@@ -424,8 +420,8 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
if (patch_div_t == 0) patch_div_t = 1;

// TODO: Remove this cap when spline_s has been optimized.
if (patch_div_s > 20) patch_div_s = 20;
if (patch_div_t > 20) patch_div_t = 20;
if (patch_div_s > 64) patch_div_s = 64;
if (patch_div_t > 64) patch_div_t = 64;

// First compute all the vertices and put them in an array
SimpleVertex *vertices = new SimpleVertex[(patch_div_s + 1) * (patch_div_t + 1)];
@@ -458,10 +454,6 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
}

// Collect influences from surrounding control points.
// Should be possible to limit to a smaller range than looping through the entire patch...
// Also, it should be possible to do something similar to what we do in bezier where we only
// evaluate the spline 5 times instead of n * m, taking advantage of the fundamentally linear
// nature of this stuff to separate into horizontal and vertical passes.
float u_weights[4];
float v_weights[4];

@@ -504,6 +496,26 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
delete [] knot_u;
delete [] knot_v;

// Hacky normal generation through central difference.
if (gstate.isLightingEnabled() && (origVertType & GE_VTYPE_NRM_MASK) == 0) {
for (int v = 0; v < patch_div_t + 1; v++) {
for (int u = 0; u < patch_div_s + 1; u++) {
int l = std::max(0, u - 1);
int t = std::max(0, v - 1);
int r = std::min(patch_div_s, u + 1);
int b = std::min(patch_div_t, v + 1);

const Vec3f &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
const Vec3f &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;

vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
if (gstate.patchfacing & 1) {
vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f;
}
}
}
}

// Tesselate. TODO: Use indices so we only need to emit 4 vertices per pair of triangles instead of six.
for (int tile_v = 0; tile_v < patch_div_t; ++tile_v) {
for (int tile_u = 0; tile_u < patch_div_s; ++tile_u) {
@@ -515,8 +527,7 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
SimpleVertex *v2 = &vertices[(tile_v + 1) * (patch_div_s + 1) + tile_u];
SimpleVertex *v3 = &vertices[(tile_v + 1) * (patch_div_s + 1) + tile_u + 1];

CopyTriangle(dest, v0, v2, v1);
CopyTriangle(dest, v1, v2, v3);
CopyQuad(dest, v0, v1, v2, v3);
count += 6;
}
}
@@ -528,7 +539,7 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
void TesselateBezierPatch(u8 *&dest, int &count, const BezierPatch &patch, u32 origVertType) {
const float third = 1.0f / 3.0f;

if (!realTesselationBezier) {
if (g_Config.bLowQualitySplineBezier) {
// Fast and easy way - just draw the control points, generate some very basic normal vector substitutes.
// Very inaccurate though but okay for Loco Roco. Maybe should keep it as an option.

@@ -571,8 +582,7 @@ void TesselateBezierPatch(u8 *&dest, int &count, const BezierPatch &patch, u32 o
v3.nrm = norm;
}

CopyTriangle(dest, &v0, &v2, &v1);
CopyTriangle(dest, &v1, &v2, &v3);
CopyQuad(dest, &v0, &v1, &v2, &v3);
count += 6;
}
}
@@ -586,7 +596,22 @@ void TesselateBezierPatch(u8 *&dest, int &count, const BezierPatch &patch, u32 o
// First compute all the vertices and put them in an array
SimpleVertex *vertices = new SimpleVertex[(tess_u + 1) * (tess_v + 1)];

Vec3f *horiz = new Vec3f[(tess_u + 1) * 4];
Vec3f *horiz2 = horiz + (tess_u + 1) * 1;
Vec3f *horiz3 = horiz + (tess_u + 1) * 2;
Vec3f *horiz4 = horiz + (tess_u + 1) * 3;

// Precompute the horizontal curves so we only have to evaluate the vertical ones.
for (int i = 0; i < tess_u + 1; i++) {
float u = ((float)i / (float)tess_u);
horiz[i] = Bernstein3D(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, u);
horiz2[i] = Bernstein3D(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, u);
horiz3[i] = Bernstein3D(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, u);
horiz4[i] = Bernstein3D(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, u);
}

bool computeNormals = gstate.isLightingEnabled();

for (int tile_v = 0; tile_v < tess_v + 1; ++tile_v) {
for (int tile_u = 0; tile_u < tess_u + 1; ++tile_u) {
float u = ((float)tile_u / (float)tess_u);
@@ -595,10 +620,10 @@ void TesselateBezierPatch(u8 *&dest, int &count, const BezierPatch &patch, u32 o
float bv = v;

// TODO: Should be able to precompute the four curves per U, then just Bernstein per V. Will benefit large tesselation factors.
Vec3f pos1 = Bernstein3D(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, bu);
Vec3f pos2 = Bernstein3D(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, bu);
Vec3f pos3 = Bernstein3D(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, bu);
Vec3f pos4 = Bernstein3D(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, bu);
const Vec3f &pos1 = horiz[tile_u];
const Vec3f &pos2 = horiz2[tile_u];
const Vec3f &pos3 = horiz3[tile_u];
const Vec3f &pos4 = horiz4[tile_u];

SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u];

@@ -636,20 +661,20 @@ void TesselateBezierPatch(u8 *&dest, int &count, const BezierPatch &patch, u32 o
}
}
}
delete [] horiz;

// Tesselate. TODO: Use indices so we only need to emit 4 vertices per pair of triangles instead of six.
for (int tile_v = 0; tile_v < tess_v; ++tile_v) {
for (int tile_u = 0; tile_u < tess_u; ++tile_u) {
float u = ((float)tile_u / (float)tess_u);
float v = ((float)tile_v / (float)tess_v);

SimpleVertex *v0 = &vertices[tile_v * (tess_u + 1) + tile_u];
SimpleVertex *v1 = &vertices[tile_v * (tess_u + 1) + tile_u + 1];
SimpleVertex *v2 = &vertices[(tile_v + 1) * (tess_u + 1) + tile_u];
SimpleVertex *v3 = &vertices[(tile_v + 1) * (tess_u + 1) + tile_u + 1];
const SimpleVertex *v0 = &vertices[tile_v * (tess_u + 1) + tile_u];
const SimpleVertex *v1 = &vertices[tile_v * (tess_u + 1) + tile_u + 1];
const SimpleVertex *v2 = &vertices[(tile_v + 1) * (tess_u + 1) + tile_u];
const SimpleVertex *v3 = &vertices[(tile_v + 1) * (tess_u + 1) + tile_u + 1];

CopyTriangle(dest, v0, v2, v1);
CopyTriangle(dest, v1, v2, v3);
CopyQuad(dest, v0, v1, v2, v3);
count += 6;
}
}
@@ -716,9 +741,9 @@ void TransformDrawEngine::SubmitSpline(void* control_points, void* indices, int

delete[] points;

u32 vertTypeWithoutIndex = vertType & ~GE_VTYPE_IDX_MASK;
u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;

SubmitPrim(decoded2, 0, GE_PRIM_TRIANGLES, count, vertTypeWithoutIndex, GE_VTYPE_IDX_NONE, 0);
SubmitPrim(decoded2, quadIndices_, GE_PRIM_TRIANGLES, count, vertTypeWithIndex16, -1, 0);
Flush();
}

@@ -783,8 +808,8 @@ void TransformDrawEngine::SubmitBezier(void* control_points, void* indices, int
}
delete[] patches;

u32 vertTypeWithoutIndex = vertType & ~GE_VTYPE_IDX_MASK;
u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;

SubmitPrim(decoded2, 0, GE_PRIM_TRIANGLES, count, vertTypeWithoutIndex, GE_VTYPE_IDX_NONE, 0);
SubmitPrim(decoded2, quadIndices_, GE_PRIM_TRIANGLES, count, vertTypeWithIndex16, -1, 0);
Flush();
}
@@ -56,6 +56,7 @@ enum {
TRANSFORMED_VERTEX_BUFFER_SIZE = 65536 * sizeof(TransformedVertex)
};

#define QUAD_INDICES_MAX 32768

#define VERTEXCACHE_DECIMATION_INTERVAL 17

@@ -82,6 +83,17 @@ TransformDrawEngine::TransformDrawEngine()
decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE);
transformed = (TransformedVertex *)AllocateMemoryPages(TRANSFORMED_VERTEX_BUFFER_SIZE);
transformedExpanded = (TransformedVertex *)AllocateMemoryPages(3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
quadIndices_ = new u16[6 * QUAD_INDICES_MAX];

for (int i = 0; i < QUAD_INDICES_MAX; i++) {
quadIndices_[i * 6 + 0] = i * 4;
quadIndices_[i * 6 + 1] = i * 4 + 2;
quadIndices_[i * 6 + 2] = i * 4 + 1;
quadIndices_[i * 6 + 3] = i * 4 + 1;
quadIndices_[i * 6 + 4] = i * 4 + 2;
quadIndices_[i * 6 + 5] = i * 4 + 3;
}

if (g_Config.bPrescaleUV) {
uvScale = new UVScale[MAX_DEFERRED_DRAW_CALLS];
}
@@ -98,6 +110,8 @@ TransformDrawEngine::~TransformDrawEngine() {
FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
FreeMemoryPages(transformed, TRANSFORMED_VERTEX_BUFFER_SIZE);
FreeMemoryPages(transformedExpanded, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
delete [] quadIndices_;

unregister_gl_resource_holder(this);
for (auto iter = decoderMap_.begin(); iter != decoderMap_.end(); iter++) {
delete iter->second;
@@ -179,6 +179,9 @@ class TransformDrawEngine : public GfxResourceHolder {

std::map<u32, VertexArrayInfo *> vai_;

// Fixed index buffer for easy quad generation from spline/bezier
u16 *quadIndices_;

// Vertex buffer objects
// Element buffer objects
enum { NUM_VBOS = 128 };
@@ -115,10 +115,10 @@ void GameSettingsScreen::CreateViews() {
static const char *customSpeed[] = {"Unlimited", "25%", "50%", "75%", "100%", "125%", "150%", "200%", "300%"};
graphicsSettings->Add(new PopupMultiChoice(&iAlternateSpeedPercent_, gs->T("Alternative Speed"), customSpeed, 0, ARRAY_SIZE(customSpeed), gs, screenManager()));


graphicsSettings->Add(new ItemHeader(gs->T("Features")));
graphicsSettings->Add(new CheckBox(&g_Config.bHardwareTransform, gs->T("Hardware Transform")));
graphicsSettings->Add(new CheckBox(&g_Config.bVertexCache, gs->T("Vertex Cache")));
graphicsSettings->Add(new CheckBox(&g_Config.bLowQualitySplineBezier, gs->T("Low quality spline/bezier curves (speed)")));

static const char *internalResolutions[] = {"Auto (1:1)", "1x PSP", "2x PSP", "3x PSP", "4x PSP", "5x PSP" };
graphicsSettings->Add(new PopupMultiChoice(&g_Config.iInternalResolution, gs->T("Rendering Resolution"), internalResolutions, 0, ARRAY_SIZE(internalResolutions), gs, screenManager()))->OnClick.Handle(this, &GameSettingsScreen::OnResolutionChange);

12 comments on commit 2b66a85

@raven02

This comment has been minimized.

Copy link
Contributor

replied Sep 24, 2013

Does lower quality mean it is rendered more jagged, or is the difference just speed-wise?

@hrydgard

This comment has been minimized.

Copy link
Owner Author

replied Sep 24, 2013

Low quality is the old method where we only drew the control points, high quality is the "proper" method which fixed the sky in GEB.

In most games that use splines it's nowhere near as drastic as this, but an example:

HQ:
high quality

LQ:
low quality

@i30817

This comment has been minimized.

Copy link

replied Sep 24, 2013

I thought the problem with Jeanne D'Arc not showing characters was splines. Apparently not.

@hrydgard

This comment has been minimized.

Copy link
Owner Author

replied Sep 24, 2013

Well, Jeanne D'Arc uses beziers to draw flat stuff, which is a bit stupid. The missing characters are probably something else entirely.

@raven02

This comment has been minimized.

Copy link
Contributor

replied Sep 24, 2013

Hmm, the 'High Quality' one really is high quality compared to the 'Low Quality' one - that is a big difference.

@hrydgard

This comment has been minimized.

Copy link
Owner Author

replied Sep 24, 2013

Heh, yeah, we could also make intermediate levels that only tesselate half the amount of polygons or something, but I'm not sure it's worth the trouble. Loco Roco looks pretty good with low quality as it is.

Very few games really use the spline functionality to the fullest, the game in the screenshots above is one of those few.

@Line524

This comment has been minimized.

Copy link

replied Sep 24, 2013

YEAH !!! Locos now looks fine !)
HQ is optimized enough to run fullspeed even on low end PC like Dual Core 2GHz with Ati HD3200...
So there is no reason to make additional intermediate level of tesselation or low quality option. Current HQ really rocks ! :)

@raven02

This comment has been minimized.

Copy link
Contributor

replied Sep 24, 2013

LQ would be a good option for mobile platforms.

@hrydgard

This comment has been minimized.

Copy link
Owner Author

replied Sep 24, 2013

@raven02, yeah, that's why I made it an option. Maybe should default it to low quality on mobile - but it really depends on the game if it's too slow or not...

@Line524

This comment has been minimized.

Copy link

replied Sep 24, 2013

LocoRoco looks good only with high quality - the locos look softer and the rubber levels look like real rubber levels, not the stone chunks like before )))
Patapon also looks better (wind effects)

@ufdada

This comment has been minimized.

Copy link
Contributor

replied Sep 24, 2013

Great Job so far, everything in the pursuit force series looks good with hq on, but the performance is horrible (as expected) :( Sometimes even with LQ bezier/splines (which causes major graphical error and glitches too). This was tested on a Quadcore AMD Phenom II 940 with a Geforce GTX 670.

@hrydgard

This comment has been minimized.

Copy link
Owner Author

replied Sep 25, 2013

Well there's plenty of room for making this stuff faster, and not just by a little. It will take a lot of work though as this is really complicated.

Please sign in to comment.
You can’t perform that action at this time.