-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
TextureScalerCommon.cpp
671 lines (606 loc) · 24.7 KB
/
TextureScalerCommon.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
// Copyright (c) 2012- PPSSPP Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#if _MSC_VER == 1700
// Has to be included before TextureScaler.h, else we get those std::bind errors in VS2012..
#include "../native/base/basictypes.h"
#endif
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include "GPU/Common/TextureScalerCommon.h"
#include "Core/Config.h"
#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/MsgHandler.h"
#include "Common/CommonFuncs.h"
#include "Common/ThreadPools.h"
#include "Common/CPUDetect.h"
#include "ext/xbrz/xbrz.h"
#if _M_SSE >= 0x401
#include <smmintrin.h>
#endif
// Report the time and throughput for each larger scaling operation in the log
//#define SCALING_MEASURE_TIME
//#define DEBUG_SCALER_OUTPUT
#ifdef SCALING_MEASURE_TIME
#include "base/timeutil.h"
#endif
/////////////////////////////////////// Helper Functions (mostly math for parallelization)
namespace {
//////////////////////////////////////////////////////////////////// Various image processing
// Channel extraction for packed 32-bit pixels: R is the lowest byte, A the highest.
// The argument is parenthesized so that expressions containing low-precedence
// operators (e.g. `a | b`) expand correctly.
#define R(_col) (((_col) >>  0) & 0xFF)
#define G(_col) (((_col) >>  8) & 0xFF)
#define B(_col) (((_col) >> 16) & 0xFF)
#define A(_col) (((_col) >> 24) & 0xFF)
// Sum of the absolute per-channel differences between two packed pixels.
#define DISTANCE(_p1, _p2) ( \
	abs(static_cast<int>(static_cast<int>(R(_p1)) - R(_p2))) + \
	abs(static_cast<int>(static_cast<int>(G(_p1)) - G(_p2))) + \
	abs(static_cast<int>(static_cast<int>(B(_p1)) - B(_p2))) + \
	abs(static_cast<int>(static_cast<int>(A(_p1)) - A(_p2))) )
// Per-channel blend of two packed pixels: (_p0 * _factors[0] + _p1 * _factors[1]) / 255.
// _factors is indexable with [0] and [1]; the two weights are expected to sum to 255.
// The whole expansion is parenthesized so the macro can be used inside larger expressions.
// this is sadly much faster than an inline function with a loop, at least in VC10
#define MIX_PIXELS(_p0, _p1, _factors) ( \
	( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
	( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
	( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
	( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) )
// Edge length (in pixels) of the tiles the image loops below iterate over.
#define BLOCK_SIZE 32
// 3x3 convolution over rows [l, u) of a width x height image.
// Samples outside the image are clamped to the nearest edge pixel (Neumann boundary),
// and the absolute value of the weighted sum is stored in out.
// Each call only writes its own rows, so disjoint row ranges can run in parallel.
// The image is traversed in BLOCK_SIZE x BLOCK_SIZE tiles.
// Not optimized: separable kernels in particular could be handled much faster.
void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) {
	const int tileRows = (u - l) / BLOCK_SIZE + 1;
	const int tileCols = width / BLOCK_SIZE + 1;
	for (int tileY = 0; tileY < tileRows; ++tileY) {
		const int yFirst = l + tileY*BLOCK_SIZE;
		const int yLimit = std::min(yFirst + BLOCK_SIZE, u);
		for (int tileX = 0; tileX < tileCols; ++tileX) {
			const int xFirst = tileX*BLOCK_SIZE;
			const int xLimit = std::min(xFirst + BLOCK_SIZE, width);
			for (int y = yFirst; y < yLimit; ++y) {
				for (int x = xFirst; x < xLimit; ++x) {
					int val = 0;
					for (int ky = 0; ky < 3; ++ky) {
						// clamp the sampled row into the image
						const int yy = std::max(std::min(y + ky - 1, height - 1), 0);
						for (int kx = 0; kx < 3; ++kx) {
							// clamp the sampled column into the image
							const int xx = std::max(std::min(x + kx - 1, width - 1), 0);
							val += data[yy*width + xx] * kernel[ky][kx];
						}
					}
					out[y*width + x] = abs(val);
				}
			}
		}
	}
}
// Horizontal deposterization pass over rows [l, u) of an image that is w pixels wide.
// Deposterization smooths the banding left by low-color-depth sources (e.g. 444, 565, compressed).
// For every 8-bit channel of a pixel: if the left and right neighbours differ, one of them
// equals the center and the other is within T of the center, the channel is replaced by the
// average of the two neighbours. The first and last column are copied unchanged.
void deposterizeH(u32* data, u32* out, int w, int l, int u) {
	static const int T = 8;
	for (int y = l; y < u; ++y) {
		const u32 *srcRow = data + y*w;
		u32 *dstRow = out + y*w;
		for (int x = 0; x < w; ++x) {
			const u32 center = srcRow[x];
			if (x == 0 || x == w - 1) {
				// no neighbour on one side: pass through
				dstRow[x] = center;
				continue;
			}
			const u32 left = srcRow[x - 1];
			const u32 right = srcRow[x + 1];
			u32 merged = 0;
			for (int shift = 0; shift < 32; shift += 8) {
				const u8 lc = ((left >> shift) & 0xFF);
				const u8 cc = ((center >> shift) & 0xFF);
				const u8 rc = ((right >> shift) & 0xFF);
				const bool stepFromLeft = (lc == cc) && abs(static_cast<int>(rc) - cc) <= T;
				const bool stepFromRight = (rc == cc) && abs(static_cast<int>(lc) - cc) <= T;
				if ((lc != rc) && (stepFromLeft || stepFromRight)) {
					// small step next to a flat run: blend this component
					merged |= ((rc + lc) / 2) << shift;
				} else {
					// keep this component as it is
					merged |= cc << shift;
				}
			}
			dstRow[x] = merged;
		}
	}
}
// Vertical deposterization pass over rows [l, u) of a w x h image.
// Same per-channel rule as the horizontal pass, using the pixels above and below.
// Rows 0 and h - 1 are copied unchanged. Columns are processed in stripes of BLOCK_SIZE.
void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
	static const int T = 8;
	const int stripeCount = w / BLOCK_SIZE + 1;
	for (int stripe = 0; stripe < stripeCount; ++stripe) {
		const int xFirst = stripe*BLOCK_SIZE;
		const int xLimit = std::min(xFirst + BLOCK_SIZE, w);
		for (int y = l; y < u; ++y) {
			const bool edgeRow = (y == 0 || y == h - 1);
			for (int x = xFirst; x < xLimit; ++x) {
				const int pos = y*w + x;
				const u32 center = data[pos];
				if (edgeRow) {
					// no neighbour on one side: pass through
					out[pos] = center;
					continue;
				}
				const u32 upper = data[pos - w];
				const u32 lower = data[pos + w];
				u32 merged = 0;
				for (int shift = 0; shift < 32; shift += 8) {
					const u8 uc = ((upper >> shift) & 0xFF);
					const u8 cc = ((center >> shift) & 0xFF);
					const u8 lc = ((lower >> shift) & 0xFF);
					const bool stepFromAbove = (uc == cc) && abs(static_cast<int>(lc) - cc) <= T;
					const bool stepFromBelow = (lc == cc) && abs(static_cast<int>(uc) - cc) <= T;
					if ((uc != lc) && (stepFromAbove || stepFromBelow)) {
						// small step next to a flat run: blend this component
						merged |= ((lc + uc) / 2) << shift;
					} else {
						// keep this component as it is
						merged |= cc << shift;
					}
				}
				out[pos] = merged;
			}
		}
	}
}
// Writes, for every pixel in rows [l, u), the summed DISTANCE between that pixel and its
// eight neighbours. Larger values mean the pixel differs more from its surroundings.
// Neighbours that fall outside the image contribute a fixed penalty instead
// (1200 for a whole missing row, 400 for a single missing column entry).
// The image is traversed in BLOCK_SIZE x BLOCK_SIZE tiles.
void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) {
	const int tileRows = (u - l) / BLOCK_SIZE + 1;
	const int tileCols = width / BLOCK_SIZE + 1;
	for (int tileY = 0; tileY < tileRows; ++tileY) {
		const int yFirst = l + tileY*BLOCK_SIZE;
		const int yLimit = std::min(yFirst + BLOCK_SIZE, u);
		for (int tileX = 0; tileX < tileCols; ++tileX) {
			const int xFirst = tileX*BLOCK_SIZE;
			const int xLimit = std::min(xFirst + BLOCK_SIZE, width);
			for (int y = yFirst; y < yLimit; ++y) {
				for (int x = xFirst; x < xLimit; ++x) {
					const u32 center = data[y*width + x];
					u32 dist = 0;
					for (int yy = y - 1; yy <= y + 1; ++yy) {
						if (yy == -1 || yy == height) {
							dist += 1200; // assume distance at borders, usually makes for better result
							continue;
						}
						for (int xx = x - 1; xx <= x + 1; ++xx) {
							if (yy == y && xx == x) continue; // the pixel itself
							if (xx == -1 || xx == width) {
								dist += 400; // assume distance at borders, usually makes for better result
								continue;
							}
							dist += DISTANCE(data[yy*width + xx], center);
						}
					}
					out[y*width + x] = dist;
				}
			}
		}
	}
}
// Blends `source` into `data` in place for rows [l, u) of an image `width` pixels wide.
// The per-pixel weight of `source` is min(mask, maskmax) / maskmax; `data` keeps the rest.
// Where `source` is fully transparent the result's alpha is forced to 0 as well.
void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) {
	// rows [l, u) are contiguous, so walk them as one flat range
	const int first = l*width;
	const int limit = u*width;
	for (int pos = first; pos < limit; ++pos) {
		const u8 sourceWeight = static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax);
		u8 mixFactors[2];
		mixFactors[1] = sourceWeight;
		mixFactors[0] = 255 - sourceWeight;
		data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
		// xBRZ always does a better job with hard alpha
		if (A(source[pos]) == 0) data[pos] &= 0x00FFFFFF;
	}
}
//////////////////////////////////////////////////////////////////// Bicubic scaling
// Evaluates the Mitchell-Netravali reconstruction spline at distance x, with parameters B and C.
//   B=1   C=0   : cubic B spline (very smooth)
//   B=C=1/3     : recommended for general upscaling
//   B=0   C=1/2 : Catmull-Rom spline (sharp, ringing)
// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
// The kernel is symmetric, so only |x| is used; it is zero for |x| >= 2.
inline float mitchell(float x, float B, float C) {
	const float ax = std::fabs(x);
	if (ax >= 2.0f) return 0.0f;
	// Use |x| for every power: the odd-power terms would otherwise flip sign for negative x.
	const float ax2 = ax * ax;
	const float ax3 = ax2 * ax;
	if (ax >= 1.0f) {
		// outer lobe, 1 <= |x| < 2
		return ((-B - 6 * C) * ax3 + (6 * B + 30 * C) * ax2 + (-12 * B - 48 * C) * ax + (8 * B + 24 * C)) / 6.0f;
	}
	// inner lobe, |x| < 1
	return ((12 - 9 * B - 6 * C) * ax3 + (-18 + 12 * B + 6 * C) * ax2 + (6 - 2 * B)) / 6.0f;
}
// arrays for pre-calculating weights and sums (~20KB)
// Dimensions of bicubicWeights:
// 0: spline type, 0 = BSpline, 1 = mitchell
// 1: scaling factor minus 2 (2-5x scaling)
// 2,3: x,y position of the generated pixel within its source pixel (up to 5x5)
// 4,5: x,y offset plus 2 of the source pixel sampled from (5x5 neighbourhood)
// bicubicInvSums shares dimensions 0-3 and holds 1 / (sum of the 25 weights) for normalization.
// Both tables are filled by initBicubicWeights(); entries for positions >= factor stay zero.
float bicubicWeights[2][4][5][5][5][5];
float bicubicInvSums[2][4][5][5];
// Fills bicubicWeights / bicubicInvSums for both spline types and all factors from 2 to 5.
// For every output sub-pixel position (x, y) inside a source pixel, the weight of each of
// the 5x5 surrounding source pixels is the spline value at the Euclidean distance between
// the sub-pixel center and that source pixel's center (in source pixel units).
void initBicubicWeights() {
	float B[2] = { 1.0f, 0.334f };
	float C[2] = { 0.0f, 0.334f };
	for (int type = 0; type < 2; ++type) {
		const float splineB = B[type];
		const float splineC = C[type];
		for (int factor = 2; factor <= 5; ++factor) {
			const int slot = factor - 2;
			for (int x = 0; x < factor; ++x) {
				const float centerX = (x + 0.5f) / factor;
				for (int y = 0; y < factor; ++y) {
					const float centerY = (y + 0.5f) / factor;
					float total = 0.0f;
					for (int sx = -2; sx <= 2; ++sx) {
						const float dx = centerX - (sx + 0.5f);
						for (int sy = -2; sy <= 2; ++sy) {
							const float dy = centerY - (sy + 0.5f);
							const float dist = sqrt(dx*dx + dy*dy);
							const float weight = mitchell(dist, splineB, splineC);
							bicubicWeights[type][slot][x][y][sx + 2][sy + 2] = weight;
							total += weight;
						}
					}
					// stored inverted so the scaler can multiply instead of divide
					bicubicInvSums[type][slot][x][y] = 1.0f / total;
				}
			}
		}
	}
}
// Bicubic upscaling by the integer factor f using the precomputed spline table T
// (0 = BSpline, 1 = mitchell). Source rows [l, u) of the w x h image are scaled, i.e.
// output rows [l*f, u*f) of the (w*f)-wide result are written.
// Source coordinates outside the image are clamped to the nearest edge pixel.
// The output is traversed in BLOCK_SIZE x BLOCK_SIZE tiles.
template<int f, int T>
void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
	const int outw = w*f;
	const int tileRows = (u - l)*f / BLOCK_SIZE + 1;
	const int tileCols = w*f / BLOCK_SIZE + 1;
	for (int tileY = 0; tileY < tileRows; ++tileY) {
		const int yFirst = l*f + tileY*BLOCK_SIZE;
		const int yLimit = std::min(yFirst + BLOCK_SIZE, u*f);
		for (int tileX = 0; tileX < tileCols; ++tileX) {
			const int xFirst = tileX*BLOCK_SIZE;
			const int xLimit = std::min(xFirst + BLOCK_SIZE, w*f);
			for (int y = yFirst; y < yLimit; ++y) {
				for (int x = xFirst; x < xLimit; ++x) {
					// source pixel this output pixel falls into
					const int cx = x / f;
					const int cy = y / f;
					// weight table for this sub-pixel position
					const float (*weights)[5] = bicubicWeights[T][f - 2][x%f][y%f];
					float sumR = 0.0f, sumG = 0.0f, sumB = 0.0f, sumA = 0.0f;
					// accumulate the 5x5 supporting pixels of the original image
					for (int sx = -2; sx <= 2; ++sx) {
						for (int sy = -2; sy <= 2; ++sy) {
							const float weight = weights[sx + 2][sy + 2];
							if (weight == 0.0f) continue;
							// clamp pixel locations
							const int csy = std::max(std::min(sy + cy, h - 1), 0);
							const int csx = std::max(std::min(sx + cx, w - 1), 0);
							// sample & add weighted components
							const u32 sample = data[csy*w + csx];
							sumR += weight*R(sample);
							sumG += weight*G(sample);
							sumB += weight*B(sample);
							sumA += weight*A(sample);
						}
					}
					// normalize, round up and clamp each channel to 0..255, then pack
					const float invSum = bicubicInvSums[T][f - 2][x%f][y%f];
					const int ri = std::min(std::max(static_cast<int>(ceilf(sumR*invSum)), 0), 255);
					const int gi = std::min(std::max(static_cast<int>(ceilf(sumG*invSum)), 0), 255);
					const int bi = std::min(std::max(static_cast<int>(ceilf(sumB*invSum)), 0), 255);
					const int ai = std::min(std::max(static_cast<int>(ceilf(sumA*invSum)), 0), 255);
					out[y*outw + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
				}
			}
		}
	}
}
#if _M_SSE >= 0x401
// SSE4.1 variant of scaleBicubicT: same tiling, sampling and clamping, but all four
// channels of a pixel are accumulated together in one __m128 register.
// f is the integer scaling factor, T selects the precomputed spline table.
// Source rows [l, u) are scaled, i.e. output rows [l*f, u*f) are written.
// NOTE(review): the final float -> int conversion uses _mm_cvtps_epi32 (current SSE
// rounding mode) whereas the scalar version uses ceilf, so results can differ by one.
template<int f, int T>
void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
int outw = w*f;
for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
for (int y = l*f + yb*BLOCK_SIZE; y < l*f + (yb + 1)*BLOCK_SIZE && y < u*f; ++y) {
for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w*f; ++x) {
// four float accumulators, one per channel
__m128 result = _mm_set1_ps(0.0f);
// source pixel this output pixel falls into
int cx = x / f, cy = y / f;
// sample supporting pixels in original image
for (int sx = -2; sx <= 2; ++sx) {
for (int sy = -2; sy <= 2; ++sy) {
float weight = bicubicWeights[T][f - 2][x%f][y%f][sx + 2][sy + 2];
if (weight != 0.0f) {
// clamp pixel locations
int csy = std::max(std::min(sy + cy, h - 1), 0);
int csx = std::max(std::min(sx + cx, w - 1), 0);
// sample & add weighted components
// load the packed pixel, widen its four bytes to 32-bit ints, then convert to floats
__m128i sample = _mm_cvtsi32_si128(data[csy*w + csx]);
sample = _mm_cvtepu8_epi32(sample);
__m128 col = _mm_cvtepi32_ps(sample);
col = _mm_mul_ps(col, _mm_set1_ps(weight));
result = _mm_add_ps(result, col);
}
}
}
// generate and write result
// normalize by the inverse weight sum and convert back to 32-bit ints
__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f - 2][x%f][y%f])));
// saturating packs: 32 -> 16 bit (signed), then 16 -> 8 bit (unsigned), which clamps to 0..255
pixel = _mm_packs_epi32(pixel, pixel);
pixel = _mm_packus_epi16(pixel, pixel);
// the low 32 bits now hold the four packed channel bytes
out[y*outw + x] = _mm_cvtsi128_si32(pixel);
}
}
}
}
}
#endif
// Dispatches bicubic B-spline upscaling (spline table 0) of source rows [l, u) to the
// template instantiation for the requested factor. When compiled with SSE4.1 support
// and the CPU reports it, the SSE4.1 implementation is used instead of the scalar one.
// Factors outside 2..5 only log an error.
void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
#if _M_SSE >= 0x401
	if (cpu_info.bSSE4_1) {
		switch (factor) {
		case 2:
			scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u);
			break;
		case 3:
			scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u);
			break;
		case 4:
			scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u);
			break;
		case 5:
			scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u);
			break;
		default:
			ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
			break;
		}
		return;
	}
#endif
	// scalar fallback
	switch (factor) {
	case 2:
		scaleBicubicT<2, 0>(data, out, w, h, l, u);
		break;
	case 3:
		scaleBicubicT<3, 0>(data, out, w, h, l, u);
		break;
	case 4:
		scaleBicubicT<4, 0>(data, out, w, h, l, u);
		break;
	case 5:
		scaleBicubicT<5, 0>(data, out, w, h, l, u);
		break;
	default:
		ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
		break;
	}
}
// Dispatches bicubic Mitchell upscaling (spline table 1) of source rows [l, u) to the
// template instantiation for the requested factor. When compiled with SSE4.1 support
// and the CPU reports it, the SSE4.1 implementation is used instead of the scalar one.
// Factors outside 2..5 only log an error.
void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
#if _M_SSE >= 0x401
	if (cpu_info.bSSE4_1) {
		switch (factor) {
		case 2:
			scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u);
			break;
		case 3:
			scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u);
			break;
		case 4:
			scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u);
			break;
		case 5:
			scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u);
			break;
		default:
			ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
			break;
		}
		return;
	}
#endif
	// scalar fallback
	switch (factor) {
	case 2:
		scaleBicubicT<2, 1>(data, out, w, h, l, u);
		break;
	case 3:
		scaleBicubicT<3, 1>(data, out, w, h, l, u);
		break;
	case 4:
		scaleBicubicT<4, 1>(data, out, w, h, l, u);
		break;
	case 5:
		scaleBicubicT<5, 1>(data, out, w, h, l, u);
		break;
	default:
		ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
		break;
	}
}
//////////////////////////////////////////////////////////////////// Bilinear scaling
// Blend weights for integral bilinear upscaling, indexed as [factor - 2][i].
// Each entry is { weight of the neighbouring pixel, weight of the center pixel } out of 255,
// in the order expected by MIX_PIXELS. Entry i applies to the i-th generated pixel counted
// from the outer edge of a source pixel towards its middle; the scalers mirror the index
// for the second half. Entries a factor does not need are left as { 0, 0 }.
const static u8 BILINEAR_FACTORS[4][3][2] = {
{ { 44, 211 }, { 0, 0 }, { 0, 0 } }, // x2
{ { 64, 191 }, { 0, 255 }, { 0, 0 } }, // x3
{ { 77, 178 }, { 26, 229 }, { 0, 0 } }, // x4
{ { 102, 153 }, { 51, 204 }, { 0, 255 } }, // x5
};
// Horizontal half of integral bilinear upscaling by factor f.
// Every source pixel in rows [l, u) of the w-wide input becomes f output pixels in the
// same row of the (w*f)-wide output: the first half (plus the middle one for odd f) blends
// with the left neighbour, the rest with the right neighbour. At the row ends the missing
// neighbour is replaced by the pixel itself.
template<int f>
void bilinearHt(u32* data, u32* out, int w, int l, int u) {
	static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
	const int outw = w*f;
	const int leftCount = f / 2 + f % 2;
	for (int y = l; y < u; ++y) {
		const u32 *srcRow = data + y*w;
		u32 *dstRow = out + y*outw;
		for (int x = 0; x < w; ++x) {
			const u32 left = srcRow[x - (x == 0 ? 0 : 1)];
			const u32 center = srcRow[x];
			const u32 right = srcRow[x + (x == w - 1 ? 0 : 1)];
			u32 *dst = dstRow + x*f;
			// first half of the new pixels + center, hope the compiler unrolls this
			for (int i = 0; i < leftCount; ++i) {
				dst[i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
			}
			// second half of the new pixels (mirrored weights), hope the compiler unrolls this
			for (int i = leftCount; i < f; ++i) {
				dst[i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
			}
		}
	}
}
// Runtime dispatcher for the horizontal bilinear pass: forwards to bilinearHt<factor>
// for factors 2 to 5 and logs an error for anything else.
void bilinearH(int factor, u32* data, u32* out, int w, int l, int u) {
	if (factor == 2) {
		bilinearHt<2>(data, out, w, l, u);
	} else if (factor == 3) {
		bilinearHt<3>(data, out, w, l, u);
	} else if (factor == 4) {
		bilinearHt<4>(data, out, w, l, u);
	} else if (factor == 5) {
		bilinearHt<5>(data, out, w, l, u);
	} else {
		ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}
// Vertical half of integral bilinear upscaling by factor f.
// `data` is the horizontally scaled image (w*f pixels wide); every row in [l, u) becomes
// f output rows. gl/gu are the global lower and upper row bounds of the whole image: at
// row gl the upper neighbour, and at row gu - 1 the lower neighbour, is the row itself.
// Columns are processed in stripes of BLOCK_SIZE.
template<int f>
void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) {
	static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
	const int outw = w*f;
	const int upperCount = f / 2 + f % 2;
	const int stripeCount = outw / BLOCK_SIZE + 1;
	for (int stripe = 0; stripe < stripeCount; ++stripe) {
		const int xFirst = stripe*BLOCK_SIZE;
		const int xLimit = std::min(xFirst + BLOCK_SIZE, outw);
		for (int y = l; y < u; ++y) {
			// neighbouring rows, clamped to the global image bounds
			const u32 uy = y - (y == gl ? 0 : 1);
			const u32 ly = y + (y == gu - 1 ? 0 : 1);
			for (int x = xFirst; x < xLimit; ++x) {
				const u32 upper = data[uy * outw + x];
				const u32 center = data[y * outw + x];
				const u32 lower = data[ly * outw + x];
				// first half of the new pixels + center, hope the compiler unrolls this
				for (int i = 0; i < upperCount; ++i) {
					out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
				}
				// second half of the new pixels (mirrored weights), hope the compiler unrolls this
				for (int i = upperCount; i < f; ++i) {
					out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
				}
			}
		}
	}
}
// Runtime dispatcher for the vertical bilinear pass: forwards to bilinearVt<factor>
// for factors 2 to 5 and logs an error for anything else.
void bilinearV(int factor, u32* data, u32* out, int w, int gl, int gu, int l, int u) {
	if (factor == 2) {
		bilinearVt<2>(data, out, w, gl, gu, l, u);
	} else if (factor == 3) {
		bilinearVt<3>(data, out, w, gl, gu, l, u);
	} else if (factor == 4) {
		bilinearVt<4>(data, out, w, gl, gu, l, u);
	} else if (factor == 5) {
		bilinearVt<5>(data, out, w, gl, gu, l, u);
	} else {
		ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
	}
}
#undef BLOCK_SIZE
#undef MIX_PIXELS
#undef DISTANCE
#undef R
#undef G
#undef B
#undef A
#ifdef DEBUG_SCALER_OUTPUT
// used for debugging texture scaling (writing textures to files)
static int g_imgCount = 0;
void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
char fn[32];
snprintf(fn, 32, "%s%04d.ppm", prefix, g_imgCount++);
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n%d %d\n255\n", w, h);
for (int j = 0; j < h; ++j) {
for (int i = 0; i < w; ++i) {
static unsigned char color[3];
color[0] = pixels[(j*w + i) * 4 + 0]; /* red */
color[1] = pixels[(j*w + i) * 4 + 1]; /* green */
color[2] = pixels[(j*w + i) * 4 + 2]; /* blue */
fwrite(color, 1, 3, fp);
}
}
fclose(fp);
}
void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
char fn[32];
snprintf(fn, 32, "%s%04d.pgm", prefix, g_imgCount++);
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P5\n%d %d\n65536\n", w, h);
for (int j = 0; j < h; ++j) {
for (int i = 0; i < w; ++i) {
fwrite((pixels + (j*w + i)), 1, 2, fp);
}
}
fclose(fp);
}
#endif
}
/////////////////////////////////////// Texture Scaler
// Constructs the scaler. The file-level bicubic weight tables are (re)computed on every
// construction via initBicubicWeights().
TextureScalerCommon::TextureScalerCommon() {
initBicubicWeights();
}
// Empty body: no explicit cleanup is performed here.
TextureScalerCommon::~TextureScalerCommon() {
}
// Returns true when every pixel of the texture has the same value.
// `data` is read as 32-bit words; `pixels` is the pixel count and `fmt` determines, via
// BytesPerPixel(), how many pixels share one word. When a word holds more than one pixel,
// the two 16-bit halves of the first word must match as well.
bool TextureScalerCommon::IsEmptyOrFlat(u32* data, int pixels, int fmt) {
	const int pixelsPerWord = 4 / BytesPerPixel(fmt);
	const u32 ref = data[0];
	const bool halvesDiffer = (ref & 0x0000FFFF) != (ref >> 16);
	if (pixelsPerWord > 1 && halvesDiffer) {
		return false;
	}
	// scan until the first word that deviates from the reference
	const int wordCount = pixels / pixelsPerWord;
	int i = 0;
	while (i < wordCount && data[i] == ref) {
		++i;
	}
	return i >= wordCount;
}
// Scales `src` by `factor` into `out`, also for flat (single-color) textures.
// dstFmt, width and height are updated to describe the scaled 8888 result.
// Flat textures skip the scaler: the single color is converted to 8888 and `out` is
// simply filled with it at the scaled size.
void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
	if (!IsEmptyOrFlat(src, width*height, dstFmt)) {
		ScaleInto(out, src, dstFmt, width, height, factor);
		return;
	}

	// This means it was a flat texture. Vulkan wants the size up front, so we need to make it happen.
	u32 pixel;
	// Since it's flat, one pixel is enough. It might end up pointing to data, though.
	u32 *pixelPointer = &pixel;
	ConvertTo8888(dstFmt, src, pixelPointer, 1, 1);
	if (pixelPointer != &pixel) {
		pixel = *pixelPointer;
	}

	dstFmt = Get8888Format();
	width *= factor;
	height *= factor;

	// ABCD. If A = D, and AB = CD, then they must all be equal (B = C, etc.)
	const bool outerBytesMatch = (pixel & 0x000000FF) == (pixel >> 24);
	const bool halvesMatch = (pixel & 0x0000FFFF) == (pixel >> 16);
	if (outerBytesMatch && halvesMatch) {
		// all four bytes are the same, so a byte-wise fill is enough
		memset(out, pixel & 0xFF, width * height * sizeof(u32));
	} else {
		// Let's hope this is vectorized.
		std::fill_n(out, width * height, pixel);
	}
}
// Converts `src` to 8888, optionally deposterizes it, and scales it by `factor` into
// `outputBuf` using the algorithm selected by g_Config.iTexScalingType.
// dstFmt, width and height are updated to describe the scaled result. Always returns true;
// an unknown scaling type is only logged.
bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &width, int &height, int factor) {
#ifdef SCALING_MEASURE_TIME
	double t_start = real_time_now();
#endif

	// staging buffer for the input image in case it needs to be reformatted
	bufInput.resize(width*height);
	u32 *inputBuf = bufInput.data();

	// bring the texture into the format the scalers work on
	ConvertTo8888(dstFmt, src, inputBuf, width, height);

	// optional deposterize step; scaling then reads the deposterized copy
	if (g_Config.bTexDeposterize) {
		bufDeposter.resize(width*height);
		u32 *deposterized = bufDeposter.data();
		DePosterize(inputBuf, deposterized, width, height);
		inputBuf = deposterized;
	}

	// run the configured scaler
	switch (g_Config.iTexScalingType) {
	case XBRZ:
		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
		break;
	case BICUBIC:
		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
		break;
	case HYBRID_BICUBIC:
		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
		break;
	default:
		ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
		break;
	}

	// report the new format and dimensions back to the caller
	dstFmt = Get8888Format();
	width *= factor;
	height *= factor;

#ifdef SCALING_MEASURE_TIME
	if (width*height > 64 * 64 * factor*factor) {
		double t = real_time_now() - t_start;
		NOTICE_LOG(G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
			width*height, t, (width*height) / (t * 1000 * 1000));
	}
#endif

	return true;
}
// Scales the texture in `data` by `factor` into the internal output buffer.
// On success `data` is redirected to that buffer, dstFmt/width/height describe the scaled
// image, and true is returned. Flat or empty textures are left untouched and yield false.
bool TextureScalerCommon::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int factor) {
	// prevent processing empty or flat textures (this happens a lot in some games)
	// doesn't hurt the standard case, will be very quick for textures with actual texture
	const bool flat = IsEmptyOrFlat(data, width*height, dstFmt);
	if (flat) {
		DEBUG_LOG(G3D, "TextureScaler: early exit -- empty/flat texture");
		return false;
	}

	// room for the upscaled image
	bufOutput.resize(width*height*factor*factor);
	u32 *outputBuf = bufOutput.data();

	if (!ScaleInto(outputBuf, data, dstFmt, width, height, factor)) {
		return false;
	}
	data = outputBuf;
	return true;
}
// Upscales `source` (width x height) by `factor` into `dest` with xBRZ, using the default
// scaler configuration. The row range [0, height) is handed to the global thread pool.
void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
	using std::placeholders::_1;
	using std::placeholders::_2;
	xbrz::ScalerCfg cfg;
	// _1/_2 receive the row range assigned by the pool
	auto scaleRows = std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, _1, _2);
	GlobalThreadPool::Loop(scaleRows, 0, height);
}
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
bufTmp1.resize(width*height*factor);
u32 *tmpBuf = bufTmp1.data();
GlobalThreadPool::Loop(std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height);
GlobalThreadPool::Loop(std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height);
}
// Bicubic B-spline upscaling of `source` (width x height) by `factor` into `dest`,
// with the row range [0, height) handed to the global thread pool.
void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
	using std::placeholders::_1;
	using std::placeholders::_2;
	auto scaleRows = std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, _1, _2);
	GlobalThreadPool::Loop(scaleRows, 0, height);
}
// Bicubic Mitchell upscaling of `source` (width x height) by `factor` into `dest`,
// with the row range [0, height) handed to the global thread pool.
void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
	using std::placeholders::_1;
	using std::placeholders::_2;
	auto scaleRows = std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, _1, _2);
	GlobalThreadPool::Loop(scaleRows, 0, height);
}
// Hybrid upscaling of `source` (width x height) by `factor` into `dest`.
// `bicubic` selects bicubic B-spline instead of bilinear for the smooth image.
// The order of the steps matters because the temporary buffers are reused between them.
void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
// Basic algorithm:
// 1) determine a feature mask C based on per-pixel neighbour distance + 3x3 splatting, and upscale that mask bilinearly
// 2) generate 2 scaled images: A - using Bilinear (or bicubic B-spline) filtering, B - using xBRZ
// 3) output = A*(1-C) + B*C, i.e. strong features take the xBRZ result, smooth areas the filtered one
const static int KERNEL_SPLAT[3][3] = {
{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
};
bufTmp1.resize(width*height);
bufTmp2.resize(width*height*factor*factor);
bufTmp3.resize(width*height*factor*factor);
// distance mask goes into bufTmp1, its 3x3 box sum into bufTmp2
GlobalThreadPool::Loop(std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height);
GlobalThreadPool::Loop(std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height);
// ScaleBilinear resizes and overwrites bufTmp1, which is fine: the raw mask is no longer needed
ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
// mask C is now in bufTmp3
ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
// xBRZ upscaled source is in bufTmp2
if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
else ScaleBilinear(factor, source, dest, width, height);
// Upscaled source is in dest
// Now we can mix it all together
// The factor 8192 was found through practical testing on a variety of textures
// (it is passed to mix as maskmax: mask values at or above it select the xBRZ result entirely)
GlobalThreadPool::Loop(std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor);
}
// Deposterizes `source` (width x height) into `dest`.
// Two rounds of a horizontal pass followed by a vertical pass are applied; the horizontal
// pass writes into bufTmp3 and the vertical pass writes into `dest`. The second round
// takes the result of the first one as its input.
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
	using std::placeholders::_1;
	using std::placeholders::_2;
	bufTmp3.resize(width*height);
	u32 *scratch = bufTmp3.data();
	u32 *input = source;
	for (int round = 0; round < 2; ++round) {
		GlobalThreadPool::Loop(std::bind(&deposterizeH, input, scratch, width, _1, _2), 0, height);
		GlobalThreadPool::Loop(std::bind(&deposterizeV, scratch, dest, width, height, _1, _2), 0, height);
		// the next round refines what is already in dest
		input = dest;
	}
}