Permalink
Browse files

TexCache: Optimize DXT3/DXT5 decode to single pass.

This is significantly faster on Vulkan, and in other situations where
we're decoding directly to uncached memory.
  • Loading branch information...
unknownbrackets committed Sep 2, 2018
1 parent a004196 commit 8ae2b1e6fbd095707269db09e4a9b7d2f6f9b61c
Showing with 100 additions and 62 deletions.
  1. +100 −62 GPU/Common/TextureDecoder.cpp
@@ -327,106 +327,144 @@ void SetupTextureDecoder() {
#endif
}
// S3TC / DXT Decoder
// Decodes a single compressed block in two phases: the Decode* methods expand
// the block's palette(s) into the members below, then one of the WriteColors*
// methods emits the pixels using those palettes.
class DXTDecoder {
public:
	// Fills colors_ from the two endpoint colors stored in src.
	inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
	// Fills alpha_ from the two endpoint alphas stored in src.
	inline void DecodeAlphaDXT5(const DXT5Block *src);

	// Each writer emits `height` rows of 4 pixels to dst, advancing dst by
	// `pitch` u32 elements after every row.
	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height);
	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height);
	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height);

protected:
	// Four-entry color palette, written by DecodeColors().
	u32 colors_[4];
	// Eight-entry alpha palette, written by DecodeAlphaDXT5().
	u8 alpha_[8];
};
// Packs four channel values into one 32-bit pixel:
// a in bits 24-31, r in bits 16-23, g in bits 8-15, b in bits 0-7.
static inline u32 makecol(int r, int g, int b, int a) {
	const int alphaBits = a << 24;
	const int redBits = r << 16;
	const int greenBits = g << 8;
	return alphaBits | redBits | greenBits | b;
}
// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) {
// S3TC Decoder
// Needs more speed and debugging.
u16 c1 = (src->color1);
u16 c2 = (src->color2);
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
u16 c1 = src->color1;
u16 c2 = src->color2;
int red1 = Convert5To8(c1 & 0x1F);
int red2 = Convert5To8(c2 & 0x1F);
int green1 = Convert6To8((c1 >> 5) & 0x3F);
int green2 = Convert6To8((c2 >> 5) & 0x3F);
int blue1 = Convert5To8((c1 >> 11) & 0x1F);
int blue2 = Convert5To8((c2 >> 11) & 0x1F);
u32 colors[4];
colors[0] = makecol(red1, green1, blue1, 255);
colors[1] = makecol(red2, green2, blue2, 255);
colors_[0] = makecol(red1, green1, blue1, 255);
colors_[1] = makecol(red2, green2, blue2, 255);
if (c1 > c2 || ignore1bitAlpha) {
int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3);
int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3);
int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3);
int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3);
colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255);
colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255);
int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3);
colors_[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255);
colors_[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255);
} else {
// Average
int red3 = (red1 + red2 + 1) / 2;
int green3 = (green1 + green2 + 1) / 2;
int blue3 = (blue1 + blue2 + 1) / 2;
colors_[2] = makecol(red3, green3, blue3, 255);
// Color2 but transparent
colors_[3] = makecol(red2, green2, blue2, 0);
}
}
// Returns the alpha value n sevenths of the way from src->alpha1 to
// src->alpha2. The interpolation is done in float and the cast to u8 drops
// the fractional part.
static inline u8 lerp8(const DXT5Block *src, int n) {
	const float fraction = n / 7.0f;
	const auto range = src->alpha2 - src->alpha1;
	return (u8)(src->alpha1 + range * fraction);
}
// Returns the alpha value n fifths of the way from src->alpha1 to
// src->alpha2. The interpolation is done in float and the cast to u8 drops
// the fractional part.
static inline u8 lerp6(const DXT5Block *src, int n) {
	const float fraction = n / 5.0f;
	const auto range = src->alpha2 - src->alpha1;
	return (u8)(src->alpha1 + range * fraction);
}
// Builds the eight-entry alpha palette (alpha_) for one DXT5 block.
//
// Entries 0 and 1 are the two endpoints stored in the block. When
// alpha1 > alpha2 the remaining six entries are interpolated in sevenths
// (lerp8); otherwise four entries are interpolated in fifths (lerp6) and the
// last two are fixed at 0 and 255.
void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
	// TODO: Check if alpha is still not 100% correct.
	alpha_[0] = src->alpha1;
	alpha_[1] = src->alpha2;
	if (alpha_[0] > alpha_[1]) {
		// Palette entries 2..7 are interpolation steps 1..6.
		for (int step = 1; step <= 6; ++step) {
			alpha_[step + 1] = lerp8(src, step);
		}
	} else {
		// Palette entries 2..5 are interpolation steps 1..4.
		for (int step = 1; step <= 4; ++step) {
			alpha_[step + 1] = lerp6(src, step);
		}
		alpha_[6] = 0;
		alpha_[7] = 255;
	}
}
// Emits the pixels of a DXT1 block using the palette prepared by DecodeColors().
//
// dst    - first output pixel; advanced by `pitch` u32 elements per row.
// src    - block supplying one entry of `lines` per row, holding four 2-bit
//          palette indices with the first pixel in the lowest bits.
// height - number of rows to write.
void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) {
	u32 *row = dst;
	for (int line = 0; line < height; ++line, row += pitch) {
		const int indices = src->lines[line];
		for (int px = 0; px < 4; ++px) {
			// Pixel px uses the 2-bit index at bit offset 2 * px.
			row[px] = colors_[(indices >> (px * 2)) & 3];
		}
	}
}
// Emits the pixels of a DXT3 block in a single pass, combining the color
// palette prepared by DecodeColors() with the block's explicit per-pixel alpha.
//
// dst    - first output pixel; advanced by `pitch` u32 elements per row.
// src    - block supplying, per row, four 2-bit color indices
//          (src->color.lines) and four 4-bit alpha values (src->alphaLines),
//          first pixel in the lowest bits.
// height - number of rows to write.
void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		u32 alphadata = src->alphaLines[y];
		for (int x = 0; x < 4; x++) {
			// Kept as u32 so the shifts into the top byte are done in unsigned arithmetic.
			const u32 a4 = alphadata & 0xF;
			// Replicate the 4-bit alpha into both nibbles of the 8-bit alpha channel.
			dst[x] = (colors_[colordata & 3] & 0x00FFFFFF) | (a4 << 24) | (a4 << 28);
			colordata >>= 2;
			alphadata >>= 4;
		}
		dst += pitch;
	}
}
void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height)
{
DecodeDXT1Block(dst, &src->color, pitch, height, true);
void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height) {
// 48 bits, 3 bit index per pixel, 12 bits per line.
u64 alphadata = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;
for (int y = 0; y < height; y++) {
u32 line = src->alphaLines[y];
int colordata = src->color.lines[y];
for (int x = 0; x < 4; x++) {
const u8 a4 = line & 0xF;
dst[x] = (dst[x] & 0xFFFFFF) | (a4 << 24) | (a4 << 28);
line >>= 4;
dst[x] = (colors_[colordata & 3] & 0x00FFFFFF) | (alpha_[alphadata & 7] << 24);
colordata >>= 2;
alphadata >>= 3;
}
dst += pitch;
}
}
static inline u8 lerp8(const DXT5Block *src, int n) {
float d = n / 7.0f;
return (u8)(src->alpha1 + (src->alpha2 - src->alpha1) * d);
// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) {
DXTDecoder dxt;
dxt.DecodeColors(src, ignore1bitAlpha);
dxt.WriteColorsDXT1(dst, src, pitch, height);
}
static inline u8 lerp6(const DXT5Block *src, int n) {
float d = n / 5.0f;
return (u8)(src->alpha1 + (src->alpha2 - src->alpha1) * d);
void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) {
DXTDecoder dxt;
dxt.DecodeColors(&src->color, true);
dxt.WriteColorsDXT3(dst, src, pitch, height);
}
// Decodes one DXT5 block into `height` rows of 4 pixels at dst, advancing by
// `pitch` u32 elements per row. Colors are always decoded in the interpolated
// four-color mode (ignore1bitAlpha = true); color and alpha are written
// together in one pass over dst.
// The alpha channel is not 100% correct
void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.DecodeAlphaDXT5(src);
	dxt.WriteColorsDXT5(dst, src, pitch, height);
}
#ifdef _M_SSE

0 comments on commit 8ae2b1e

Please sign in to comment.