Skip to content

Commit

Permalink
Merge pull request #18314 from hrydgard/read-write-vector-opt
Browse files Browse the repository at this point in the history
Interpreter: Optimize ReadVector/WriteVector by removing voffset lookups
  • Loading branch information
hrydgard committed Oct 5, 2023
2 parents ba1688b + 0d06af8 commit 0cd02ab
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Core/MIPS/MIPS.cpp
Expand Up @@ -122,7 +122,7 @@ MIPSState::MIPSState() {
// * 4x4 Matrices are contiguous in RAM, making them, too, fast-loadable in NEON

// Disadvantages:
// * Extra indirection, can be confusing and slower (interpreter only)
// * Extra indirection, can be confusing and slower (interpreter only, however we can often skip the table by rearranging formulas)
// * Flushing and reloading row registers is now slower

int i = 0;
Expand Down
48 changes: 27 additions & 21 deletions Core/MIPS/MIPSVFPUUtils.cpp
Expand Up @@ -164,6 +164,7 @@ void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {
}
}


void ReadVector(float *rd, VectorSize size, int reg) {
int row;
int length;
Expand All @@ -175,16 +176,17 @@ void ReadVector(float *rd, VectorSize size, int reg) {
default: length = 0; break;
}
int transpose = (reg >> 5) & 1;
const int mtx = reg & (7 << 2);
const int mtx = ((reg << 2) & 0x70);
const int col = reg & 3;
// NOTE: We now skip the voffset lookups.
if (transpose) {
const int base = mtx + col * 32;
const int base = mtx + col;
for (int i = 0; i < length; i++)
rd[i] = V(base + ((row+i)&3));
rd[i] = currentMIPS->v[base + ((row + i) & 3) * 4];
} else {
const int base = mtx + col;
const int base = mtx + col * 4;
for (int i = 0; i < length; i++)
rd[i] = V(base + ((row+i)&3)*32);
rd[i] = currentMIPS->v[base + ((row + i) & 3)];
}
}

Expand All @@ -200,28 +202,32 @@ void WriteVector(const float *rd, VectorSize size, int reg) {
default: length = 0; break;
}

const int mtx = reg & (7 << 2);
const int mtx = ((reg << 2) & 0x70);
const int col = reg & 3;
bool transpose = (reg >> 5) & 1;
if (currentMIPS->VfpuWriteMask() == 0) {
if (transpose) {
const int base = mtx + col * 32;
// NOTE: We now skip the voffset lookups.
if (transpose) {
const int base = mtx + col;
if (currentMIPS->VfpuWriteMask() == 0) {
for (int i = 0; i < length; i++)
V(base + ((row+i)&3)) = rd[i];
currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
} else {
const int base = mtx + col;
for (int i = 0; i < length; i++)
V(base + ((row+i)&3)*32) = rd[i];
for (int i = 0; i < length; i++) {
if (!currentMIPS->VfpuWriteMask(i)) {
currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
}
}
}
} else {
for (int i = 0; i < length; i++) {
if (!currentMIPS->VfpuWriteMask(i)) {
int index = mtx;
if (transpose)
index += ((row+i)&3) + col*32;
else
index += col + ((row+i)&3)*32;
V(index) = rd[i];
const int base = mtx + col * 4;
if (currentMIPS->VfpuWriteMask() == 0) {
for (int i = 0; i < length; i++)
currentMIPS->v[base + ((row + i) & 3)] = rd[i];
} else {
for (int i = 0; i < length; i++) {
if (!currentMIPS->VfpuWriteMask(i)) {
currentMIPS->v[base + ((row + i) & 3)] = rd[i];
}
}
}
}
Expand Down

0 comments on commit 0cd02ab

Please sign in to comment.