Skip to content

Commit

Permalink
PPGe: Interpret invalid UTF-8 sequences better.
Browse files Browse the repository at this point in the history
This matches PSP firmware behavior per tests.
  • Loading branch information
unknownbrackets committed Mar 28, 2021
1 parent 7d08596 commit 5ef8762
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 4 deletions.
31 changes: 29 additions & 2 deletions Common/Data/Encoding/Utf8.cpp
Expand Up @@ -234,6 +234,32 @@ uint32_t u8_nextchar(const char *s, int *i)
return ch;
}

// Decodes one UTF-8 sequence starting at s[*i] and advances *i past the bytes consumed.
//
// The lead byte alone decides the sequence length (1 - 4 bytes). The bytes that follow
// are NOT checked for the 10xxxxxx continuation pattern: whatever is there contributes
// its low 6 bits. This is the logic used on the PSP. A lone continuation byte
// (0x80 - 0xBF) is returned unchanged as a 1-byte sequence.
//
// s must be NUL-terminated. If the terminator appears before the sequence is complete,
// the missing bytes contribute zero bits and *i is left ON the terminator, so the
// caller's end-of-string check still sees it and nothing past the buffer is read.
//
// Returns the decoded (possibly non-canonical) code point.
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
    uint32_t ch = static_cast<unsigned char>(s[(*i)++]);
    int sz = 1;

    // Each test strips one marker bit, so a longer lead byte cascades into the
    // checks below it and ends up with only its payload bits.
    if (ch >= 0xF0) {
        sz++;
        ch &= ~0x10;
    }
    if (ch >= 0xE0) {
        sz++;
        ch &= ~0x20;
    }
    if (ch >= 0xC0) {
        sz++;
        ch &= ~0xC0;
    }

    // Assume the following bytes are continuation bytes without validating them,
    // but never step over the NUL terminator.
    for (int j = 1; j < sz; ++j) {
        ch <<= 6;
        const unsigned char next = static_cast<unsigned char>(s[*i]);
        if (next != 0) {
            ch += next & 0x3F;
            ++(*i);
        }
    }

    return ch;
}

void u8_inc(const char *s, int *i)
{
(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
Expand Down Expand Up @@ -489,9 +515,10 @@ std::string SanitizeUTF8(const std::string &utf8string) {
// Worst case.
s.resize(utf8string.size() * 4);

// This stops at invalid start bytes.
size_t pos = 0;
while (!utf.end_or_overlong_end()) {
int c = utf.next();
while (!utf.end() && !utf.invalid()) {
int c = utf.next_unsafe();
pos += UTF8::encode(&s[pos], c);
}
s.resize(pos);
Expand Down
11 changes: 10 additions & 1 deletion Common/Data/Encoding/Utf8.h
Expand Up @@ -20,6 +20,7 @@
#include <string>

uint32_t u8_nextchar(const char *s, int *i);
uint32_t u8_nextchar_unsafe(const char *s, int *i);
int u8_wc_toutf8(char *dest, uint32_t ch);
int u8_strlen(const char *s);
void u8_inc(const char *s, int *i);
Expand All @@ -31,10 +32,18 @@ class UTF8 {
// Starts reading the string c from byte offset 0. Only the pointer is stored.
UTF8(const char *c) : c_(c), index_(0) {}
// Starts reading the string c from the given byte offset.
UTF8(const char *c, int index) : c_(c), index_(index) {}
// True when the byte at the current position is the NUL terminator.
bool end() const { return c_[index_] == 0; }
// True when the next character, as decoded by peek(), is 0.
// NOTE(review): the name suggests this also catches an overlong-encoded NUL - confirm against u8_nextchar().
bool end_or_overlong_end() const { return peek() == 0; }
// Returns true if the next character is outside BMP and Planes 1 - 16.
bool invalid() const {
unsigned char c = (unsigned char)c_[index_];
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
}
// Returns the character at the current position as decoded by u8_nextchar().
// index_ is passed by pointer so the decoder can move the position forward.
uint32_t next() {
return u8_nextchar(c_, &index_);
}
// Like next(), but decodes with u8_nextchar_unsafe(), which allows invalid
// continuation bytes: the lead byte alone determines how many bytes are consumed.
// index_ is passed by pointer so the decoder can move the position forward.
uint32_t next_unsafe() {
return u8_nextchar_unsafe(c_, &index_);
}
uint32_t peek() const {
int tempIndex = index_;
return u8_nextchar(c_, &tempIndex);
Expand Down
2 changes: 1 addition & 1 deletion Core/Util/PPGeDraw.cpp
Expand Up @@ -960,7 +960,7 @@ static std::string PPGeSanitizeText(const std::string &text) {
// the overlong null, the rest of the string is missing in the bottom left corner (save size, etc).
// It doesn't seem to be using sceCcc.
// Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E).
// TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :(
// This behavior doesn't replicate within other games, so it may be a game bug workaround.
std::string str = ReplaceAll(text, "\xC0\x80""ENTR", "");
// Then SanitizeUTF8 is needed to get rid of various other overlong encodings.
return SanitizeUTF8(str);
Expand Down

0 comments on commit 5ef8762

Please sign in to comment.