Skip to content

Commit

Permalink
Merge pull request #18682 from hrydgard/string-optimizations
Browse files Browse the repository at this point in the history
More string_view optimizations
  • Loading branch information
hrydgard authored Jan 12, 2024
2 parents 971edc6 + 5dddfa3 commit 83999b8
Show file tree
Hide file tree
Showing 13 changed files with 89 additions and 128 deletions.
121 changes: 32 additions & 89 deletions Common/Data/Encoding/Utf8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,35 +206,22 @@ int u8_charnum(const char *s, int offset)
return charnum;
}

/* Returns the number of characters in s: starting at byte offset 0, calls
   u8_nextchar repeatedly and counts each result until one decodes to 0. */
int u8_strlen(const char *s)
{
int count = 0;
int i = 0;

/* i is advanced by u8_nextchar; a decoded value of 0 ends the count and is not counted. */
while (u8_nextchar(s, &i) != 0)
count++;

return count;
}

/* Reads the next UTF-8 sequence out of a string, updating an index.
   s      - byte buffer to decode from.
   index  - in: byte offset of the sequence start; out: offset just past the bytes consumed.
   size   - (three-argument form) byte length of s; continuation-byte scanning stops there.
   Returns the accumulated value minus offsetsFromUTF8[bytes consumed - 1]. */
uint32_t u8_nextchar(const char *s, int *index) {
uint32_t u8_nextchar(const char *s, int *index, size_t size) {
uint32_t ch = 0;
int sz = 0;
int i = *index;
// do-while: the byte at s[*index] is always read before any terminator/bound check,
// so the caller must guarantee *index is a readable position.
do {
ch = (ch << 6) + (unsigned char)s[i++];
sz++;
// Two-argument form: keep going while the next byte is a non-NUL continuation byte (10xxxxxx).
} while (s[i] && ((s[i]) & 0xC0) == 0x80);
// Three-argument form: same continuation test, bounded by size instead of a NUL.
// NOTE(review): i is int and size is size_t, so this is a signed/unsigned comparison.
} while (i < size && ((s[i]) & 0xC0) == 0x80);
*index = i;
// NOTE(review): sz is not capped here and offsetsFromUTF8 is defined outside this view;
// confirm the table covers every sz reachable on malformed input (long runs of continuation bytes).
return ch - offsetsFromUTF8[sz - 1];
}

uint32_t u8_nextchar_unsafe(const char *s, int *i) {
uint32_t ch = (unsigned char)s[(*i)++];
int sz = 1;

if (ch >= 0xF0) {
sz++;
ch &= ~0x10;
Expand All @@ -253,7 +240,6 @@ uint32_t u8_nextchar_unsafe(const char *s, int *i) {
ch <<= 6;
ch += ((unsigned char)s[(*i)++]) & 0x3F;
}

return ch;
}

Expand Down Expand Up @@ -367,48 +353,6 @@ int u8_unescape(char *buf, int sz, char *src)
return c;
}

// Searches the NUL-terminated UTF-8 string s for the code point ch.
// Returns a pointer to the first byte of the first matching sequence, or NULL if
// the end of the string is reached without a match.
// *charn is reset to 0 and incremented once per non-matching character, so on
// success it holds the character index of the match and on failure the number
// of characters scanned.
const char *u8_strchr(const char *s, uint32_t ch, int *charn)
{
// i: byte offset after the character just decoded; lasti: byte offset where it began.
int i = 0, lasti=0;
uint32_t c;

*charn = 0;
while (s[i]) {
c = u8_nextchar(s, &i);
if (c == ch) {
return &s[lasti];
}
lasti = i;
(*charn)++;
}
return NULL;
}

// Length-bounded counterpart of u8_strchr: searches the first sz bytes of s for
// the code point ch, decoding inline instead of calling u8_nextchar.
// Returns a pointer to the first byte of the first matching sequence, or NULL.
// *charn is reset to 0 and incremented once per non-matching character.
const char *u8_memchr(const char *s, uint32_t ch, size_t sz, int *charn)
{
// i: byte offset after the character just decoded; lasti: byte offset where it began.
size_t i = 0, lasti=0;
uint32_t c;
// Number of bytes consumed for the current character.
int csz;

*charn = 0;
while (i < sz) {
c = csz = 0;
// Accumulate bytes until the buffer ends or isutf() accepts the next byte.
// isutf is defined outside this view; presumably it is true for a byte that
// starts a new sequence - confirm against its definition.
do {
c <<= 6;
c += (unsigned char)s[i++];
csz++;
} while (i < sz && !isutf(s[i]));
// Remove the accumulated lead/continuation marker bits for a csz-byte sequence.
c -= offsetsFromUTF8[csz-1];

if (c == ch) {
return &s[lasti];
}
lasti = i;
(*charn)++;
}
return NULL;
}

int u8_is_locale_utf8(const char *locale)
{
/* this code based on libutf8 */
Expand All @@ -428,18 +372,20 @@ int u8_is_locale_utf8(const char *locale)
return 0;
}

bool AnyEmojiInString(const char *s, size_t byteCount) {
// Out-of-line UTF8 constructor for NUL-terminated input: the size is measured
// with strlen and decoding starts at byte 0.
// NOTE(review): the class declares its members as c_, index_, size_, so they are
// initialized in that order regardless of the order written here. Harmless since
// the initializers are independent, but compilers may warn about the reordering.
UTF8::UTF8(const char *c) : c_(c), size_((int)strlen(c)), index_(0) {}

// Returns true as soon as a code point decoded from the first byteCount bytes
// satisfies CodepointIsProbablyEmoji; returns false if none does.
// NOTE(review): the outer loop is bounded by byteCount while the decoder's
// continuation scan is bounded by str.size(). Callers presumably pass
// byteCount <= str.size() - confirm, since u8_nextchar reads the first byte of
// each sequence without a bounds check.
bool AnyEmojiInString(std::string_view str, size_t byteCount) {
int i = 0;
// NOTE(review): i is int and byteCount is size_t (signed/unsigned comparison).
while (i < byteCount) {
uint32_t c = u8_nextchar(s, &i);
uint32_t c = u8_nextchar(str.data(), &i, str.size());
if (CodepointIsProbablyEmoji(c)) {
return true;
}
}
return false;
}

int UTF8StringNonASCIICount(const char *utf8string) {
int UTF8StringNonASCIICount(std::string_view utf8string) {
UTF8 utf(utf8string);
int count = 0;
while (!utf.end()) {
Expand All @@ -450,7 +396,7 @@ int UTF8StringNonASCIICount(const char *utf8string) {
return count;
}

// Returns true when UTF8StringNonASCIICount reports at least one hit for utf8string.
// The full count is computed first; there is no early exit on the first hit.
bool UTF8StringHasNonASCII(const char *utf8string) {
bool UTF8StringHasNonASCII(std::string_view utf8string) {
return UTF8StringNonASCIICount(utf8string) > 0;
}

Expand Down Expand Up @@ -478,25 +424,21 @@ std::string ConvertWStringToUTF8(const std::wstring &wstr) {
return s;
}

// Converts UTF-8 text into the caller-provided wide-character buffer using
// MultiByteToWideChar(CP_UTF8).
//   dest     - output buffer.
//   destSize - capacity of dest, passed to the API as a count of wide characters.
//   source   - UTF-8 input.
// Pattern in every variant below: a first call with a NULL/0 output queries the
// required length, and a second call converts with the output count clamped to
// the buffer.
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const std::string &source) {
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source) {
int len = (int)source.size();
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, NULL, 0);
MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, dest, std::min((int)destSize, size));
}

void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const char *source) {
int len = (int)strlen(source) + 1; // include trailing zero
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source, len, NULL, 0);
MultiByteToWideChar(CP_UTF8, 0, source, len, dest, std::min((int)destSize, size));
// string_view variant: len excludes any terminator, so one slot is reserved
// and the terminator is written by hand at the end.
// NOTE(review): destSize is size_t; if a caller passes 0 this wraps around.
destSize -= 1; // account for the \0.
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
MultiByteToWideChar(CP_UTF8, 0, source.data(), len, dest, std::min((int)destSize, size));
// NOTE(review): the index is the required length `size`, not the clamped count
// passed to the call above. When the text needs more units than the buffer
// holds, dest[size] lies past the end of dest. The Win32 docs also say the
// conversion fails when the output count is too small - confirm the intended
// truncation behavior.
dest[size] = 0;
}

// Converts UTF-8 text to a std::wstring using MultiByteToWideChar(CP_UTF8).
// Returns an empty string when the length query reports 0.
std::wstring ConvertUTF8ToWString(const std::string &source) {
std::wstring ConvertUTF8ToWString(const std::string_view source) {
int len = (int)source.size();
// First pass: NULL output with count 0 asks only for the required length in wide characters.
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, NULL, 0);
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
std::wstring str;
str.resize(size);
// Second pass is skipped for size == 0, so &str[0] is only taken on a non-empty string.
if (size > 0) {
MultiByteToWideChar(CP_UTF8, 0, source.c_str(), len, &str[0], size);
// NOTE(review): this call passes source.size() (size_t) where the length query
// passed the int `len`; same value, but the narrowing is implicit here.
MultiByteToWideChar(CP_UTF8, 0, source.data(), source.size(), &str[0], size);
}
return str;
}
Expand All @@ -517,8 +459,8 @@ std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
return s;
}

std::string SanitizeUTF8(const std::string &utf8string) {
UTF8 utf(utf8string.c_str());
std::string SanitizeUTF8(std::string_view utf8string) {
UTF8 utf(utf8string);
std::string s;
// Worst case.
s.resize(utf8string.size() * 4);
Expand All @@ -533,11 +475,11 @@ std::string SanitizeUTF8(const std::string &utf8string) {
return s;
}

static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, const std::string &source) {
static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, std::string_view source) {
const char16_t *const orig = dest;
const char16_t *const destEnd = dest + destSize;

UTF8 utf(source.c_str());
UTF8 utf(source);

char16_t *destw = (char16_t *)dest;
const char16_t *const destwEnd = destw + destSize;
Expand All @@ -550,7 +492,7 @@ static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, const s
destw += UTF16LE::encodeUCS2(destw, c);
}

// No ++ to not count the terminal in length.
// No ++ to not count the null-terminator in length.
if (destw < destEnd) {
*destw = 0;
}
Expand All @@ -562,11 +504,11 @@ void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, const std::string &sourc
ConvertUTF8ToUCS2Internal(dest, destSize, source);
}

// Converts UTF-8 text to a std::u16string: allocates a worst-case buffer, lets
// ConvertUTF8ToUCS2Internal fill it, then shrinks the string to the length
// that call returns.
std::u16string ConvertUTF8ToUCS2(const std::string &source) {
std::u16string ConvertUTF8ToUCS2(std::string_view source) {
std::u16string dst;
// A UTF-8 string never has fewer bytes than characters, so source.size() units
// is the worst case. This variant adds one more unit for a terminator.
dst.resize(source.size() + 1, 0);
size_t realLen = ConvertUTF8ToUCS2Internal(&dst[0], source.size() + 1, source);
// A UTF-8 string never has fewer bytes than characters, so source.size() units
// is the worst case. No extra terminator unit: ConvertUTF8ToUCS2Internal only
// writes one when there is room left (its `destw < destEnd` check).
dst.resize(source.size(), 0);
size_t realLen = ConvertUTF8ToUCS2Internal(&dst[0], source.size(), source);
dst.resize(realLen);
return dst;
}
Expand Down Expand Up @@ -595,11 +537,11 @@ std::string ConvertWStringToUTF8(const std::wstring &wstr) {
return s;
}

static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, const std::string &source) {
static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, std::string_view source) {
const wchar_t *const orig = dest;
const wchar_t *const destEnd = dest + destSize;

UTF8 utf(source.c_str());
UTF8 utf(source);

if (sizeof(wchar_t) == 2) {
char16_t *destw = (char16_t *)dest;
Expand Down Expand Up @@ -628,12 +570,13 @@ static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, const
return dest - orig;
}

// Non-Windows conversion of UTF-8 text to a std::wstring: allocates a
// worst-case buffer, lets ConvertUTF8ToWStringInternal fill it, then shrinks
// the string to the length that call returns.
std::wstring ConvertUTF8ToWString(const std::string &source) {
std::wstring ConvertUTF8ToWString(std::string_view source) {
std::wstring dst;
// A UTF-8 string never has fewer bytes than characters, so source.size() units
// is the worst case. This variant adds one more unit for a terminator.
dst.resize(source.size() + 1, 0);
size_t realLen = ConvertUTF8ToWStringInternal(&dst[0], source.size() + 1, source);
// A UTF-8 string never has fewer bytes than characters, so source.size() units
// is the worst case; no extra terminator unit is reserved here.
dst.resize(source.size(), 0);
size_t realLen = ConvertUTF8ToWStringInternal(&dst[0], source.size(), source);
dst.resize(realLen);
// NOTE(review): after resize(realLen), index realLen equals dst.size(), which is
// the string's own terminator position. Storing 0 there looks redundant with
// what std::wstring already maintains - confirm it can be dropped.
dst[realLen] = 0;
return dst;
}

Expand Down
39 changes: 20 additions & 19 deletions Common/Data/Encoding/Utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@

#include <cstdint>
#include <string>
#include <string_view>

uint32_t u8_nextchar(const char *s, int *i);
uint32_t u8_nextchar(const char *s, int *i, size_t size);
uint32_t u8_nextchar_unsafe(const char *s, int *i);
int u8_wc_toutf8(char *dest, uint32_t ch);
int u8_strlen(const char *s);
void u8_inc(const char *s, int *i);
void u8_dec(const char *s, int *i);

Expand All @@ -33,29 +33,31 @@ inline bool CodepointIsProbablyEmoji(uint32_t c) {
return c > 0xFFFF;
}

bool AnyEmojiInString(const char *s, size_t byteCount);
bool AnyEmojiInString(std::string_view str, size_t byteCount);

class UTF8 {
public:
static const uint32_t INVALID = (uint32_t)-1;
UTF8(const char *c) : c_(c), index_(0) {}
UTF8(const char *c, int index) : c_(c), index_(index) {}
bool end() const { return c_[index_] == 0; }
// TODO: Try to get rid of this constructor.
explicit UTF8(const char *c);
explicit UTF8(std::string_view view) : c_(view.data()), size_((int)view.size()), index_(0) {}
explicit UTF8(std::string_view view, int index) : c_(view.data()), size_((int)view.size()), index_(index) {}
bool end() const { return index_ == size_; }
// Returns true if the next character is outside BMP and Planes 1 - 16.
bool invalid() const {
unsigned char c = (unsigned char)c_[index_];
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
}
uint32_t next() {
return u8_nextchar(c_, &index_);
return u8_nextchar(c_, &index_, size_);
}
// Allow invalid continuation bytes.
uint32_t next_unsafe() {
return u8_nextchar_unsafe(c_, &index_);
}
uint32_t peek() const {
int tempIndex = index_;
return u8_nextchar(c_, &tempIndex);
return u8_nextchar(c_, &tempIndex, size_);
}
void fwd() {
u8_inc(c_, &index_);
Expand All @@ -64,7 +66,7 @@ class UTF8 {
u8_dec(c_, &index_);
}
int length() const {
return u8_strlen(c_);
return size_;
}
int byteIndex() const {
return index_;
Expand All @@ -88,16 +90,16 @@ class UTF8 {
private:
const char *c_;
int index_;
int size_;
};

int UTF8StringNonASCIICount(const char *utf8string);
int UTF8StringNonASCIICount(std::string_view utf8string);

bool UTF8StringHasNonASCII(const char *utf8string);
bool UTF8StringHasNonASCII(std::string_view utf8string);


// Removes overlong encodings and similar.
std::string SanitizeUTF8(const std::string &utf8string);

std::string SanitizeUTF8(std::string_view utf8string);
std::string CodepointToUTF8(uint32_t codePoint);


Expand All @@ -107,20 +109,19 @@ std::string CodepointToUTF8(uint32_t codePoint);

std::string ConvertWStringToUTF8(const std::wstring &wstr);
std::string ConvertWStringToUTF8(const wchar_t *wstr);
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const std::string &source);
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, const char *source);
std::wstring ConvertUTF8ToWString(const std::string &source);
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source);
std::wstring ConvertUTF8ToWString(std::string_view source);

#else

// Used by SymbolMap/assembler
std::wstring ConvertUTF8ToWString(const std::string &source);
std::wstring ConvertUTF8ToWString(std::string_view source);
std::string ConvertWStringToUTF8(const std::wstring &wstr);

#endif

std::string ConvertUCS2ToUTF8(const std::u16string &wstr);

// Dest size in units, not bytes.
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, const std::string &source);
std::u16string ConvertUTF8ToUCS2(const std::string &source);
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, std::string_view source);
std::u16string ConvertUTF8ToUCS2(std::string_view source);
20 changes: 15 additions & 5 deletions Common/Data/Format/IniFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,24 +215,34 @@ const ParsedIniLine *Section::GetLine(const char* key) const {
}

// Stores a 32-bit unsigned value under key as text: "0x" followed by eight
// zero-padded lowercase hex digits, forwarded to the string-valued Set.
void Section::Set(const char* key, uint32_t newValue) {
Set(key, StringFromFormat("0x%08x", newValue).c_str());
// Stack buffer variant: the formatted text is 10 characters, well inside 128.
char temp[128];
snprintf(temp, sizeof(temp), "0x%08x", newValue);
// The cast presumably pins overload resolution to Set(const char*, const char*) - confirm.
Set(key, (const char *)temp);
}

// Stores a 64-bit unsigned value under key as text: "0x" followed by sixteen
// zero-padded lowercase hex digits, forwarded to the string-valued Set.
void Section::Set(const char* key, uint64_t newValue) {
Set(key, StringFromFormat("0x%016" PRIx64, newValue).c_str());
// Stack buffer variant: the formatted text is 18 characters, well inside 128.
char temp[128];
snprintf(temp, sizeof(temp), "0x%016" PRIx64, newValue);
// The cast presumably pins overload resolution to Set(const char*, const char*) - confirm.
Set(key, (const char *)temp);
}

// Stores a float under key as "%f" text (fixed notation, six decimals),
// forwarded to the string-valued Set. Debug builds assert that the value is
// neither NaN nor infinite.
void Section::Set(const char* key, float newValue) {
_dbg_assert_(!my_isnanorinf(newValue));
Set(key, StringFromFormat("%f", newValue).c_str());
// Stack buffer variant: the largest finite float prints as 39 integer digits
// plus ".000000", so 128 bytes is enough.
char temp[128];
snprintf(temp, sizeof(temp), "%f", newValue);
// The cast presumably pins overload resolution to Set(const char*, const char*) - confirm.
Set(key, (const char *)temp);
}

// Stores a double under key as "%f" text (fixed notation, six decimals),
// forwarded to the string-valued Set. Unlike the float overload there is no
// NaN/infinity assertion here.
void Section::Set(const char* key, double newValue) {
Set(key, StringFromFormat("%f", newValue).c_str());
// NOTE(review): "%f" on a large-magnitude double needs far more than 128
// characters (values near DBL_MAX print over 300 digits). snprintf then
// truncates to sizeof(temp) - 1 characters and its return value is ignored, so
// a shortened number would be stored - confirm such values cannot reach this
// overload, or size the buffer / choose the format accordingly.
char temp[128];
snprintf(temp, sizeof(temp), "%f", newValue);
// The cast presumably pins overload resolution to Set(const char*, const char*) - confirm.
Set(key, (const char *)temp);
}

// Stores an int under key as text, forwarded to the string-valued Set.
void Section::Set(const char* key, int newValue) {
Set(key, StringFromInt(newValue).c_str());
// Stack buffer variant: "%d" formatting of an int fits easily in 128 bytes.
char temp[128];
snprintf(temp, sizeof(temp), "%d", newValue);
// The cast presumably pins overload resolution to Set(const char*, const char*) - confirm.
Set(key, (const char *)temp);
}

void Section::Set(const char* key, const char* newValue) {
Expand Down
Loading

0 comments on commit 83999b8

Please sign in to comment.