Skip to content

Commit

Permalink
Initial support of UTF-8 as Identifier.
Browse files Browse the repository at this point in the history
  • Loading branch information
syoyo committed Apr 5, 2024
1 parent 912d27e commit de6b545
Show file tree
Hide file tree
Showing 5 changed files with 1,696 additions and 14 deletions.
107 changes: 107 additions & 0 deletions src/str-util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Copyright 2023 - Present, Light Transport Entertainment, Inc.
#include "str-util.hh"

#include "unicode-xid.hh"
#include "common-macros.inc"

namespace tinyusdz {
Expand Down Expand Up @@ -470,6 +471,68 @@ inline std::string extract_utf8_char(const std::string &str, uint32_t start_i,
}
}

///
/// Decode the single UTF-8 encoded character that starts at `s`.
///
/// @param[in] s Pointer to the first byte of the character. The buffer must
///   be NUL-terminated (e.g. obtained from std::string::c_str()): trailing
///   bytes are validated one at a time, so decoding stops at the terminator
///   and never reads past it.
/// @param[out] char_len Byte length of the character as reported by
///   utf8_len() for the lead byte. Set to 0 when `s` is null, when the lead
///   byte is rejected by utf8_len(), or when the reported length is not 1..4.
/// @return The decoded codepoint, or ~0u when the sequence is malformed
///   (bad lead/continuation byte, truncated sequence, overlong encoding, or
///   a UTF-16 surrogate value).
///
inline uint32_t to_codepoint(const char *s, uint32_t &char_len) {
  const uint32_t kInvalid = ~0u;

  if (!s) {
    char_len = 0;
    return kInvalid;
  }

  const unsigned char lead = static_cast<unsigned char>(s[0]);

  char_len = detail::utf8_len(lead);
  if (char_len == 0) {
    return kInvalid;
  }

  if (char_len == 1) {
    // 7bit: 0xxx-xxxx
    if (lead > 0x7f) {
      return kInvalid;
    }
    return uint32_t(lead) & 0x7f;
  }

  // Per-length parameters:
  //   tag_mask/tag : bit pattern the lead byte must carry
  //   payload_mask : data bits stored in the lead byte
  //   min_code     : smallest codepoint that legitimately needs this length
  //                  (anything smaller is an overlong encoding)
  uint32_t tag_mask = 0;
  uint32_t tag = 0;
  uint32_t payload_mask = 0;
  uint32_t min_code = 0;

  if (char_len == 2) {
    // 11bit: 110y-yyyx 10xx-xxxx
    tag_mask = 0xe0;
    tag = 0xc0;
    payload_mask = 0x1f;
    min_code = 0x80;
  } else if (char_len == 3) {
    // 16bit: 1110-yyyy 10yx-xxxx 10xx-xxxx
    tag_mask = 0xf0;
    tag = 0xe0;
    payload_mask = 0x0f;
    min_code = 0x800;
  } else if (char_len == 4) {
    // 21bit: 1111-0yyy 10yy-xxxx 10xx-xxxx 10xx-xxxx
    tag_mask = 0xf8;
    tag = 0xf0;
    payload_mask = 0x07;
    min_code = 0x10000;
  } else {
    // utf8_len() reported a length that is not a valid UTF-8 sequence length.
    char_len = 0;
    return kInvalid;
  }

  if ((uint32_t(lead) & tag_mask) != tag) {
    return kInvalid;
  }

  uint32_t code = uint32_t(lead) & payload_mask;

  // Validate each continuation byte BEFORE touching the next one. A NUL
  // terminator is not a continuation byte, so a truncated sequence at the end
  // of the string is rejected without reading out of bounds.
  for (uint32_t i = 1; i < char_len; i++) {
    const unsigned char c = static_cast<unsigned char>(s[i]);
    if ((c & 0xc0) != 0x80) {
      return kInvalid;
    }
    code = (code << 6) | uint32_t(c & 0x3f);
  }

  // Overlong encoding (e.g. 0xC1 0x9F for '_') is not valid UTF-8.
  if (code < min_code) {
    return kInvalid;
  }

  // UTF-16 surrogate halves are not Unicode scalar values.
  if ((code >= 0xd800) && (code <= 0xdfff)) {
    return kInvalid;
  }

  return code;
}

} // namespace detail

std::vector<std::string> to_utf8_chars(const std::string &str) {
Expand Down Expand Up @@ -544,6 +607,7 @@ uint32_t to_utf8_code(const std::string &s) {
return code;
}


#if 0
std::string to_utf8_char(const uint32_t code) {

Expand All @@ -568,4 +632,47 @@ bool is_valid_utf8(const std::string &str) {
return true;
}

///
/// Decode a UTF-8 string into its sequence of codepoint values.
///
/// Returns an empty vector as soon as any character fails to decode (zero
/// length reported, or the decoded value exceeds kMaxUTF8Codepoint). An empty
/// input also yields an empty vector.
///
std::vector<uint32_t> to_codepoints(const std::string &str) {
  std::vector<uint32_t> result;

  const char *bytes = str.c_str();
  const size_t total = str.size();

  size_t offset = 0;
  while (offset < total) {
    uint32_t nbytes = 0;
    const uint32_t value = detail::to_codepoint(bytes + offset, nbytes);

    // Decoder signals failure through an out-of-range value and/or a zero
    // byte length; either one invalidates the whole string.
    const bool out_of_range = (value > kMaxUTF8Codepoint);
    const bool no_progress = (nbytes == 0);
    if (out_of_range || no_progress) {
      return {};
    }

    result.push_back(value);
    offset += nbytes;
  }

  return result;
}

///
/// Check whether `str` is an identifier of the form
///
///   (XID_Start | '_') (XID_Continue | '_')*
///
/// @param[in] str UTF-8 encoded string.
/// @return true when every codepoint satisfies the grammar above. false for
///   an empty string, for input that to_codepoints() could not decode, or
///   when any codepoint is not allowed at its position.
///
bool is_valid_utf8_identifier(const std::string &str) {
  // First convert to codepoint values. An empty result means the input was
  // either empty or not decodable; neither is a valid identifier.
  const std::vector<uint32_t> codepoints = to_codepoints(str);

  if (codepoints.empty()) {
    return false;
  }

  // First character: underscore OR XID_Start. Reject only when it is
  // neither (the two tests must be combined with &&, not ||).
  if ((codepoints[0] != '_') && !unicode_xid::is_xid_start(codepoints[0])) {
    return false;
  }

  // Remaining characters: underscore OR XID_Continue.
  for (size_t i = 1; i < codepoints.size(); i++) {
    if ((codepoints[i] != '_') &&
        !unicode_xid::is_xid_continue(codepoints[i])) {
      return false;
    }
  }

  return true;
}

} // namespace tinyusdz
55 changes: 41 additions & 14 deletions src/str-util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

namespace tinyusdz {

// Upper bound for a decoded codepoint value (0x10ffff). Decoded values
// greater than this are treated as invalid by to_codepoints().
constexpr size_t kMaxUTF8Codepoint = 0x10ffff;

enum class CharEncoding
{
None,
Expand Down Expand Up @@ -240,30 +242,42 @@ std::string unescapeControlSequence(const std::string &str);

std::string buildEscapedAndQuotedStringForUSDA(const std::string &str);

///
/// Determine if input UTF-8 string is Unicode Identifier
/// (UAX31 Default Identifier)
///
bool is_valid_utf8_identifier(const std::string &str);

// Equivalent of TfIsValidIdentifier in pxrUSD
// TODO: support UTF-8
inline bool isValidIdentifier(const std::string &str) {
// Supports UTF-8 identifiers (UAX31 Default Identifier; pxrUSD supports UTF-8 identifiers from 24.03).
inline bool isValidIdentifier(const std::string &str, bool is_utf8 = true) {

if (str.empty()) {
return false;
}

// first char
// [a-ZA-Z_]
if ((('a' <= str[0]) && (str[0] <= 'z')) || (('A' <= str[0]) && (str[0] <= 'Z')) || (str[0] == '_')) {
// ok
if (is_utf8) {
return is_valid_utf8_identifier(str);
} else {
return false;
}

// remain chars
// [a-ZA-Z0-9_]
for (size_t i = 1; i < str.length(); i++) {
if ((('a' <= str[i]) && (str[i] <= 'z')) || (('A' <= str[i]) && (str[i] <= 'Z')) || (('0' <= str[i]) && (str[i] <= '9')) || (str[i] == '_')) {
// legacy

// first char
// [a-ZA-Z_]
if ((('a' <= str[0]) && (str[0] <= 'z')) || (('A' <= str[0]) && (str[0] <= 'Z')) || (str[0] == '_')) {
// ok
} else {
return false;
}

// remaining chars
// [a-ZA-Z0-9_]
for (size_t i = 1; i < str.length(); i++) {
if ((('a' <= str[i]) && (str[i] <= 'z')) || (('A' <= str[i]) && (str[i] <= 'Z')) || (('0' <= str[i]) && (str[i] <= '9')) || (str[i] == '_')) {
// ok
} else {
return false;
}
}
}

return true;
Expand All @@ -272,7 +286,9 @@ inline bool isValidIdentifier(const std::string &str) {

// Equivalent of TfMakeValidIdentifier in pxrUSD
// TODO: support UTF-8
inline std::string makeIdentifierValid(const std::string &str) {
inline std::string makeIdentifierValid(const std::string &str, bool is_utf8 = true) {
(void)is_utf8;

std::string s;

if (str.empty()) {
Expand Down Expand Up @@ -312,8 +328,12 @@ inline std::string makeIdentifierValid(const std::string &str) {
bool makeUniqueName(std::multiset<std::string> &nameSet, const std::string &name, std::string *unique_name);


///
/// Determine if input string is valid UTF-8 string.
///
bool is_valid_utf8(const std::string &str);


///
/// Convert string buffer to list of UTF-8 chars.
/// Example: 'こんにちは' => ['こ', 'ん', 'に', 'ち', 'は']
Expand All @@ -326,6 +346,13 @@ std::vector<std::string> to_utf8_chars(const std::string &str);
///
uint32_t to_utf8_code(const std::string &u8char);

///
/// Convert UTF-8 string to codepoint values.
///
/// Return empty array when input is not a valid UTF-8 string.
///
std::vector<uint32_t> to_codepoints(const std::string &str);

///
/// Convert UTF-8 codepoint to UTF-8 string.
///
Expand Down
Loading

0 comments on commit de6b545

Please sign in to comment.