Skip to content

Commit

Permalink
misc: export WTF8 conversion utilities (#4021)
Browse files Browse the repository at this point in the history
As promised in #2970, this attempts to migrate code to a common set of
utilities in a common place in the code and use them everywhere. This
also exports the functionality, since the Windows API with
WideCharToMultiByte is fairly verbose relative to what libuv and
libuv's clients typically need, so it is useful not to require clients
to reimplement this conversion logic unnecessarily (and because Windows
is not 64-bit ready here, but this implementation is.)
  • Loading branch information
vtjnash committed Oct 29, 2023
1 parent 56fca44 commit f388908
Show file tree
Hide file tree
Showing 15 changed files with 667 additions and 999 deletions.
47 changes: 47 additions & 0 deletions docs/src/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -839,3 +839,50 @@ API
Causes the calling thread to sleep for `msec` milliseconds.
.. versionadded:: 1.34.0
String manipulation functions
-----------------------------
These string utilities are needed internally for dealing with Windows, and are
exported to allow clients to work uniformly with this data when the libuv API
is not complete.
.. c:function:: size_t uv_utf16_length_as_wtf8(const uint16_t* utf16, ssize_t utf16_len)
Get the length of a UTF-16 (or UCS-2) `utf16` value after converting it to
WTF-8. If `utf16` is NUL terminated, `utf16_len` can be set to -1,
otherwise it must be specified.
.. versionadded:: 1.47.0
.. c:function:: int uv_utf16_to_wtf8(const uint16_t* utf16, ssize_t utf16_len, char** wtf8_ptr, size_t* wtf8_len_ptr)
Convert UTF-16 (or UCS-2) data in `utf16` to WTF-8 data in `*wtf8_ptr`. The
`utf16_len` count (in characters) gives the length of `utf16`. If `utf16`
is NUL terminated, `utf16_len` can be set to -1, otherwise it must be
specified. If `wtf8_ptr` is `NULL`, no result will be computed, but the
length (equal to `uv_utf16_length_as_wtf8`) will be stored in `wtf8_ptr`.
If `*wtf8_ptr` is `NULL`, space for the conversion will be allocated and
returned in `wtf8_ptr` and the length will be returned in `wtf8_len_ptr`.
Otherwise, the length of `*wtf8_ptr` must be passed in `wtf8_len_ptr`. The
`wtf8_ptr` must contain an extra space for an extra NUL after the result.
If the result is truncated, `UV_ENOBUFS` will be returned and
`wtf8_len_ptr` will be the length of the required `wtf8_ptr` to contain the
whole result.
.. versionadded:: 1.47.0
.. c:function:: ssize_t uv_wtf8_length_as_utf16(const char* wtf8)
Get the length in characters of a NUL-terminated WTF-8 `wtf8` value
after converting it to UTF-16 (or UCS-2), including NUL terminator.
.. versionadded:: 1.47.0
.. c:function:: void uv_wtf8_to_utf16(const char* utf8, uint16_t* utf16, size_t utf16_len)
Convert NUL-terminated WTF-8 data in `wtf8` to UTF-16 (or UCS-2) data
in `utf16`. The `utf16_len` count (in characters) must include space
for the NUL terminator.
.. versionadded:: 1.47.0
12 changes: 12 additions & 0 deletions include/uv.h
Original file line number Diff line number Diff line change
Expand Up @@ -1885,6 +1885,18 @@ struct uv_loop_s {
UV_EXTERN void* uv_loop_get_data(const uv_loop_t*);
UV_EXTERN void uv_loop_set_data(uv_loop_t*, void* data);

/* String utilities needed internally for dealing with Windows. */
size_t uv_utf16_length_as_wtf8(const uint16_t* utf16,
ssize_t utf16_len);
int uv_utf16_to_wtf8(const uint16_t* utf16,
ssize_t utf16_len,
char** wtf8_ptr,
size_t* wtf8_len_ptr);
ssize_t uv_wtf8_length_as_utf16(const char* wtf8);
void uv_wtf8_to_utf16(const char* wtf8,
uint16_t* utf16,
size_t utf16_len);

/* Don't export the private CPP symbols. */
#undef UV_HANDLE_TYPE_PRIVATE
#undef UV_REQ_TYPE_PRIVATE
Expand Down
244 changes: 242 additions & 2 deletions src/idna.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
/* Copyright libuv contributors. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
Expand All @@ -18,11 +18,56 @@
*/

#include "uv.h"
#include "uv-common.h"
#include "idna.h"
#include <assert.h>
#include <string.h>
#include <limits.h> /* UINT_MAX */


static int32_t uv__wtf8_decode1(const char** input) {
uint32_t code_point;
uint8_t b1;
uint8_t b2;
uint8_t b3;
uint8_t b4;

b1 = **input;
if (b1 <= 0x7F)
return b1; /* ASCII code point */
if (b1 < 0xC2)
return -1; /* invalid: continuation byte */
code_point = b1;

b2 = *++*input;
if ((b2 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b2 & 0x3F);
if (b1 <= 0xDF)
return 0x7FF & code_point; /* two-byte character */

b3 = *++*input;
if ((b3 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b3 & 0x3F);
if (b1 <= 0xEF)
return 0xFFFF & code_point; /* three-byte character */

b4 = *++*input;
if ((b4 & 0xC0) != 0x80)
return -1; /* invalid: not a continuation byte */
code_point = (code_point << 6) | (b4 & 0x3F);
if (b1 <= 0xF4) {
code_point &= 0x1FFFFF;
if (code_point <= 0x10FFFF)
return code_point; /* four-byte character */
}

/* code point too large */
return -1;
}


static unsigned uv__utf8_decode1_slow(const char** p,
const char* pe,
unsigned a) {
Expand Down Expand Up @@ -89,6 +134,7 @@ static unsigned uv__utf8_decode1_slow(const char** p,
return a;
}


unsigned uv__utf8_decode1(const char** p, const char* pe) {
unsigned a;

Expand All @@ -102,6 +148,7 @@ unsigned uv__utf8_decode1(const char** p, const char* pe) {
return uv__utf8_decode1_slow(p, pe, a);
}


static int uv__idna_toascii_label(const char* s, const char* se,
char** d, char* de) {
static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
Expand Down Expand Up @@ -267,7 +314,8 @@ static int uv__idna_toascii_label(const char* s, const char* se,
return 0;
}

long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {

ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
const char* si;
const char* st;
unsigned c;
Expand Down Expand Up @@ -313,3 +361,195 @@ long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {

return d - ds; /* Number of bytes written. */
}


ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
size_t w_target_len = 0;
int32_t code_point;

do {
code_point = uv__wtf8_decode1(&source_ptr);
if (code_point < 0)
return -1;
if (code_point > 0xFFFF)
w_target_len++;
w_target_len++;
} while (*source_ptr++);

return w_target_len;
}


void uv_wtf8_to_utf16(const char* source_ptr,
uint16_t* w_target,
size_t w_target_len) {
int32_t code_point;

do {
code_point = uv__wtf8_decode1(&source_ptr);
/* uv_wtf8_length_as_utf16 should have been called and checked first. */
assert(code_point >= 0);
if (code_point > 0x10000) {
assert(code_point < 0x10FFFF);
*w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
*w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
w_target_len -= 2;
} else {
*w_target++ = code_point;
w_target_len -= 1;
}
} while (*source_ptr++);

assert(w_target_len == 0);
}


static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
ssize_t w_source_len) {
uint16_t u;
uint16_t next;

u = w_source_ptr[0];
if (u >= 0xD800 && u <= 0xDBFF && w_source_len != 1) {
next = w_source_ptr[1];
if (next >= 0xDC00 && next <= 0xDFFF)
return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00);
}
return u;
}


size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
ssize_t w_source_len) {
size_t target_len;
int32_t code_point;

target_len = 0;
while (w_source_len) {
code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (w_source_len < 0 && code_point == 0)
break;
if (code_point < 0x80)
target_len += 1;
else if (code_point < 0x800)
target_len += 2;
else if (code_point < 0x10000)
target_len += 3;
else {
target_len += 4;
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}

return target_len;
}


int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
ssize_t w_source_len,
char** target_ptr,
size_t* target_len_ptr) {
size_t target_len;
char* target;
char* target_end;
int32_t code_point;

/* If *target_ptr is provided, then *target_len_ptr must be its length
* (excluding space for NUL), otherwise we will compute the target_len_ptr
* length and may return a new allocation in *target_ptr if target_ptr is
* provided. */
if (target_ptr == NULL || *target_ptr == NULL) {
target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
if (target_len_ptr != NULL)
*target_len_ptr = target_len;
} else {
target_len = *target_len_ptr;
}

if (target_ptr == NULL)
return 0;

if (*target_ptr == NULL) {
target = uv__malloc(target_len + 1);
if (target == NULL) {
return UV_ENOMEM;
}
*target_ptr = target;
} else {
target = *target_ptr;
}

target_end = target + target_len;

while (target != target_end && w_source_len) {
code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
/* Can be invalid UTF-8 but must be valid WTF-8. */
assert(code_point >= 0);
if (w_source_len < 0 && code_point == 0) {
w_source_len = 0;
break;
}
if (code_point < 0x80) {
*target++ = code_point;
} else if (code_point < 0x800) {
*target++ = 0xC0 | (code_point >> 6);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
} else if (code_point < 0x10000) {
*target++ = 0xE0 | (code_point >> 12);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
} else {
*target++ = 0xF0 | (code_point >> 18);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 12) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | ((code_point >> 6) & 0x3F);
if (target == target_end)
break;
*target++ = 0x80 | (code_point & 0x3F);
/* uv__get_surrogate_value consumed 2 input characters */
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}
target_len = target - *target_ptr;
w_source_ptr++;
if (w_source_len > 0)
w_source_len--;
}

if (target != target_end && target_len_ptr != NULL)
/* Did not fill all of the provided buffer, so update the target_len_ptr
* output with the space used. */
*target_len_ptr = target - *target_ptr;

/* Check if input fit into target exactly. */
if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
w_source_len = 0;

*target++ = '\0';

/* Characters remained after filling the buffer, compute the remaining length now. */
if (w_source_len) {
if (target_len_ptr != NULL)
*target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
return UV_ENOBUFS;
}

return 0;
}
4 changes: 2 additions & 2 deletions src/idna.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
/* Copyright libuv contributors. All rights reserved.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
Expand Down Expand Up @@ -26,6 +26,6 @@ unsigned uv__utf8_decode1(const char** p, const char* pe);
* is the number of bytes written to |d|, including the trailing nul byte.
* A return value < 0 is a libuv error code. |s| and |d| can not overlap.
*/
long uv__idna_toascii(const char* s, const char* se, char* d, char* de);
ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de);

#endif /* UV_SRC_IDNA_H_ */
15 changes: 7 additions & 8 deletions src/win/dl.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,17 @@ static int uv__dlerror(uv_lib_t* lib, const char* filename, DWORD errorno);

int uv_dlopen(const char* filename, uv_lib_t* lib) {
WCHAR filename_w[32768];
ssize_t r;

lib->handle = NULL;
lib->errmsg = NULL;

if (!MultiByteToWideChar(CP_UTF8,
0,
filename,
-1,
filename_w,
ARRAY_SIZE(filename_w))) {
return uv__dlerror(lib, filename, GetLastError());
}
r = uv_wtf8_length_as_utf16(filename);
if (r < 0)
return uv__dlerror(lib, filename, ERROR_NO_UNICODE_TRANSLATION);
if ((size_t) r > ARRAY_SIZE(filename_w))
return uv__dlerror(lib, filename, ERROR_INSUFFICIENT_BUFFER);
uv_wtf8_to_utf16(filename, filename_w, r);

lib->handle = LoadLibraryExW(filename_w, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
if (lib->handle == NULL) {
Expand Down

0 comments on commit f388908

Please sign in to comment.