misc: export WTF8 conversion utilities (#4021)

As promised in #2970, this attempts to migrate code to a common set of utilities in a common place in the code and use them everywhere. This also exports the functionality, since the Windows API with WideCharToMultiByte is fairly verbose relative to what libuv and libuv's clients typically need, so it is useful not to require clients to reimplement this conversion logic unnecessarily (and because Windows is not 64-bit ready here, but this implementation is.)
libuv · Oct 29, 2023 · f388908 · f388908
1 parent 56fca44
commit f388908
Show file tree

Hide file tree

Showing 15 changed files with 667 additions and 999 deletions.
diff --git a/docs/src/misc.rst b/docs/src/misc.rst
@@ -839,3 +839,50 @@ API
     Causes the calling thread to sleep for `msec` milliseconds.
 
     .. versionadded:: 1.34.0
+
+String manipulation functions
+-----------------------------
+
+These string utilities are needed internally for dealing with Windows, and are
+exported to allow clients to work uniformly with this data when the libuv API
+is not complete.
+
+.. c:function:: size_t uv_utf16_length_as_wtf8(const uint16_t* utf16, ssize_t utf16_len)
+
+    Get the length of a UTF-16 (or UCS-2) `utf16` value after converting it to
+    WTF-8. If `utf16` is NUL terminated, `utf16_len` can be set to -1,
+    otherwise it must be specified.
+
+    .. versionadded:: 1.47.0
+
+.. c:function:: int uv_utf16_to_wtf8(const uint16_t* utf16, ssize_t utf16_len, char** wtf8_ptr, size_t* wtf8_len_ptr)
+
+    Convert UTF-16 (or UCS-2) data in `utf16` to WTF-8 data in `*wtf8_ptr`. The
+    `utf16_len` count (in characters) gives the length of `utf16`. If `utf16`
+    is NUL terminated, `utf16_len` can be set to -1, otherwise it must be
+    specified. If `wtf8_ptr` is `NULL`, no result will be computed, but the
+    length (equal to `uv_utf16_length_as_wtf8`) will be stored in
+    `wtf8_len_ptr`.
+    If `*wtf8_ptr` is `NULL`, space for the conversion will be allocated and
+    returned in `wtf8_ptr` and the length will be returned in `wtf8_len_ptr`.
+    Otherwise, the length of `*wtf8_ptr` must be passed in `wtf8_len_ptr`. The
+    `wtf8_ptr` must contain an extra space for an extra NUL after the result.
+    If the result is truncated, `UV_ENOBUFS` will be returned and
+    `wtf8_len_ptr` will be the length of the required `wtf8_ptr` to contain the
+    whole result.
+
+    .. versionadded:: 1.47.0
+
+.. c:function:: ssize_t uv_wtf8_length_as_utf16(const char* wtf8)
+
+    Get the length in characters of a NUL-terminated WTF-8 `wtf8` value
+    after converting it to UTF-16 (or UCS-2), including NUL terminator.
+
+    .. versionadded:: 1.47.0
+
+.. c:function:: void uv_wtf8_to_utf16(const char* wtf8, uint16_t* utf16, size_t utf16_len)
+
+    Convert NUL-terminated WTF-8 data in `wtf8` to UTF-16 (or UCS-2) data
+    in `utf16`. The `utf16_len` count (in characters) must include space
+    for the NUL terminator.
+
+    .. versionadded:: 1.47.0
diff --git a/include/uv.h b/include/uv.h
@@ -1885,6 +1885,18 @@ struct uv_loop_s {
 UV_EXTERN void* uv_loop_get_data(const uv_loop_t*);
 UV_EXTERN void uv_loop_set_data(uv_loop_t*, void* data);
 
+/* String utilities needed internally for dealing with Windows. They are part
+ * of the public API, so they are declared with UV_EXTERN like the other
+ * exported functions declared next to them. Lengths are counted in 16-bit
+ * units for UTF-16 and in bytes for WTF-8; a negative `utf16_len` means the
+ * UTF-16 input is NUL terminated. */
+UV_EXTERN size_t uv_utf16_length_as_wtf8(const uint16_t* utf16,
+                                         ssize_t utf16_len);
+UV_EXTERN int uv_utf16_to_wtf8(const uint16_t* utf16,
+                               ssize_t utf16_len,
+                               char** wtf8_ptr,
+                               size_t* wtf8_len_ptr);
+UV_EXTERN ssize_t uv_wtf8_length_as_utf16(const char* wtf8);
+UV_EXTERN void uv_wtf8_to_utf16(const char* wtf8,
+                                uint16_t* utf16,
+                                size_t utf16_len);
+
 /* Don't export the private CPP symbols. */
 #undef UV_HANDLE_TYPE_PRIVATE
 #undef UV_REQ_TYPE_PRIVATE

diff --git a/src/idna.c b/src/idna.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
+/* Copyright libuv contributors. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -18,11 +18,56 @@
  */
 
 #include "uv.h"
+#include "uv-common.h"
 #include "idna.h"
 #include <assert.h>
 #include <string.h>
 #include <limits.h> /* UINT_MAX */
 
+
+/* Decode one WTF-8 encoded code point starting at |*input|.
+ *
+ * Returns the code point, or -1 when the bytes do not form a sequence this
+ * decoder accepts. |*input| is advanced once per continuation byte examined,
+ * so after a successful decode it points at the LAST byte of the sequence
+ * (for ASCII it is not moved at all); callers step past that byte themselves.
+ */
+static int32_t uv__wtf8_decode1(const char** input) {
+  uint32_t acc;
+  uint8_t lead;
+  uint8_t cont;
+  int extra;
+  int i;
+
+  lead = (uint8_t) **input;
+  if (lead < 0x80)
+    return lead; /* plain ASCII */
+  if (lead < 0xC2)
+    return -1; /* stray continuation byte (or 0xC0/0xC1) in lead position */
+
+  /* Number of continuation bytes to read, chosen by the lead byte. */
+  if (lead <= 0xDF)
+    extra = 1;
+  else if (lead <= 0xEF)
+    extra = 2;
+  else
+    extra = 3;
+
+  acc = lead;
+  for (i = 0; i < extra; i++) {
+    *input += 1;
+    cont = (uint8_t) **input;
+    if ((cont & 0xC0) != 0x80)
+      return -1; /* expected a 10xxxxxx continuation byte */
+    acc = (acc << 6) | (cont & 0x3F);
+  }
+
+  /* Strip the lead byte's marker bits that were shifted in along the way. */
+  switch (extra) {
+  case 1:
+    return acc & 0x7FF;
+  case 2:
+    return acc & 0xFFFF;
+  default:
+    break;
+  }
+
+  if (lead <= 0xF4) {
+    acc &= 0x1FFFFF;
+    if (acc <= 0x10FFFF)
+      return acc;
+  }
+
+  /* Lead byte above 0xF4 or value past U+10FFFF. */
+  return -1;
+}
+
+
 static unsigned uv__utf8_decode1_slow(const char** p,
                                       const char* pe,
                                       unsigned a) {
@@ -89,6 +134,7 @@ static unsigned uv__utf8_decode1_slow(const char** p,
   return a;
 }
 
+
 unsigned uv__utf8_decode1(const char** p, const char* pe) {
   unsigned a;
 
@@ -102,6 +148,7 @@ unsigned uv__utf8_decode1(const char** p, const char* pe) {
   return uv__utf8_decode1_slow(p, pe, a);
 }
 
+
 static int uv__idna_toascii_label(const char* s, const char* se,
                                   char** d, char* de) {
   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
@@ -267,7 +314,8 @@ static int uv__idna_toascii_label(const char* s, const char* se,
   return 0;
 }
 
-long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
+
+ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
   const char* si;
   const char* st;
   unsigned c;
@@ -313,3 +361,195 @@ long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
 
   return d - ds;  /* Number of bytes written. */
 }
+
+
+/* Count the 16-bit units needed to hold the NUL-terminated WTF-8 string
+ * |source_ptr| as UTF-16, terminating NUL included. Returns -1 as soon as
+ * uv__wtf8_decode1() rejects a sequence. */
+ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
+  size_t units;
+  int32_t cp;
+
+  units = 0;
+  for (;;) {
+    cp = uv__wtf8_decode1(&source_ptr);
+    if (cp < 0)
+      return -1;
+
+    /* Code points above U+FFFF take a surrogate pair, everything else one
+     * unit. */
+    units += (cp > 0xFFFF) ? 2 : 1;
+
+    /* The decoder leaves the pointer on the last byte it consumed; when that
+     * byte is the NUL, the terminator has just been counted and we are done.
+     */
+    if (*source_ptr == '\0')
+      break;
+    source_ptr++;
+  }
+
+  return units;
+}
+
+
+/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16 in
+ * |w_target|, terminating NUL included. |w_target_len| is the capacity of
+ * |w_target| in 16-bit units and must be exactly the value that
+ * uv_wtf8_length_as_utf16() returned for the same input. Invalid input or a
+ * mismatched length trips an assert; nothing is reported to the caller. */
+void uv_wtf8_to_utf16(const char* source_ptr,
+                      uint16_t* w_target,
+                      size_t w_target_len) {
+  int32_t code_point;
+
+  do {
+    code_point = uv__wtf8_decode1(&source_ptr);
+    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
+    assert(code_point >= 0);
+    /* Everything above the BMP, U+10000 itself included, needs a surrogate
+     * pair; this must agree with the count in uv_wtf8_length_as_utf16. */
+    if (code_point > 0xFFFF) {
+      assert(code_point <= 0x10FFFF);
+      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
+      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
+      w_target_len -= 2;
+    } else {
+      *w_target++ = code_point;
+      w_target_len -= 1;
+    }
+  } while (*source_ptr++);
+
+  (void) w_target_len;  /* Only read by the assert below. */
+  assert(w_target_len == 0);
+}
+
+
+/* Return the code point that starts at |w_source_ptr[0]|.
+ *
+ * When the first unit is a high surrogate (0xD800..0xDBFF), more than one
+ * unit is available (|w_source_len| != 1) and the next unit is a low
+ * surrogate (0xDC00..0xDFFF), the pair is combined into a supplementary code
+ * point and the caller must skip two units. In every other case the first
+ * unit is returned unchanged, unpaired surrogates included. */
+static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
+                                       ssize_t w_source_len) {
+  uint16_t lead;
+  uint16_t trail;
+
+  lead = *w_source_ptr;
+  if (lead < 0xD800 || lead > 0xDBFF)
+    return lead; /* not a high surrogate */
+  if (w_source_len == 1)
+    return lead; /* no following unit to pair with */
+
+  trail = w_source_ptr[1];
+  if (trail < 0xDC00 || trail > 0xDFFF)
+    return lead; /* unpaired high surrogate */
+
+  return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
+}
+
+
+/* Count the bytes needed to encode the UTF-16 input at |w_source_ptr| as
+ * WTF-8, not counting any terminating NUL. |w_source_len| is the input length
+ * in 16-bit units; a negative value means the input is NUL terminated and is
+ * scanned up to (but excluding) the first zero unit. */
+size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
+                               ssize_t w_source_len) {
+  size_t nbytes;
+  int32_t cp;
+  int consumed;
+
+  nbytes = 0;
+  for (; w_source_len != 0; ) {
+    cp = uv__get_surrogate_value(w_source_ptr, w_source_len);
+    /* Can be invalid UTF-8 but must be valid WTF-8. */
+    assert(cp >= 0);
+    if (w_source_len < 0 && cp == 0)
+      break; /* terminator of a NUL-terminated input */
+
+    consumed = 1;
+    if (cp < 0x80) {
+      nbytes += 1;
+    } else if (cp < 0x800) {
+      nbytes += 2;
+    } else if (cp < 0x10000) {
+      nbytes += 3;
+    } else {
+      /* A surrogate pair was combined: two input units, four output bytes. */
+      nbytes += 4;
+      consumed = 2;
+    }
+
+    w_source_ptr += consumed;
+    /* A negative length is the "NUL terminated" marker and is left alone. */
+    if (w_source_len > 0)
+      w_source_len -= consumed;
+  }
+
+  return nbytes;
+}
+
+
+/* Convert the UTF-16 input at |w_source_ptr| to NUL-terminated WTF-8.
+ * |w_source_len| is the input length in 16-bit units; a negative value means
+ * the input is NUL terminated.
+ *
+ * - |target_ptr| == NULL: nothing is converted; the required length is stored
+ *   in |*target_len_ptr| (when that is non-NULL) and 0 is returned.
+ * - |*target_ptr| == NULL: a buffer of the required length plus one byte for
+ *   the NUL is obtained from uv__malloc() and returned in |*target_ptr|;
+ *   UV_ENOMEM is returned if the allocation fails.
+ * - otherwise |*target_ptr| is the caller's buffer and |*target_len_ptr| must
+ *   hold its capacity, not counting the byte for the NUL that is always
+ *   written after the converted data.
+ *
+ * Returns 0 on success. Returns UV_ENOBUFS when the input did not fit, in
+ * which case |*target_len_ptr| (when non-NULL) receives the length needed for
+ * the whole input. */
+int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
+                     ssize_t w_source_len,
+                     char** target_ptr,
+                     size_t* target_len_ptr) {
+  size_t target_len;
+  char* target;
+  char* target_end;
+  int32_t code_point;
+
+  /* If *target_ptr is provided, then *target_len_ptr must be its length
+   * (excluding space for NUL), otherwise we will compute the target_len_ptr
+   * length and may return a new allocation in *target_ptr if target_ptr is
+   * provided. */
+  if (target_ptr == NULL || *target_ptr == NULL) {
+    target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
+    if (target_len_ptr != NULL)
+      *target_len_ptr = target_len;
+  } else {
+    target_len = *target_len_ptr;
+  }
+
+  /* Length-only query: the result was already stored above. */
+  if (target_ptr == NULL)
+    return 0;
+
+  if (*target_ptr == NULL) {
+    /* One extra byte for the terminating NUL. */
+    target = uv__malloc(target_len + 1);
+    if (target == NULL) {
+      return UV_ENOMEM;
+    }
+    *target_ptr = target;
+  } else {
+    target = *target_ptr;
+  }
+
+  /* One past the last byte that may hold converted data. */
+  target_end = target + target_len;
+
+  while (target != target_end && w_source_len) {
+    code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
+    /* Can be invalid UTF-8 but must be valid WTF-8. */
+    assert(code_point >= 0);
+    /* Terminator of a NUL-terminated input: all input has been consumed. */
+    if (w_source_len < 0 && code_point == 0) {
+      w_source_len = 0;
+      break;
+    }
+    /* Each `break` below leaves the loop when the buffer fills up in the
+     * middle of a multi-byte sequence. In that case neither target_len nor
+     * the source position is updated for the partially written character. */
+    if (code_point < 0x80) {
+      *target++ = code_point;
+    } else if (code_point < 0x800) {
+      *target++ = 0xC0 | (code_point >> 6);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | (code_point & 0x3F);
+    } else if (code_point < 0x10000) {
+      *target++ = 0xE0 | (code_point >> 12);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | (code_point & 0x3F);
+    } else {
+      *target++ = 0xF0 | (code_point >> 18);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | ((code_point >> 12) & 0x3F);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | ((code_point >> 6) & 0x3F);
+      if (target == target_end)
+        break;
+      *target++ = 0x80 | (code_point & 0x3F);
+      /* uv__get_surrogate_value consumed 2 input characters */
+      w_source_ptr++;
+      if (w_source_len > 0)
+        w_source_len--;
+    }
+    /* From here on target_len counts the bytes of completely written
+     * characters rather than the buffer capacity. */
+    target_len = target - *target_ptr;
+    w_source_ptr++;
+    if (w_source_len > 0)
+      w_source_len--;
+  }
+
+  if (target != target_end && target_len_ptr != NULL)
+    /* Did not fill all of the provided buffer, so update the target_len_ptr
+     * output with the space used. */
+    *target_len_ptr = target - *target_ptr;
+
+  /* Check if input fit into target exactly. */
+  if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
+    w_source_len = 0;
+
+  /* Always terminate; this uses the extra byte beyond the stated length. */
+  *target++ = '\0';
+
+  /* Characters remained after filling the buffer, compute the remaining length now. */
+  if (w_source_len) {
+    if (target_len_ptr != NULL)
+      *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
+    return UV_ENOBUFS;
+  }
+
+  return 0;
+}
diff --git a/src/idna.h b/src/idna.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
+/* Copyright libuv contributors. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -26,6 +26,6 @@ unsigned uv__utf8_decode1(const char** p, const char* pe);
  * is the number of bytes written to |d|, including the trailing nul byte.
  * A return value < 0 is a libuv error code. |s| and |d| can not overlap.
  */
-long uv__idna_toascii(const char* s, const char* se, char* d, char* de);
+ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de);
 
 #endif  /* UV_SRC_IDNA_H_ */
diff --git a/src/win/dl.c b/src/win/dl.c
@@ -27,18 +27,17 @@ static int uv__dlerror(uv_lib_t* lib, const char* filename, DWORD errorno);
 
 int uv_dlopen(const char* filename, uv_lib_t* lib) {
   WCHAR filename_w[32768];
+  ssize_t r;
 
   lib->handle = NULL;
   lib->errmsg = NULL;
 
-  if (!MultiByteToWideChar(CP_UTF8,
-                           0,
-                           filename,
-                           -1,
-                           filename_w,
-                           ARRAY_SIZE(filename_w))) {
-    return uv__dlerror(lib, filename, GetLastError());
-  }
+  r = uv_wtf8_length_as_utf16(filename);
+  if (r < 0)
+    return uv__dlerror(lib, filename, ERROR_NO_UNICODE_TRANSLATION);
+  if ((size_t) r > ARRAY_SIZE(filename_w))
+    return uv__dlerror(lib, filename, ERROR_INSUFFICIENT_BUFFER);
+  uv_wtf8_to_utf16(filename, filename_w, r);
 
   lib->handle = LoadLibraryExW(filename_w, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
   if (lib->handle == NULL) {