Skip to content

Commit

Permalink
uri: speedup encode and decode functions
Browse files Browse the repository at this point in the history
This patch replaces the URI encoding and decoding functions written in Lua
with equivalent functions implemented in C.

Performance of Lua implementation (before the patch):

```
uri.escape   152.37  runs/sec
uri.unescape 263.44  runs/sec
```

Performance of C implementation (after the patch):

```
uri.escape   4983.03  runs/sec
uri.unescape 4197.19  runs/sec
```

Follows up tarantool#3682

NO_CHANGELOG=see previous commit
NO_DOC=see previous commit

Co-authored-by: Alexander Turenko <alexander.turenko@tarantool.org>
  • Loading branch information
ligurio and Totktonada committed Dec 25, 2022
1 parent a017495 commit dfc6d1a
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 24 deletions.
2 changes: 2 additions & 0 deletions extra/exports
Expand Up @@ -549,6 +549,8 @@ tt_uuid_to_string
uri_destroy
uri_format
uri_set_destroy
uri_escape
uri_unescape
uuid_nil
uuid_unpack
_say
Expand Down
65 changes: 65 additions & 0 deletions src/lib/uri/uri.c
Expand Up @@ -3,6 +3,7 @@
*
* Copyright 2010-2021, Tarantool AUTHORS, please see AUTHORS file.
*/
#include <ctype.h>
#include "uri.h"
#include "uri_parser.h"
#include "trivia/util.h"
Expand Down Expand Up @@ -371,3 +372,67 @@ uri_set_create(struct uri_set *uri_set, const char *str)
uri_set->uri_count = 0;
return -1;
}

/**
 * String percent-encoding.
 *
 * Reads @a src_size bytes from @a src and writes the encoded form to
 * @a dst. A byte whose entry in @a unreserved is zero is written as a
 * "%XX" triplet with uppercase hex digits; any other byte is copied
 * verbatim. When @a encode_plus is true a space is written as '+'
 * before the @a unreserved table is consulted.
 *
 * The output is NOT NUL-terminated. In the worst case every byte is
 * encoded, so @a dst must have room for 3 * src_size bytes.
 *
 * @param src          input bytes.
 * @param src_size     number of bytes to read from @a src.
 * @param dst          output buffer.
 * @param unreserved   lookup table indexed by byte value, non-zero
 *                     means "copy this byte as is".
 * @param encode_plus  encode ' ' as '+'.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_escape(const char *src, size_t src_size, char *dst,
	   const unsigned char unreserved[256], bool encode_plus)
{
	static const char hex[] = "0123456789ABCDEF";
	/*
	 * The write position is a size_t: the output may be three
	 * times larger than the input, which does not fit into an int
	 * for big inputs.
	 */
	size_t pos = 0;
	for (size_t i = 0; i < src_size; i++) {
		unsigned char ch = (unsigned char)src[i];
		if (ch == ' ' && encode_plus) {
			dst[pos++] = '+';
		} else if (!unreserved[ch]) {
			dst[pos++] = '%';
			dst[pos++] = hex[ch >> 4];
			dst[pos++] = hex[ch & 15];
		} else {
			dst[pos++] = src[i];
		}
	}
	return pos;
}

/**
 * Maps one hexadecimal digit character to its numeric value.
 *
 * Decimal digits give 0..9. Any other character is lowercased and
 * measured from 'a', so 'a'..'f' and 'A'..'F' give 10..15. The
 * argument is not validated here.
 */
static char
hex2ch(unsigned char ch)
{
	if (isdigit(ch))
		return ch - '0';
	return tolower(ch) - 'a' + 10;
}

/**
 * String percent-decoding.
 *
 * Reads @a src_size bytes from @a src and writes the decoded form to
 * @a dst. A '%' followed by two hexadecimal digits becomes the single
 * byte they denote. A '%' that is not followed by two hexadecimal
 * digits (including one at the very end of the input) is copied as
 * is. When @a decode_plus is true a '+' is written as a space. Every
 * other byte is copied verbatim.
 *
 * The output is NOT NUL-terminated and is never longer than the
 * input, so @a dst must have room for src_size bytes.
 *
 * @param src          input bytes.
 * @param src_size     number of bytes to read from @a src.
 * @param dst          output buffer.
 * @param decode_plus  decode '+' as ' '.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_unescape(const char *src, size_t src_size, char *dst, bool decode_plus)
{
	char *dst_buf = dst;
	const char *src_end = src + src_size;
	for (const char *p = src; p < src_end; ++p) {
		/*
		 * Compare the remaining length instead of evaluating
		 * p + 2: forming a pointer more than one element past
		 * the end of the buffer is undefined behavior even if
		 * it is never dereferenced.
		 */
		size_t left = (size_t)(src_end - p);
		if (*p == '%' && left > 2 &&
		    isxdigit((unsigned char)p[1]) &&
		    isxdigit((unsigned char)p[2])) {
			*dst_buf++ = hex2ch(p[1]) << 4 | hex2ch(p[2]);
			p += 2;
		} else if (decode_plus && *p == '+') {
			*dst_buf++ = ' ';
		} else {
			/* Covers a malformed '%' too: it is kept as is. */
			*dst_buf++ = *p;
		}
	}
	return (size_t)(dst_buf - dst);
}
13 changes: 13 additions & 0 deletions src/lib/uri/uri.h
Expand Up @@ -141,6 +141,19 @@ uri_param(const struct uri *uri, const char *name, int idx);
int
uri_param_count(const struct uri *uri, const char *name);

/**
 * String percent-encoding.
 *
 * Encodes @a src_size bytes of @a src into @a dst: a byte whose
 * @a unreserved entry is zero is written as "%XX" (uppercase hex),
 * any other byte is copied as is. When @a plus is true a space is
 * written as '+'. The output is not NUL-terminated; @a dst must have
 * room for 3 * src_size bytes in the worst case.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_escape(const char *src, size_t src_size, char *dst,
const unsigned char unreserved[256], bool plus);

/**
 * String percent-decoding.
 *
 * Decodes @a src_size bytes of @a src into @a dst: "%XX" with two
 * hexadecimal digits becomes one byte, a '%' without two hexadecimal
 * digits after it is copied as is. When @a decode_plus is true a '+'
 * is written as a space. The output is not NUL-terminated and is
 * never longer than the input.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_unescape(const char *src, size_t src_size, char *dst, bool decode_plus);

#if defined(__cplusplus)
} /* extern "C" */
#endif /* defined(__cplusplus) */
40 changes: 17 additions & 23 deletions src/lua/uri.lua
Expand Up @@ -47,6 +47,13 @@ uri_set_destroy(struct uri_set *uri_set);
int
uri_format(char *str, size_t len, struct uri *uri, bool write_password);
size_t
uri_escape(const char *src, size_t src_size, char *dst,
const unsigned char unreserved[256], bool plus);
size_t
uri_unescape(const char *src, size_t src_size, char *dst, bool decode_plus);
]]

pcall(ffi.cdef, uri_cdef) -- Required for running unit tests.
Expand Down Expand Up @@ -234,10 +241,6 @@ local function format(uri, write_password)
return str
end

local char_to_hex = function(c)
return string.format("%%%02X", string.byte(c))
end

-- Encodes a string into its escaped hexadecimal representation.
local function escape(buf, opts)
if type(buf) ~= "string" then
Expand All @@ -258,19 +261,12 @@ local function escape(buf, opts)
error("opts.plus must be a boolean")
end

return (buf:gsub("(.)", function(ch)
if options.plus == true and ch == " " then
return "+"
elseif options.unreserved[string.byte(ch)] == 1 then
return ch
else
return char_to_hex(ch)
end
end))
end

local hex_to_char = function(x)
return string.char(tonumber(x, 16))
-- The worst case is when all characters are encoded.
local dst = ffi.new("char[?]", #buf * 3)
local dst_size = builtin.uri_escape(buf, #buf, dst,
options.unreserved,
options.plus)
return ffi.string(dst, dst_size)
end

-- Decodes an escaped hexadecimal string into its binary representation.
Expand All @@ -290,12 +286,10 @@ local function unescape(buf, opts)
error("opts.plus must be a boolean")
end

local str = buf
if options.plus == true then
str = str:gsub("+", " ")
end
str = str:gsub("%%(%x%x)", hex_to_char)
return str
-- The worst case is when all characters were not decoded.
local dst = ffi.new("char[?]", #buf)
local dst_size = builtin.uri_unescape(buf, #buf, dst, options.plus)
return ffi.string(dst, dst_size)
end

local function encode_kv(key, values, res)
Expand Down
199 changes: 198 additions & 1 deletion test/unit/uri.c
Expand Up @@ -164,6 +164,13 @@ struct uri_set_expected {
struct uri_expected uris[URI_MAX];
};

/** One escape/unescape test case. */
struct str_escape {
/** Plain (unescaped) form of the string. */
const char *str;
/** Expected percent-encoded form of the string. */
const char *escaped;
/** Characters treated as unreserved, see unreserved_tbl(). */
const char *unreserved;
/** Value passed as the encode_plus / decode_plus argument. */
bool plus;
};

static int
uri_param_expected_check(const struct uri_param_expected *param,
const struct uri *uri)
Expand Down Expand Up @@ -583,16 +590,206 @@ test_invalid_string_uri_set(void)
footer();
}

#define RFC3986_unreserved "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~"

/**
 * Cases shared by test_escape() and test_unescape(): .str is the plain
 * string and .escaped is its percent-encoded form for the given
 * .unreserved set. Every case here uses .plus = false.
 */
const struct str_escape escape_testcase[] = {
[0] = {
.str = "-._~",
.escaped = "-._~",
.unreserved = RFC3986_unreserved,
.plus = false,
},
[1] = {
.str = "0123456789",
.escaped = "0123456789",
.unreserved = RFC3986_unreserved,
.plus = false,
},
[2] = {
.str = "abcdefghijklm",
.escaped = "abcdefghijklm",
.unreserved = RFC3986_unreserved,
.plus = false,
},
[3] = {
.str = "nopqrstuvwxyz",
.escaped = "nopqrstuvwxyz",
.unreserved = RFC3986_unreserved,
.plus = false,
},
[4] = {
.str = "ABCDEFGHIJKLM",
.escaped = "ABCDEFGHIJKLM",
.unreserved = RFC3986_unreserved,
.plus = false,
},
[5] = {
.str = "NOPQRSTUVWXYZ",
.escaped = "NOPQRSTUVWXYZ",
.unreserved = RFC3986_unreserved,
.plus = false,
},
/* None of these characters is in the unreserved set: all encoded. */
[6] = {
.str = "!$&'()*+,;=",
.escaped = "%21%24%26%27%28%29%2A%2B%2C%3B%3D",
.unreserved = RFC3986_unreserved,
.plus = false,
},
};

/**
 * Fills a 256-entry lookup table from a NUL-terminated list of
 * unreserved characters: the entry of every listed character is set
 * to 1, all other entries are set to 0.
 *
 * uri.unreserved() is implemented as a Lua function, so this helper
 * stands in for it in the unit test.
 */
static void
unreserved_tbl(const char *str, unsigned char unreserved[256])
{
	int idx = 256;
	while (idx-- > 0)
		unreserved[idx] = 0;

	const unsigned char *cur = (const unsigned char *)str;
	while (*cur != '\0')
		unreserved[*cur++] = 1;
}

/**
 * Runs uri_escape() over every entry of escape_testcase and checks
 * the returned size, the length of the produced string and its
 * content against the expected escaped form.
 */
static void
test_escape(void)
{
	header();
	plan(lengthof(escape_testcase) * 3);
	unsigned char unreserved[256];
	for (unsigned i = 0; i < lengthof(escape_testcase); i++) {
		const char *unescaped = escape_testcase[i].str;
		const char *escaped = escape_testcase[i].escaped;
		bool plus = escape_testcase[i].plus;
		/*
		 * Up to three output bytes per input byte. The extra
		 * zeroed byte terminates the result for strlen() and
		 * "%s", because uri_escape() does not write a NUL.
		 * Note sizeof(char), not sizeof("char"): the latter is
		 * the size of a string literal.
		 */
		char *dst = xcalloc(strlen(unescaped) * 3 + 1, sizeof(char));
		unreserved_tbl(escape_testcase[i].unreserved, unreserved);
		size_t dst_size = uri_escape(unescaped, strlen(unescaped),
					     dst, unreserved, plus);
		is(dst_size, strlen(escaped),
		   "escaped string ('%s') length != %zu", dst,
		   strlen(escaped));
		is(strlen(dst), strlen(escaped),
		   "escaped string ('%s') length != %zu", dst,
		   strlen(escaped));
		is(memcmp(escaped, dst, dst_size), 0,
		   "escape: '%s' == '%s'", escaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Runs uri_unescape() over the escaped form of every entry of
 * escape_testcase and checks the returned size, the length of the
 * produced string and its content against the plain form.
 */
static void
test_unescape(void)
{
	header();
	plan(lengthof(escape_testcase) * 3);
	for (unsigned i = 0; i < lengthof(escape_testcase); i++) {
		const char *unescaped = escape_testcase[i].str;
		const char *escaped = escape_testcase[i].escaped;
		bool decode_plus = escape_testcase[i].plus;
		/*
		 * The extra zeroed byte terminates the result for
		 * strlen() and "%s", because uri_unescape() does not
		 * write a NUL. Note sizeof(char), not sizeof("char"):
		 * the latter is the size of a string literal.
		 */
		char *dst = xcalloc(strlen(unescaped) + 1, sizeof(char));
		size_t dst_size = uri_unescape(escaped, strlen(escaped),
					       dst, decode_plus);
		is(dst_size, strlen(unescaped),
		   "unescaped string ('%s') length != %zu", dst,
		   strlen(unescaped));
		is(strlen(dst), strlen(unescaped),
		   "unescaped string ('%s') length != %zu", dst,
		   strlen(unescaped));
		is(memcmp(dst, unescaped, dst_size), 0,
		   "unescape: '%s' == '%s'", unescaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Malformed percent sequences for test_unescape_special_cases().
 * In every case .escaped equals .str: the input is expected to pass
 * through uri_unescape() unchanged. The .unreserved field is not read
 * by that test.
 */
const struct str_escape unescape_testcase[] = {
/* Special case: %<non-hex><non-hex> */
[0] = {
.str = "%##",
.escaped = "%##",
.unreserved = "%%#",
.plus = false,
},
/* Special case: %<hex><non-hex> */
[1] = {
.str = "%A$",
.escaped = "%A$",
.unreserved = "%%A$",
.plus = false,
},
/* Special case: %<non-hex><hex> */
[2] = {
.str = "%$A",
.escaped = "%$A",
.unreserved = "%%$A",
.plus = false,
},
/* Special case: %<EOS> (<EOS> -- the end of a string) */
[3] = {
.str = "%",
.escaped = "%",
.unreserved = "%%",
.plus = false,
},
/* Special case: %<hex><EOS> (<EOS> -- the end of a string) */
[4] = {
.str = "%A",
.escaped = "%A",
.unreserved = "%%A",
.plus = false,
},
/* Special case: %<non-hex><EOS> (<EOS> -- the end of a string) */
[5] = {
.str = "%&",
.escaped = "%&",
.unreserved = "%%&",
.plus = false,
},
};

/**
 * Runs uri_unescape() over every entry of unescape_testcase
 * (malformed percent sequences) and checks the returned size, the
 * length of the produced string and its content.
 */
static void
test_unescape_special_cases(void)
{
	header();
	plan(lengthof(unescape_testcase) * 3);
	for (unsigned i = 0; i < lengthof(unescape_testcase); i++) {
		const char *unescaped = unescape_testcase[i].str;
		const char *escaped = unescape_testcase[i].escaped;
		/*
		 * Take the flag from the table being iterated, not
		 * from escape_testcase, which is a different array
		 * with its own length.
		 */
		bool decode_plus = unescape_testcase[i].plus;
		/*
		 * The extra zeroed byte terminates the result for
		 * strlen() and "%s", because uri_unescape() does not
		 * write a NUL. Note sizeof(char), not sizeof("char"):
		 * the latter is the size of a string literal.
		 */
		char *dst = xcalloc(strlen(unescaped) + 1, sizeof(char));
		size_t dst_size = uri_unescape(escaped, strlen(escaped),
					       dst, decode_plus);
		is(dst_size, strlen(unescaped),
		   "unescaped string ('%s') length != %zu", dst,
		   strlen(unescaped));
		is(strlen(dst), strlen(unescaped),
		   "unescaped string ('%s') length != %zu", dst,
		   strlen(unescaped));
		is(memcmp(dst, unescaped, dst_size), 0,
		   "unescape: '%s' == '%s'", unescaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Test entry point: runs every sub-test and returns the result of
 * check_plan().
 */
int
main(void)
{
	/* One planned check per sub-test called below. */
	plan(10);
	test_copy_sample();
	test_copy_empty();
	test_move_sample();
	test_move_empty();
	test_string_uri_with_query_params_parse();
	test_string_uri_set_with_query_params_parse();
	test_invalid_string_uri_set();
	test_escape();
	test_unescape();
	test_unescape_special_cases();
	return check_plan();
}

0 comments on commit dfc6d1a

Please sign in to comment.