Skip to content

Commit

Permalink
uri: speedup encode and decode functions
Browse files Browse the repository at this point in the history
This patch replaces the encoding and decoding functions written in Lua with
functions implemented in C.

Performance of Lua implementation (before the patch):

```
uri.escape 159859.451617 ops/sec
uri.unescape 322307.411753 ops/sec
```

Performance of C implementation (after the patch):

```
uri.escape 4573385.455594 ops/sec
uri.unescape 4456735.138300 ops/sec
```

Follows up tarantool#3682

NO_CHANGELOG=see previous commit
NO_DOC=see previous commit
  • Loading branch information
ligurio committed Dec 23, 2022
1 parent 88ab118 commit f56981d
Show file tree
Hide file tree
Showing 5 changed files with 283 additions and 24 deletions.
2 changes: 2 additions & 0 deletions extra/exports
Expand Up @@ -549,6 +549,8 @@ tt_uuid_to_string
uri_destroy
uri_format
uri_set_destroy
uri_escape
uri_unescape
uuid_nil
uuid_unpack
_say
Expand Down
66 changes: 66 additions & 0 deletions src/lib/uri/uri.c
Expand Up @@ -3,6 +3,7 @@
*
* Copyright 2010-2021, Tarantool AUTHORS, please see AUTHORS file.
*/
#include <ctype.h>
#include "uri.h"
#include "uri_parser.h"
#include "trivia/util.h"
Expand Down Expand Up @@ -371,3 +372,68 @@ uri_set_create(struct uri_set *uri_set, const char *str)
uri_set->uri_count = 0;
return -1;
}

/**
 * String percent-encoding.
 *
 * Encodes @a src_size bytes of @a src into @a dst. A byte whose entry in
 * @a unreserved is non-zero is copied as is, any other byte is written
 * as '%' followed by two uppercase hexadecimal digits. When
 * @a encode_plus is true a space is written as '+' instead (this check
 * takes precedence over the @a unreserved table).
 *
 * The output is NOT zero-terminated. @a dst must have room for
 * 3 * @a src_size bytes, which is the worst case when every byte is
 * encoded.
 *
 * @param src source bytes, may contain zero bytes.
 * @param src_size number of bytes to read from @a src.
 * @param dst destination buffer.
 * @param unreserved lookup table indexed by byte value.
 * @param encode_plus encode ' ' as '+'.
 * @return number of bytes written to @a dst.
 */
size_t
uri_escape(const char *src, size_t src_size, char *dst,
	   const unsigned char unreserved[256], bool encode_plus)
{
	static const char hex[] = "0123456789ABCDEF";
	/*
	 * The output offset is size_t: the result may be three times
	 * larger than the input, so a signed int could overflow.
	 */
	size_t pos = 0;
	for (size_t i = 0; i < src_size; i++) {
		unsigned char ch = (unsigned char)src[i];
		if (ch == ' ' && encode_plus) {
			dst[pos++] = '+';
		} else if (!unreserved[ch]) {
			dst[pos++] = '%';
			dst[pos++] = hex[ch >> 4];
			dst[pos++] = hex[ch & 15];
		} else {
			dst[pos++] = (char)ch;
		}
	}
	return pos;
}

/**
 * Converts a hex character to its integer value (0..15).
 * The caller must pass a character for which isxdigit() is true,
 * the result is meaningless otherwise.
 */
static char
hex2ch(unsigned char ch)
{
	return isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10;
}

/**
 * String percent-decoding.
 *
 * Decodes the zero-terminated string @a src into @a dst. A '%'
 * followed by two hexadecimal digits is replaced by the byte they
 * encode; a '%' that is not followed by two hexadecimal digits is
 * copied as is. When @a decode_plus is true a '+' is replaced by
 * a space.
 *
 * The output is NOT zero-terminated. @a dst must have room for
 * strlen(@a src) bytes, which is the worst case when nothing is
 * decoded.
 *
 * @param src zero-terminated source string.
 * @param dst destination buffer.
 * @param decode_plus decode '+' as ' '.
 * @return number of bytes written to @a dst.
 */
size_t
uri_unescape(const char *src, char *dst, bool decode_plus)
{
	const char *pstr = src;
	char *pbuf = dst;
	while (*pstr != '\0') {
		if (*pstr == '%') {
			/*
			 * The second digit is inspected only if the first
			 * one is a hex digit (and thus not the terminating
			 * zero), so a trailing '%' never causes a read
			 * past the end of the string.
			 */
			if (isxdigit((unsigned char)pstr[1]) &&
			    isxdigit((unsigned char)pstr[2])) {
				*pbuf++ = (char)(hex2ch((unsigned char)pstr[1]) << 4 |
						 hex2ch((unsigned char)pstr[2]));
				pstr += 2;
			} else {
				*pbuf++ = '%';
			}
		} else if (*pstr == '+' && decode_plus) {
			*pbuf++ = ' ';
		} else {
			*pbuf++ = *pstr;
		}
		pstr++;
	}
	return (size_t)(pbuf - dst);
}
13 changes: 13 additions & 0 deletions src/lib/uri/uri.h
Expand Up @@ -141,6 +141,19 @@ uri_param(const struct uri *uri, const char *name, int idx);
int
uri_param_count(const struct uri *uri, const char *name);

/**
 * String percent-encoding.
 *
 * Encodes @a src_size bytes of @a src into @a dst: bytes marked in
 * @a unreserved are copied as is, other bytes are written as "%XX".
 * When @a plus is true a space is written as '+'.
 *
 * The output is not zero-terminated; @a dst must have room for
 * 3 * @a src_size bytes.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_escape(const char *src, size_t src_size, char *dst,
const unsigned char unreserved[256], bool plus);

/**
 * String percent-decoding.
 *
 * Decodes the zero-terminated string @a src into @a dst: "%XX" with two
 * hexadecimal digits becomes a single byte, any other '%' is copied as
 * is. When @a decode_plus is true a '+' becomes a space.
 *
 * The output is not zero-terminated; @a dst must have room for
 * strlen(@a src) bytes.
 *
 * @return number of bytes written to @a dst.
 */
size_t
uri_unescape(const char *src, char *dst, bool decode_plus);

#if defined(__cplusplus)
} /* extern "C" */
#endif /* defined(__cplusplus) */
40 changes: 17 additions & 23 deletions src/lua/uri.lua
Expand Up @@ -47,6 +47,13 @@ uri_set_destroy(struct uri_set *uri_set);
int
uri_format(char *str, size_t len, struct uri *uri, bool write_password);
size_t
uri_escape(const char *src, size_t src_size, char *dst,
const unsigned char unreserved[256], bool plus);
size_t
uri_unescape(const char *src, char *dst, bool decode_plus);
]]

pcall(ffi.cdef, uri_cdef) -- Required for running unit tests.
Expand Down Expand Up @@ -234,10 +241,6 @@ local function format(uri, write_password)
return str
end

local char_to_hex = function(c)
return string.format("%%%02X", string.byte(c))
end

-- Encodes a string into its escaped hexadecimal representation.
local function escape(buf, opts)
if type(buf) ~= "string" then
Expand All @@ -258,19 +261,12 @@ local function escape(buf, opts)
error("opts.plus must be a boolean")
end

return (buf:gsub("(.)", function(ch)
if options.plus == true and ch == " " then
return "+"
elseif options.unreserved[string.byte(ch)] == 1 then
return ch
else
return char_to_hex(ch)
end
end))
end

local hex_to_char = function(x)
return string.char(tonumber(x, 16))
-- The worst case is when all characters are encoded.
local dst = ffi.new("char[?]", #buf * 3)
local dst_size = builtin.uri_escape(buf, #buf, dst,
options.unreserved,
options.plus)
return ffi.string(dst, dst_size)
end

-- Decodes an escaped hexadecimal string into its binary representation.
Expand All @@ -290,12 +286,10 @@ local function unescape(buf, opts)
error("opts.plus must be a boolean")
end

local str = buf
if options.plus == true then
str = str:gsub("+", " ")
end
str = str:gsub("%%(%x%x)", hex_to_char)
return str
-- The worst case is when no characters are decoded.
local dst = ffi.new("char[?]", #buf)
local dst_size = builtin.uri_unescape(buf, dst, options.plus)
return ffi.string(dst, dst_size)
end

local function encode_kv(key, values, res)
Expand Down
186 changes: 185 additions & 1 deletion test/unit/uri.c
Expand Up @@ -164,6 +164,13 @@ struct uri_set_expected {
struct uri_expected uris[URI_MAX];
};

/** A single test case for uri_escape() / uri_unescape(). */
struct str_escape {
/** Plain (unescaped) string. */
const char *str;
/** Expected percent-encoded form of @a str. */
const char *escaped;
/** Characters passed to unreserved_tbl() to build the lookup table. */
const char *unreserved;
/** Value passed as the encode_plus / decode_plus argument. */
bool plus;
};

static int
uri_param_expected_check(const struct uri_param_expected *param,
const struct uri *uri)
Expand Down Expand Up @@ -583,16 +590,193 @@ test_invalid_string_uri_set(void)
footer();
}

/** Unreserved characters: letters, digits and "-._~". */
#define RFC3986_unreserved "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~"

/**
 * Test cases shared by test_escape() and test_unescape(): each entry
 * must round-trip, i.e. escaping .str gives .escaped and unescaping
 * .escaped gives .str.
 */
const struct str_escape escape_testcase[] = {
	[0] = {
		.str = "-._~",
		.escaped = "-._~",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[1] = {
		.str = "0123456789",
		.escaped = "0123456789",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[2] = {
		.str = "abcdefghijklm",
		.escaped = "abcdefghijklm",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[3] = {
		.str = "nopqrstuvwxyz",
		.escaped = "nopqrstuvwxyz",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[4] = {
		.str = "ABCDEFGHIJKLM",
		.escaped = "ABCDEFGHIJKLM",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[5] = {
		.str = "NOPQRSTUVWXYZ",
		.escaped = "NOPQRSTUVWXYZ",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	[6] = {
		.str = "!$&'()*+,;=",
		.escaped = "%21%24%26%27%28%29%2A%2B%2C%3B%3D",
		.unreserved = RFC3986_unreserved,
		.plus = false,
	},
	/* Space is encoded as '+' (and decoded back) when plus is set. */
	[7] = {
		.str = "a b",
		.escaped = "a+b",
		.unreserved = RFC3986_unreserved,
		.plus = true,
	},
};

/**
 * Fills the 256-entry lookup table @a unreserved: an entry is set to 1
 * if the byte with that value occurs in the zero-terminated string
 * @a str and to 0 otherwise.
 * uri.unreserved() is implemented as a Lua function, this helper
 * stands in for it in the unit test.
 */
static void
unreserved_tbl(const char *str, unsigned char unreserved[256])
{
	int idx = 0;
	while (idx < 256)
		unreserved[idx++] = 0;

	const unsigned char *cur = (const unsigned char *)str;
	while (*cur != '\0') {
		unreserved[*cur] = 1;
		cur++;
	}
}

/**
 * Checks uri_escape() against every entry of escape_testcase:
 * the result must be non-empty and equal to the expected escaped string.
 */
static void
test_escape(void)
{
	header();
	plan(lengthof(escape_testcase) * 2);
	unsigned char unreserved[256];
	for (unsigned i = 0; i < lengthof(escape_testcase); i++) {
		const char *unescaped = escape_testcase[i].str;
		const char *escaped = escape_testcase[i].escaped;
		bool plus = escape_testcase[i].plus;
		size_t src_size = strlen(unescaped);
		/*
		 * Worst case is 3 output bytes per input byte.
		 * uri_escape() does not terminate the output, so one
		 * extra zeroed byte is reserved for strcmp() below.
		 */
		char *dst = xcalloc(src_size * 3 + 1, sizeof(char));
		unreserved_tbl(escape_testcase[i].unreserved, unreserved);
		size_t dst_size = uri_escape(unescaped, src_size,
					     dst, unreserved, plus);
		isnt(dst_size, 0, "escaped string ('%s') length != 0", dst);
		is(strcmp(escaped, dst), 0, "escape: '%s' == '%s'",
		   escaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Checks uri_unescape() against every entry of escape_testcase:
 * the result must be non-empty and equal to the original plain string.
 */
static void
test_unescape(void)
{
	header();
	plan(lengthof(escape_testcase) * 2);
	for (unsigned i = 0; i < lengthof(escape_testcase); i++) {
		const char *unescaped = escape_testcase[i].str;
		const char *escaped = escape_testcase[i].escaped;
		bool decode_plus = escape_testcase[i].plus;
		/*
		 * Worst case is when nothing is decoded, so the buffer
		 * is sized by the input. uri_unescape() does not
		 * terminate the output, so one extra zeroed byte is
		 * reserved for strcmp() below.
		 */
		char *dst = xcalloc(strlen(escaped) + 1, sizeof(char));
		size_t dst_size = uri_unescape(escaped, dst, decode_plus);
		isnt(dst_size, 0, "unescaped string ('%s') length != 0",
		     dst);
		is(strcmp(dst, unescaped), 0, "unescape: '%s' == '%s'",
		   unescaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Inputs with a '%' that does not start a valid "%XX" sequence.
 * In every entry .str and .escaped are the same string.
 * Entries are listed in index order 0..5.
 */
const struct str_escape unescape_testcase[] = {
	/* 0: %<non-hex><non-hex> */
	{
		.str = "%##",
		.escaped = "%##",
		.unreserved = "%%#",
		.plus = false,
	},
	/* 1: %<hex><non-hex> */
	{
		.str = "%A$",
		.escaped = "%A$",
		.unreserved = "%%A$",
		.plus = false,
	},
	/* 2: %<non-hex><hex> */
	{
		.str = "%$A",
		.escaped = "%$A",
		.unreserved = "%%$A",
		.plus = false,
	},
	/* 3: %<EOS> (<EOS> -- the end of a string) */
	{
		.str = "%",
		.escaped = "%",
		.unreserved = "%%",
		.plus = false,
	},
	/* 4: %<hex><EOS> */
	{
		.str = "%A",
		.escaped = "%A",
		.unreserved = "%%A",
		.plus = false,
	},
	/* 5: %<non-hex><EOS> */
	{
		.str = "%&",
		.escaped = "%&",
		.unreserved = "%%&",
		.plus = false,
	},
};

/**
 * Checks uri_unescape() against every entry of unescape_testcase:
 * the result must be non-empty and equal to the expected string.
 */
static void
test_unescape_special_cases(void)
{
	header();
	plan(lengthof(unescape_testcase) * 2);
	for (unsigned i = 0; i < lengthof(unescape_testcase); i++) {
		const char *unescaped = unescape_testcase[i].str;
		const char *escaped = unescape_testcase[i].escaped;
		/* Take the flag from the table this test iterates over. */
		bool decode_plus = unescape_testcase[i].plus;
		/*
		 * Worst case is when nothing is decoded, so the buffer
		 * is sized by the input. uri_unescape() does not
		 * terminate the output, so one extra zeroed byte is
		 * reserved for strcmp() below.
		 */
		char *dst = xcalloc(strlen(escaped) + 1, sizeof(char));
		size_t dst_size = uri_unescape(escaped, dst, decode_plus);
		isnt(dst_size, 0, "unescaped string ('%s') length != 0",
		     dst);
		is(strcmp(dst, unescaped), 0, "unescape: '%s' == '%s'",
		   unescaped, dst);
		free(dst);
	}
	check_plan();
	footer();
}

/**
 * Test entry point: runs every test group and returns the result of
 * check_plan().
 */
int
main(void)
{
	/* One planned check per test group called below. */
	plan(10);
	test_copy_sample();
	test_copy_empty();
	test_move_sample();
	test_move_empty();
	test_string_uri_with_query_params_parse();
	test_string_uri_set_with_query_params_parse();
	test_invalid_string_uri_set();
	test_escape();
	test_unescape();
	test_unescape_special_cases();
	return check_plan();
}

0 comments on commit f56981d

Please sign in to comment.