Permalink
Browse files

convert non-ASCII characters in json2 decoding

Non-ASCII characters written as \uXXXX escapes in JSON strings are now decoded to lists of UTF-8 bytes.
  • Loading branch information...
1 parent e549f86 commit 2e86da12b3ecc4482fd81d454cb4d70bc85269fa @vinoski vinoski committed May 20, 2014
Showing with 6 additions and 2 deletions.
  1. +3 −2 src/json2.erl
  2. +3 −0 test/t2/app_test.erl
View
@@ -222,7 +222,7 @@ scan_string(eof, _, X) -> {done, {error, missing_close_quote}, X};
scan_string([$" | Rest], A, _) -> {done, {ok, lists:reverse(A)}, Rest};
scan_string([$\\], _, X) -> {more, X};
scan_string([$\\, $u, U1, U2, U3, U4 | Rest], A, X) ->
- scan_string(Rest, [uni_char([U1, U2, U3, U4]) | A], X);
+ scan_string(Rest, lists:reverse(uni_char([U1, U2, U3, U4]))++A, X);
scan_string([$\\, $u | _], _, X) -> {more, X};
scan_string([$\\, C | Rest], A, X) ->
scan_string(Rest, [esc_to_char(C) | A], X);
@@ -232,7 +232,8 @@ scan_string([C | Rest], A, X) ->
%% Given a list of hex characters, convert to the corresponding integer.
uni_char(HexList) ->
- erlang:list_to_integer(HexList, 16).
+ UC = erlang:list_to_integer(HexList, 16),
+ binary_to_list(unicode:characters_to_binary([UC],utf8)).
esc_to_char($") -> $";
esc_to_char($/) -> $/;
@@ -322,6 +322,9 @@ test_json() ->
io:format(" encode/decode\n", []),
?line {ok,{response,[19]}} = jsonrpc:call(?JSON_URI, [],
{call, "subtract", [42, 23]}),
+ UStr = "{ \"origfilename\":\"Acronyms \\u2013 April 2014.pptx\" }",
+ ?line {ok, {struct,[{"origfilename",US}]}} = json2:decode_string(UStr),
+ ?line iolist_to_binary(US), % must not cause a badarg exception
io:format(" param obj1\n", []),
?line ok = do_json({struct, [{"jsonrpc", "2.0"},
{"method", "subtract"},

2 comments on commit 2e86da1

@NicoK
NicoK commented on 2e86da1 Apr 27, 2015

Can you elaborate a bit on why this is needed?

I'd expect non-ASCII characters to be mapped to their corresponding Erlang representations, i.e. integers larger than 255, to be compatible with Erlang applications. If the application needs a binary representation, it can do the conversion itself. That's at least what I understand from the json2.erl documentation (despite its mix of UTF-8 and UTF-16):

%%% Character Sets: the external representation, and the internal
%%% representation of strings, are lists of UTF-8 code units.
...
%%% Strings: If we represented JSON string data as Erlang binaries,
%%% we would have to choose a particular unicode format. Instead,
%%% we use lists of UTF-16 code units, which applications may then
%%% change to binaries in their application-preferred manner.

Please sign in to comment.