Skip to content

Commit

Permalink
Improve binary and compact binary handling
Browse files Browse the repository at this point in the history
I believe this improves [1] which addresses a bytes/str inconsistency issue
reported some time ago[2]. The goal of this patch is to have an option
to always deseriaize binary fields as bytes and string fields as str (on
Python 2, respectively: str and unicode). It achieves it by adding a
third option to previously purely boolean decode_response parameter (I
admit I'm not a fan of mixing types like this but I'm doing this
strictly for backwards compatibility and I'd argue the "auto" behavior
should become the default in the future and decode_response should be
removed altogether).

Internally the goal of this change is achieved by keeping the ttype of
binary fields as STRING but putting a non-None value in their
"container spec" which introduces a bit of inconsistency (string fields
are TType.STRING but binary fields are (TType.STRING, BINARY)) but
I believe we can live with it.

I've renamed and reorganized parts of the code in order for the names to
be reasonably truthful and to avoid too much code redundancy connected
to the fact that I've added more tests. Ideally some of the test code
would be shared so reduce the redundancy even further but I feel that
it's out of scope here. Similarly there's some almost identical code in
Python and Cython protocol implementations but I lack expertise
regarding possible ways to mitigate that.

Casting of values in the parser is fixed as a bonus, the following code
never worked because TType.STRING and TType.BINARY had the same value:

     if t == TType.STRING:
         return _cast_string
     if t == TType.BINARY:
         return _cast_binary

[1] Thriftpy@00c6553
[2] Thriftpy#190
  • Loading branch information
jstasiak committed May 17, 2016
1 parent 5aa0b90 commit c25b67d
Show file tree
Hide file tree
Showing 11 changed files with 209 additions and 107 deletions.
38 changes: 38 additions & 0 deletions docs/index.rst
Expand Up @@ -173,6 +173,44 @@ You can also use `from ... import ...` style after a standard module load.
>>> from addressbook_thrift import *
Binaries/bytes vs. strings/text
===============================

It's important to distinguish between arbitrary arrays of bytes (str
in Python 2, bytes in Python 3) and text (unicode in Python 2, str in
python 3). ThriftPy deals with this as follows:

When serializing - both string and binary fields accept bytes and unicode.
If unicode is passed it will be converted to bytes using UTF-8 encoding.

When deserializing - there's `decode_response` constructor parameter
present in binary and compact protocols and their factories that controls
ThriftPy's behavior in this regard. The default `decode_respone` value is
`True`. The complete list of field type and `decode_response` value
combinations is listed here:

string fields
`````````````

* `decode_response` set to True - the value is returned as text if it's
possible to decode it using UTF-8 or as bytes otherwise

* `decode_response` set to False - the value is always returned as bytes

* `decode_response` set to 'auto' - the value is decoded using UTF-8 and
returned as text, if decoding fails an exception is raised

binary fields
`````````````

* `decode_response` set to True - the value is returned as text if it's
possible to decode it using UTF-8 or as bytes otherwise

* `decode_response` set to False - the value is always returned as bytes

* `decode_response` set to 'auto' - the value is always returned as bytes


Benchmarks
==========

Expand Down
11 changes: 6 additions & 5 deletions tests/storm.py
Expand Up @@ -5,6 +5,7 @@
"""

from thriftpy.thrift import (
BINARY,
TPayload,
TException,
TType,
Expand All @@ -16,7 +17,7 @@ class JavaObjectArg(TPayload):
2: (TType.I64, 'long_arg', False),
3: (TType.STRING, 'string_arg', False),
4: (TType.BOOL, 'bool_arg', False),
5: (TType.BINARY, 'binary_arg', False),
5: (TType.STRING, 'binary_arg', BINARY, False),
6: (TType.DOUBLE, 'double_arg', False)}


Expand All @@ -42,7 +43,7 @@ class Grouping(TPayload):
4: (TType.STRUCT, 'none', NullStruct),
5: (TType.STRUCT, 'direct', NullStruct),
6: (TType.STRUCT, 'custom_object', JavaObject),
7: (TType.BINARY, 'custom_serialized'),
7: (TType.STRING, 'custom_serialized', BINARY),
8: (TType.STRUCT, 'local_or_shuffle', NullStruct)}


Expand All @@ -57,7 +58,7 @@ class ShellComponent(TPayload):


class ComponentObject(TPayload):
thrift_spec = {1: (TType.BINARY, 'serialized_java'),
thrift_spec = {1: (TType.STRING, 'serialized_java', BINARY),
2: (TType.STRUCT, 'shell', ShellComponent),
3: (TType.STRUCT, 'java_object', JavaObject)}

Expand Down Expand Up @@ -301,7 +302,7 @@ class beginFileUpload_result(TPayload):

class uploadChunk_args(TPayload):
thrift_spec = {1: (TType.STRING, 'location'),
2: (TType.BINARY, 'chunk')}
2: (TType.STRING, 'chunk', BINARY)}

class uploadChunk_result(TPayload):
thrift_spec = {}
Expand All @@ -322,7 +323,7 @@ class downloadChunk_args(TPayload):
thrift_spec = {1: (TType.STRING, 'id')}

class downloadChunk_result(TPayload):
thrift_spec = {0: (TType.BINARY, 'success')}
thrift_spec = {0: (TType.STRING, 'success', BINARY)}

class getNimbusConf_args(TPayload):
thrift_spec = {}
Expand Down
56 changes: 35 additions & 21 deletions tests/test_protocol_binary.py
Expand Up @@ -2,8 +2,10 @@

from io import BytesIO

import pytest

from thriftpy._compat import u
from thriftpy.thrift import TType, TPayload
from thriftpy.thrift import BINARY, TType, TPayload
from thriftpy.utils import hexlify
from thriftpy.protocol import binary as proto

Expand Down Expand Up @@ -71,29 +73,41 @@ def test_unpack_double():
assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)


def test_pack_string():
@pytest.mark.parametrize(
"spec,data,expected_result",
[
(None, "hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(BINARY, b"hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(None, u("你好世界"),
"00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_pack_string_or_binary(spec, data, expected_result):
b = BytesIO()
proto.write_val(b, TType.STRING, "hello world!")
assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b = BytesIO()
proto.write_val(b, TType.STRING, u("你好世界"))
assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b.getvalue())


def test_unpack_string():
proto.write_val(b, TType.STRING, data, spec)
assert expected_result == hexlify(b.getvalue())


@pytest.mark.parametrize(
"spec,decode_response,expecting_text",
[
(None, True, True),
(None, False, False),
(None, "auto", True),
(BINARY, True, True),
(BINARY, False, False),
(BINARY, "auto", False),
],
)
def test_read_string_or_binary(spec, decode_response, expecting_text):
b = BytesIO(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界") == proto.read_val(b, TType.STRING)


def test_unpack_binary():
bs = BytesIO(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界").encode("utf-8") == proto.read_val(
bs, TType.STRING, decode_response=False)
text = u("你好世界")
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(
b, TType.STRING, spec, decode_response=decode_response)


def test_write_message_begin():
Expand Down
51 changes: 33 additions & 18 deletions tests/test_protocol_compact.py
Expand Up @@ -2,8 +2,10 @@

from io import BytesIO

import pytest

from thriftpy._compat import u
from thriftpy.thrift import TType, TPayload
from thriftpy.thrift import BINARY, TType, TPayload
from thriftpy.utils import hexlify
from thriftpy.protocol import compact

Expand Down Expand Up @@ -85,34 +87,47 @@ def test_unpack_double():
assert 1234567890.1234567890 == proto.read_val(TType.DOUBLE)


def test_pack_string():
@pytest.mark.parametrize(
"spec,data,expected_result",
[
(None, "hello world!",
"0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(BINARY, b"hello world!",
"0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(None, u("你好世界"),
"0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_pack_string_or_binary(spec, data, expected_result):
b, proto = gen_proto()
proto.write_val(TType.STRING, "hello world!")
assert "0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b1, proto1 = gen_proto()
proto1.write_val(TType.STRING, "你好世界")
assert "0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b1.getvalue())
proto.write_val(TType.STRING, data, spec)
assert expected_result == hexlify(b.getvalue())


def test_unpack_string():
b, proto = gen_proto(b"\x0c\x68\x65\x6c\x6c\x6f"
b"\x20\x77\x6f\x72\x6c\x64\x21")
assert u('hello world!') == proto.read_val(TType.STRING)

b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
assert u('你好世界') == proto.read_val(TType.STRING)


def test_unpack_binary():
@pytest.mark.parametrize(
"spec,decode_response,expecting_text",
[
(None, True, True),
(None, False, False),
(None, "auto", True),
(BINARY, True, True),
(BINARY, False, False),
(BINARY, "auto", False),
],
)
def test_unpack_string_or_binary(spec, decode_response, expecting_text):
b, proto = gen_proto(b'\x0c\xe4\xbd\xa0\xe5\xa5'
b'\xbd\xe4\xb8\x96\xe7\x95\x8c')
proto.decode_response = False

assert u('你好世界').encode("utf-8") == proto.read_val(TType.STRING)
proto.decode_response = decode_response
text = u('你好世界')
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(TType.STRING, spec)


def test_pack_bool():
Expand Down
55 changes: 33 additions & 22 deletions tests/test_protocol_cybinary.py
Expand Up @@ -8,7 +8,7 @@
import pytest

from thriftpy._compat import u
from thriftpy.thrift import TType, TPayload, TDecodeException
from thriftpy.thrift import BINARY, TType, TPayload, TDecodeException
from thriftpy.transport import TSocket, TServerSocket
from thriftpy.utils import hexlify

Expand Down Expand Up @@ -120,31 +120,42 @@ def test_read_double():
assert 1234567890.1234567890 == proto.read_val(b, TType.DOUBLE)


def test_write_string():
@pytest.mark.parametrize(
"spec,data,expected_result",
[
(None, "hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(BINARY, b"hello world!",
"00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21"),
(None, u("你好世界"),
"00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c"),
],
)
def test_write_string_or_binary(spec, data, expected_result):
b = TCyMemoryBuffer()
proto.write_val(b, TType.STRING, "hello world!")
proto.write_val(b, TType.STRING, data, spec)
b.flush()
assert "00 00 00 0c 68 65 6c 6c 6f 20 77 6f 72 6c 64 21" == \
hexlify(b.getvalue())

b = TCyMemoryBuffer()
proto.write_val(b, TType.STRING, u("你好世界"))
b.flush()
assert "00 00 00 0c e4 bd a0 e5 a5 bd e4 b8 96 e7 95 8c" == \
hexlify(b.getvalue())


def test_read_string():
b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界") == proto.read_val(b, TType.STRING)


def test_read_binary():
assert expected_result == hexlify(b.getvalue())


@pytest.mark.parametrize(
"spec,decode_response,expecting_text",
[
(None, True, True),
(None, False, False),
(None, "auto", True),
(BINARY, True, True),
(BINARY, False, False),
(BINARY, "auto", False),
],
)
def test_read_string_or_binary(spec, decode_response, expecting_text):
b = TCyMemoryBuffer(b"\x00\x00\x00\x0c"
b"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c")
assert u("你好世界").encode("utf-8") == proto.read_val(
b, TType.STRING, decode_response=False)
text = u("你好世界")
expected = text if expecting_text else text.encode('utf-8')
assert expected == proto.read_val(
b, TType.STRING, spec, decode_response=decode_response)


def test_write_message_begin():
Expand Down
17 changes: 11 additions & 6 deletions thriftpy/parser/parser.py
Expand Up @@ -11,10 +11,12 @@
import os
import sys
import types

from ply import lex, yacc
from .lexer import * # noqa
from .exc import ThriftParserError, ThriftGrammerError
from ..thrift import gen_init, TType, TPayload, TException
from .._compat import text_type
from ..thrift import BINARY, gen_init, TType, TPayload, TException


def p_error(p):
Expand Down Expand Up @@ -394,7 +396,7 @@ def p_base_type(p): # noqa
if p[1] == 'string':
p[0] = TType.STRING
if p[1] == 'binary':
p[0] = TType.BINARY
p[0] = TType.STRING, BINARY


def p_container_type(p):
Expand Down Expand Up @@ -574,7 +576,7 @@ def _parse_seq(p):
p[0] = []


def _cast(t): # noqa
def _cast(t, spec=None): # noqa
if t == TType.BOOL:
return _cast_bool
if t == TType.BYTE:
Expand All @@ -589,7 +591,8 @@ def _cast(t): # noqa
return _cast_double
if t == TType.STRING:
return _cast_string
if t == TType.BINARY:
if t[0] == TType.STRING:
# t[1] will be BINARY here
return _cast_binary
if t[0] == TType.LIST:
return _cast_list(t)
Expand Down Expand Up @@ -634,12 +637,14 @@ def _cast_double(v):


def _cast_string(v):
assert isinstance(v, str)
if isinstance(v, bytes):
v = v.decode('utf-8')
assert isinstance(v, text_type)
return v


def _cast_binary(v):
assert isinstance(v, str)
assert isinstance(v, bytes)
return v


Expand Down

0 comments on commit c25b67d

Please sign in to comment.