From acafe384a3269b9b2f67ae5893fece919bf39d66 Mon Sep 17 00:00:00 2001 From: mdumandag Date: Wed, 4 Nov 2020 15:12:24 +0300 Subject: [PATCH 1/5] Optimize the hot path for the serialization of integer values For integers, unless the user configures the default int type, it will be a 4-byte integer. Therefore, we first check the default setting while looking for serializer for integer types. --- hazelcast/serialization/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hazelcast/serialization/base.py b/hazelcast/serialization/base.py index 273459a690..a2aa7823a8 100644 --- a/hazelcast/serialization/base.py +++ b/hazelcast/serialization/base.py @@ -11,7 +11,7 @@ from hazelcast import six -def empty_partitioning_strategy(key): +def empty_partitioning_strategy(_): return None @@ -85,6 +85,7 @@ def to_object(self, data): """ if not isinstance(data, Data): return data + if is_null_data(data): return None @@ -226,6 +227,7 @@ def serializer_for(self, obj): def lookup_default_serializer(self, obj_type, obj): if isinstance(obj, IdentifiedDataSerializable): return self._data_serializer + if isinstance(obj, Portable): return self._portable_serializer @@ -235,12 +237,12 @@ def lookup_default_serializer(self, obj_type, obj): type_id = None # LOCATE NUMERIC TYPES if obj_type in six.integer_types: - if self.int_type == IntType.BYTE: + if self.int_type == IntType.INT: + type_id = CONSTANT_TYPE_INTEGER + elif self.int_type == IntType.BYTE: type_id = CONSTANT_TYPE_BYTE elif self.int_type == IntType.SHORT: type_id = CONSTANT_TYPE_SHORT - elif self.int_type == IntType.INT: - type_id = CONSTANT_TYPE_INTEGER elif self.int_type == IntType.LONG: type_id = CONSTANT_TYPE_LONG elif self.int_type == IntType.BIG_INT: From e3b364121620341bd0b33da80303e48558bd3363 Mon Sep 17 00:00:00 2001 From: mdumandag Date: Wed, 4 Nov 2020 15:15:30 +0300 Subject: [PATCH 2/5] Optimize ObjectDataOutput#to_byte_array It is not necessary to create a new buffer on `to_byte_array` 
method before returning. `buffer[:pos]` already creates a copy. --- hazelcast/serialization/output.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hazelcast/serialization/output.py b/hazelcast/serialization/output.py index 24711e24d1..b8a79ff42d 100644 --- a/hazelcast/serialization/output.py +++ b/hazelcast/serialization/output.py @@ -121,9 +121,7 @@ def write_object(self, val): def to_byte_array(self): if self._buffer is None or self._pos == 0: return bytearray() - new_buffer = bytearray(self._pos) - new_buffer[:] = self._buffer[:self._pos] - return new_buffer + return self._buffer[:self._pos] def is_big_endian(self): return self._is_big_endian From 810a06618cb727ed27c35c3dfdb23a16d368fc6e Mon Sep 17 00:00:00 2001 From: mdumandag Date: Tue, 10 Nov 2020 16:13:09 +0300 Subject: [PATCH 3/5] Make use of the fact that Data's buffer is never None There were checks that assume Data's buffer might be None but this is not possible. Data might have a buffer of length 0, but it can never be None. This commit removes such checks. --- hazelcast/serialization/data.py | 26 ++++++++------------------ hazelcast/serialization/input.py | 1 - tests/data_test.py | 4 ---- tests/serialization/string_test.py | 8 ++------ 4 files changed, 10 insertions(+), 29 deletions(-) diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py index a127db8097..a8eca2f268 100644 --- a/hazelcast/serialization/data.py +++ b/hazelcast/serialization/data.py @@ -16,8 +16,8 @@ class Data(object): It stores binary form of an object serialized by serialization service. """ - def __init__(self, buff=None): - self._buffer = buff + def __init__(self, buf): + self._buffer = buf def to_bytes(self): """Returns byte array representation of internal binary format. @@ -33,7 +33,7 @@ def get_type(self): Returns: int: Serialization type of binary form. 
""" - if self.total_size() == 0: + if len(self._buffer) == 0: return CONSTANT_TYPE_NULL return BE_INT.unpack_from(self._buffer, TYPE_OFFSET)[0] @@ -43,7 +43,7 @@ def total_size(self): Returns: int: Total size of Data in bytes. """ - return len(self._buffer) if self._buffer is not None else 0 + return len(self._buffer) def data_size(self): """Returns size of internal binary data in bytes. @@ -51,7 +51,7 @@ def data_size(self): Returns: int: Size of internal binary data in bytes. """ - return max(self.total_size() - HEAP_DATA_OVERHEAD, 0) + return max(len(self._buffer) - HEAP_DATA_OVERHEAD, 0) def get_partition_hash(self): """Returns partition hash calculated for serialized object. @@ -64,8 +64,9 @@ def get_partition_hash(self): Returns: int: Partition hash. """ - if self.has_partition_hash(): - return BE_INT.unpack_from(self._buffer, PARTITION_HASH_OFFSET)[0] + partition_hash = BE_INT.unpack_from(self._buffer, PARTITION_HASH_OFFSET)[0] + if partition_hash != 0: + return partition_hash return self.hash_code() def is_portable(self): @@ -76,17 +77,6 @@ def is_portable(self): """ return CONSTANT_TYPE_PORTABLE == self.get_type() - def has_partition_hash(self): - """Determines whether this ``Data`` has partition hash or not. - - Returns: - bool: ``True`` if ``Data`` has partition hash, ``False`` otherwise. - - """ - return self._buffer is not None \ - and len(self._buffer) >= HEAP_DATA_OVERHEAD \ - and BE_INT.unpack_from(self._buffer, PARTITION_HASH_OFFSET)[0] != 0 - def hash_code(self): """Returns the murmur hash of the internal data. 
diff --git a/hazelcast/serialization/input.py b/hazelcast/serialization/input.py index e46a746b03..dcc68ca971 100644 --- a/hazelcast/serialization/input.py +++ b/hazelcast/serialization/input.py @@ -1,6 +1,5 @@ from hazelcast.serialization.api import * from hazelcast.serialization.bits import * -from hazelcast.serialization.data import Data from hazelcast import six from hazelcast.six.moves import range diff --git a/tests/data_test.py b/tests/data_test.py index d1071c6c38..e5e8153f5e 100644 --- a/tests/data_test.py +++ b/tests/data_test.py @@ -19,13 +19,9 @@ def test_data(self): self.assertEqual(self._total_size, self._data.total_size()) self.assertEqual(self._total_size - DATA_OFFSET, self._data.data_size()) self.assertEqual(0x01020304, self._data.get_type()) - self.assertTrue(self._data.has_partition_hash()) self.assertFalse(self._data.is_portable()) self.assertEqual(1545424565, self._data.hash_code()) self.assertEqual(0x12345678, self._data.get_partition_hash()) def test_data_len(self): self.assertEqual(10, len(Data("1"* 10))) - -if __name__ == '__main__': - unittest.main() diff --git a/tests/serialization/string_test.py b/tests/serialization/string_test.py index 5fe0cef9c8..7c055d9b0a 100644 --- a/tests/serialization/string_test.py +++ b/tests/serialization/string_test.py @@ -41,7 +41,7 @@ def test_ascii_encode(self): def test_ascii_decode(self): data_byte = to_data_byte(TEST_DATA_ASCII) - data = Data(buff=data_byte) + data = Data(data_byte) actual_ascii = self.service.to_object(data) self.assertEqual(TEST_DATA_ASCII, actual_ascii) @@ -54,7 +54,7 @@ def test_utf8_encode(self): def test_utf8_decode(self): data_byte = to_data_byte(TEST_DATA_ALL) - data = Data(buff=data_byte) + data = Data(data_byte) actual_ascii = self.service.to_object(data) self.assertEqual(TEST_DATA_ALL, actual_ascii) @@ -62,7 +62,3 @@ def test_None_str_encode_decode(self): none_str = self.service.to_data(None) decoded = self.service.to_object(none_str) self.assertIsNone(decoded) - - -if 
__name__ == '__main__': - unittest.main() From 61a0f8f6d41387ceb89b59c5cca716f4a490b418 Mon Sep 17 00:00:00 2001 From: mdumandag Date: Wed, 11 Nov 2020 10:12:56 +0300 Subject: [PATCH 4/5] Make use of the fact that DataOutput's buffer is never None We initialize the buffer at the constructor and never set it to None on any of the code paths. Therefore, None checks are not necessary. --- hazelcast/serialization/output.py | 18 ++++++------------ tests/serialization/serialization_test.py | 4 ++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/hazelcast/serialization/output.py b/hazelcast/serialization/output.py index b8a79ff42d..56886c635f 100644 --- a/hazelcast/serialization/output.py +++ b/hazelcast/serialization/output.py @@ -119,8 +119,6 @@ def write_object(self, val): self._service.write_object(self, val) def to_byte_array(self): - if self._buffer is None or self._pos == 0: - return bytearray() return self._buffer[:self._pos] def is_big_endian(self): @@ -146,18 +144,14 @@ def _write_array_fnc(self, val, item_write_fnc): def _ensure_available(self, length): if self._available() < length: - if self._buffer is not None: - buffer_length = len(self._buffer) - new_length = max(buffer_length << 1, buffer_length + length) - new_buffer = bytearray(new_length) - new_buffer[:self._pos] = self._buffer[:self._pos] - self._buffer = new_buffer - else: - new_length = length * 2 if length > self._init_size // 2 else self._init_size - self._buffer = bytearray(new_length) + buffer_length = len(self._buffer) + new_length = max(buffer_length << 1, buffer_length + length) + new_buffer = bytearray(new_length) + new_buffer[:self._pos] = self._buffer[:self._pos] + self._buffer = new_buffer def _available(self): - return len(self._buffer) - self._pos if self._buffer is not None else 0 + return len(self._buffer) - self._pos def __repr__(self): from binascii import hexlify diff --git a/tests/serialization/serialization_test.py b/tests/serialization/serialization_test.py index 
d03f365cc0..4adeda22fd 100644 --- a/tests/serialization/serialization_test.py +++ b/tests/serialization/serialization_test.py @@ -63,7 +63,7 @@ def test_python_pickle_serialization_with_super_type(self): self.assertEqual(obj, obj2) def test_null_data(self): - data = Data() + data = Data(bytearray(0)) obj = self.service.to_object(data) self.assertIsNone(obj) @@ -73,7 +73,7 @@ def test_none_serialize(self): self.assertIsNone(data) def test_serialize_data(self): - data = Data() + data = Data(bytearray(0)) obj = self.service.to_data(data) self.assertTrue(isinstance(obj, Data)) From 7f8454a10b442c770475036748caa54629cf5cd0 Mon Sep 17 00:00:00 2001 From: mdumandag Date: Thu, 19 Nov 2020 12:07:47 +0300 Subject: [PATCH 5/5] Determine type id at the serialization service constructor There is no need to check for int type id on each lookup_default_serializer call. However, this is required for VAR size option. --- hazelcast/serialization/base.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/hazelcast/serialization/base.py b/hazelcast/serialization/base.py index a2aa7823a8..c70edeb1fe 100644 --- a/hazelcast/serialization/base.py +++ b/hazelcast/serialization/base.py @@ -11,6 +11,15 @@ from hazelcast import six +_int_type_to_type_id = { + IntType.BYTE: CONSTANT_TYPE_BYTE, + IntType.SHORT: CONSTANT_TYPE_SHORT, + IntType.INT: CONSTANT_TYPE_INTEGER, + IntType.LONG: CONSTANT_TYPE_LONG, + IntType.BIG_INT: JAVA_DEFAULT_TYPE_BIG_INTEGER, +} + + def empty_partitioning_strategy(_): return None @@ -164,6 +173,7 @@ def __init__(self, int_type): self._registration_lock = RLock() self.int_type = int_type + self._int_type_id = _int_type_to_type_id.get(int_type, None) def serializer_by_type_id(self, type_id): """Find and return the serializer for the type-id @@ -234,20 +244,11 @@ def lookup_default_serializer(self, obj_type, obj): if isinstance(obj, six.string_types): return self.serializer_by_type_id(CONSTANT_TYPE_STRING) - type_id = None 
# LOCATE NUMERIC TYPES if obj_type in six.integer_types: - if self.int_type == IntType.INT: - type_id = CONSTANT_TYPE_INTEGER - elif self.int_type == IntType.BYTE: - type_id = CONSTANT_TYPE_BYTE - elif self.int_type == IntType.SHORT: - type_id = CONSTANT_TYPE_SHORT - elif self.int_type == IntType.LONG: - type_id = CONSTANT_TYPE_LONG - elif self.int_type == IntType.BIG_INT: - type_id = JAVA_DEFAULT_TYPE_BIG_INTEGER - elif self.int_type == IntType.VAR: + type_id = self._int_type_id + if type_id is None: + # VAR size if MIN_BYTE <= obj <= MAX_BYTE: type_id = CONSTANT_TYPE_BYTE elif MIN_SHORT <= obj <= MAX_SHORT: @@ -258,8 +259,8 @@ def lookup_default_serializer(self, obj_type, obj): type_id = CONSTANT_TYPE_LONG else: type_id = JAVA_DEFAULT_TYPE_BIG_INTEGER - if type_id: - return self.serializer_by_type_id(type_id) + + return self.serializer_by_type_id(type_id) return self._constant_type_dict.get(obj_type, None)