From 7c9da66c5dfda378850afc70ab5970fb1aeb7fcc Mon Sep 17 00:00:00 2001
From: mdumandag
Date: Tue, 3 Nov 2020 11:27:03 +0300
Subject: [PATCH 1/2] Optimize murmur hash calculation

We were always passing a bytes-like object to the murmur hash function,
but we were copying it starting from the HEAP_DATA_OVERHEAD (8) offset,
and that copy can be huge. I removed that copy and adjusted the offsets
according to our HEAP_DATA_OVERHEAD.

Another parameter of the function was the data size. For valid Data
objects, this is always equal to `len(buffer) - HEAP_DATA_OVERHEAD`.
This parameter is removed, and the length is now calculated inside the
murmur hash function.

Also, inlined the _fmix function call and removed the unnecessary
method calls in the hash_to_index function.
---
 hazelcast/hash.py               | 50 +++++++++++++--------------------
 hazelcast/serialization/data.py |  2 +-
 tests/murmur_hash_test.py       | 24 ++++++++--------
 3 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/hazelcast/hash.py b/hazelcast/hash.py
index 8b7e49cdf3..07cf43fb4f 100644
--- a/hazelcast/hash.py
+++ b/hazelcast/hash.py
@@ -1,33 +1,20 @@
-import math
+from hazelcast.serialization import LE_UINT
 from hazelcast.six.moves import range
 
 
-def _fmix(h):
-    h ^= h >> 16
-    h = (h * 0x85ebca6b) & 0xFFFFFFFF
-    h ^= h >> 13
-    h = (h * 0xc2b2ae35) & 0xFFFFFFFF
-    h ^= h >> 16
-    return h
-
-
-def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
+def murmur_hash3_x86_32(data):
     """murmur3 hash function to determine partition
 
     Args:
-        data (bytearray): Input byte array
-        offset (int): Offset.
-        size (int): Byte length.
-        seed (int): Murmur hash seed hazelcast uses 0x01000193.
+        data (bytearray or bytes): Input byte array
 
     Returns:
         int: Calculated hash value.
     """
-    key = bytearray(data[offset: offset + size])
-    length = len(key)
+    length = len(data) - 8  # Heap data overhead
     nblocks = int(length / 4)
 
-    h1 = seed
+    h1 = 0x01000193
 
     c1 = 0xcc9e2d51
     c2 = 0x1b873593
@@ -35,10 +22,7 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
     # body
     for block_start in range(0, nblocks * 4, 4):
         # ??? big endian?
-        k1 = key[block_start + 3] << 24 | \
-             key[block_start + 2] << 16 | \
-             key[block_start + 1] << 8 | \
-             key[block_start + 0]
+        k1 = LE_UINT.unpack_from(data, block_start + 8)[0]
 
         k1 = c1 * k1 & 0xFFFFFFFF
         k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF  # inlined ROTL32
@@ -53,12 +37,13 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
     k1 = 0
     tail_size = length & 3
 
+    # offsets below are shifted according to Heap data offset
     if tail_size >= 3:
-        k1 ^= key[tail_index + 2] << 16
+        k1 ^= data[tail_index + 10] << 16
     if tail_size >= 2:
-        k1 ^= key[tail_index + 1] << 8
+        k1 ^= data[tail_index + 9] << 8
     if tail_size >= 1:
-        k1 ^= key[tail_index + 0]
+        k1 ^= data[tail_index + 8]
 
     if tail_size != 0:
         k1 = (k1 * c1) & 0xFFFFFFFF
@@ -66,12 +51,17 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
         k1 = (k1 * c2) & 0xFFFFFFFF
         h1 ^= k1
 
-    result = _fmix(h1 ^ length)
-    return -(result & 0x80000000) | (result & 0x7FFFFFFF)
+    h1 ^= length
+    h1 ^= h1 >> 16
+    h1 = (h1 * 0x85ebca6b) & 0xFFFFFFFF
+    h1 ^= h1 >> 13
+    h1 = (h1 * 0xc2b2ae35) & 0xFFFFFFFF
+    h1 ^= h1 >> 16
+    return -(h1 & 0x80000000) | (h1 & 0x7FFFFFFF)
 
 
-def hash_to_index(hash, length):
-    if hash == 0x80000000:
+def hash_to_index(mm_hash, length):
+    if mm_hash == 0x80000000:
         return 0
     else:
-        return int(abs(math.fmod(hash, length)))
+        return abs(mm_hash) % length
diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py
index bbce691283..03e277e76e 100644
--- a/hazelcast/serialization/data.py
+++ b/hazelcast/serialization/data.py
@@ -92,7 +92,7 @@ def hash_code(self):
         Returns:
             int: The murmur hash of the internal data.
         """
-        return murmur_hash3_x86_32(self._buffer, DATA_OFFSET, self.data_size())
+        return murmur_hash3_x86_32(self._buffer)
 
     def __hash__(self):
         return self.hash_code()
diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py
index f8285eb5f7..1fcdcc647b 100644
--- a/tests/murmur_hash_test.py
+++ b/tests/murmur_hash_test.py
@@ -6,19 +6,19 @@ class HashTest(unittest.TestCase):
     def test_hash(self):
         expected = [
-            (b"key-1", 1228513025, 107),
-            (b"key-2", 1503416236, 105),
-            (b"key-3", 1876349747, 218),
-            (b"key-4", -914632498, 181),
-            (b"key-5", -803210507, 111),
-            (b"key-6", -847942313, 115),
-            (b"key-7", 1196747334, 223),
-            (b"key-8", -1444149994, 208),
-            (b"key-9", 1182720020, 140),
+            (b"00000000key-1", 1228513025, 107),
+            (b"00000000key-2", 1503416236, 105),
+            (b"00000000key-3", 1876349747, 218),
+            (b"00000000key-4", -914632498, 181),
+            (b"00000000key-5", -803210507, 111),
+            (b"00000000key-6", -847942313, 115),
+            (b"00000000key-7", 1196747334, 223),
+            (b"00000000key-8", -1444149994, 208),
+            (b"00000000key-9", 1182720020, 140),
         ]
 
-        for key, hash, partition_id in expected:
-            h = murmur_hash3_x86_32(key, 0, len(key))
+        for key, mm_hash, partition_id in expected:
+            h = murmur_hash3_x86_32(key)
             p = hash_to_index(h, 271)
-            self.assertEqual(h, hash)
+            self.assertEqual(h, mm_hash)
             self.assertEqual(p, partition_id)

From 1429b01bc2729d3bc14ebea6e0755b9a766627c6 Mon Sep 17 00:00:00 2001
From: mdumandag
Date: Tue, 10 Nov 2020 13:57:36 +0300
Subject: [PATCH 2/2] Python2 related fixes

Fix the `__hash__` method of the `Data` class to convert `bytes` to
`bytearray` if it is executing on Python2, since we cannot use the
murmur hash on `str` (alias of `bytes` on Python2) objects. This extra
if check is not necessary in the `hash_code` method since it is only
called on the Data objects to be sent by the client, which will have a
bytearray buffer.
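To illustrate the difference, here is a minimal sketch (not part of the
patch; the variable names are made up for the example):

    buf = b"\x0b\x16\x2c"

    # Python 2: bytes is an alias of str, so buf[0] is the one-character
    # string '\x0b' and expressions such as buf[0] << 16 in the hash body
    # raise a TypeError.
    # Python 3: buf[0] is the int 11, so the shifts and XORs work directly.
    first = buf[0]

    # Indexing a bytearray yields ints on both versions, which is why the
    # buffer is converted before hashing on Python 2.
    first_compat = bytearray(buf)[0]  # 11 on both Python 2 and Python 3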
Also, extra test cases are added.
---
 hazelcast/hash.py               |  6 +++---
 hazelcast/serialization/data.py | 12 +++++++++++-
 tests/murmur_hash_test.py       | 13 +++++++++++--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/hazelcast/hash.py b/hazelcast/hash.py
index 07cf43fb4f..06b2167aa9 100644
--- a/hazelcast/hash.py
+++ b/hazelcast/hash.py
@@ -11,8 +11,8 @@ def murmur_hash3_x86_32(data):
     Returns:
         int: Calculated hash value.
     """
-    length = len(data) - 8  # Heap data overhead
-    nblocks = int(length / 4)
+    length = max(len(data) - 8, 0)  # Heap data overhead
+    nblocks = length // 4
 
     h1 = 0x01000193
 
@@ -37,7 +37,7 @@ def murmur_hash3_x86_32(data):
     k1 = 0
     tail_size = length & 3
 
-    # offsets below are shifted according to Heap data offset
+    # Offsets below are shifted according to heap data overhead
     if tail_size >= 3:
         k1 ^= data[tail_index + 10] << 16
     if tail_size >= 2:
diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py
index 03e277e76e..a127db8097 100644
--- a/hazelcast/serialization/data.py
+++ b/hazelcast/serialization/data.py
@@ -1,3 +1,4 @@
+from hazelcast import six
 from hazelcast.hash import murmur_hash3_x86_32
 from hazelcast.serialization import BE_INT
 from hazelcast.serialization.serialization_const import *
@@ -95,7 +96,16 @@ def hash_code(self):
         return murmur_hash3_x86_32(self._buffer)
 
     def __hash__(self):
-        return self.hash_code()
+        # Data objects are used in the NearCache as keys.
+        # When this method is called on Data objects
+        # received from the members, the buffer is of
+        # type bytes instead of bytearray. Since bytes
+        # is an alias of str in Python2, we cannot
+        # use murmur hash directly on it. The
+        # conversion is necessary only on Python2.
+        if isinstance(self._buffer, bytearray) or six.PY3:
+            return murmur_hash3_x86_32(self._buffer)
+        return murmur_hash3_x86_32(bytearray(self._buffer))
 
     def __eq__(self, other):
         return isinstance(other, Data) and self.total_size() == other.total_size() \
diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py
index 1fcdcc647b..860676afa0 100644
--- a/tests/murmur_hash_test.py
+++ b/tests/murmur_hash_test.py
@@ -5,8 +5,10 @@ class HashTest(unittest.TestCase):
     def test_hash(self):
-        expected = [
+        expected = [  # Expected values are from the Java implementation
+            # 00000000 -> HEAP_DATA_OVERHEAD
             (b"00000000key-1", 1228513025, 107),
+            (b"12345678key-1", 1228513025, 107),  # Heap data overhead should not matter
             (b"00000000key-2", 1503416236, 105),
             (b"00000000key-3", 1876349747, 218),
             (b"00000000key-4", -914632498, 181),
@@ -15,10 +17,17 @@ def test_hash(self):
             (b"00000000key-7", 1196747334, 223),
             (b"00000000key-8", -1444149994, 208),
             (b"00000000key-9", 1182720020, 140),
+            # Test with different lengths
+            (b"00000000", -1585187909, 238),
+            (b"00000000a", -1686100800, 46),
+            (b"00000000ab", 312914265, 50),
+            (b"00000000abc", -2068121803, 208),
+            (b"00000000abcd", -973615161, 236),
+            (b"", -1585187909, 238),
         ]
 
         for key, mm_hash, partition_id in expected:
-            h = murmur_hash3_x86_32(key)
+            h = murmur_hash3_x86_32(bytearray(key))
             p = hash_to_index(h, 271)
             self.assertEqual(h, mm_hash)
             self.assertEqual(p, partition_id)
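For reference, a minimal usage sketch of the optimized functions after both
patches; the expected values are taken from the test data above, with the 271
partitions used in the tests:

    from hazelcast.hash import murmur_hash3_x86_32, hash_to_index

    # The hash function now receives the whole Data buffer: the 8-byte heap
    # data overhead followed by the serialized payload. Only the bytes after
    # the overhead influence the result, so b"00000000" merely stands in for
    # the real header bytes here.
    buf = bytearray(b"00000000key-1")

    mm_hash = murmur_hash3_x86_32(buf)          # 1228513025
    partition_id = hash_to_index(mm_hash, 271)  # 107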