diff --git a/hazelcast/hash.py b/hazelcast/hash.py index 8b7e49cdf3..06b2167aa9 100644 --- a/hazelcast/hash.py +++ b/hazelcast/hash.py @@ -1,33 +1,20 @@ -import math +from hazelcast.serialization import LE_UINT from hazelcast.six.moves import range -def _fmix(h): - h ^= h >> 16 - h = (h * 0x85ebca6b) & 0xFFFFFFFF - h ^= h >> 13 - h = (h * 0xc2b2ae35) & 0xFFFFFFFF - h ^= h >> 16 - return h - - -def murmur_hash3_x86_32(data, offset, size, seed=0x01000193): +def murmur_hash3_x86_32(data): """murmur3 hash function to determine partition Args: - data (bytearray): Input byte array - offset (int): Offset. - size (int): Byte length. - seed (int): Murmur hash seed hazelcast uses 0x01000193. + data (bytearray or bytes): Input byte array Returns: int: Calculated hash value. """ - key = bytearray(data[offset: offset + size]) - length = len(key) - nblocks = int(length / 4) + length = max(len(data) - 8, 0) # Heap data overhead + nblocks = length // 4 - h1 = seed + h1 = 0x01000193 c1 = 0xcc9e2d51 c2 = 0x1b873593 @@ -35,10 +22,7 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193): # body for block_start in range(0, nblocks * 4, 4): # ??? big endian? - k1 = key[block_start + 3] << 24 | \ - key[block_start + 2] << 16 | \ - key[block_start + 1] << 8 | \ - key[block_start + 0] + k1 = LE_UINT.unpack_from(data, block_start + 8)[0] k1 = c1 * k1 & 0xFFFFFFFF k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF # inlined ROTL32 @@ -53,12 +37,13 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193): k1 = 0 tail_size = length & 3 + # Offsets below are shifted according to heap data overhead if tail_size >= 3: - k1 ^= key[tail_index + 2] << 16 + k1 ^= data[tail_index + 10] << 16 if tail_size >= 2: - k1 ^= key[tail_index + 1] << 8 + k1 ^= data[tail_index + 9] << 8 if tail_size >= 1: - k1 ^= key[tail_index + 0] + k1 ^= data[tail_index + 8] if tail_size != 0: k1 = (k1 * c1) & 0xFFFFFFFF @@ -66,12 +51,17 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193): k1 = (k1 * c2) & 0xFFFFFFFF h1 ^= k1 - result = _fmix(h1 ^ length) - return -(result & 0x80000000) | (result & 0x7FFFFFFF) + h1 ^= length + h1 ^= h1 >> 16 + h1 = (h1 * 0x85ebca6b) & 0xFFFFFFFF + h1 ^= h1 >> 13 + h1 = (h1 * 0xc2b2ae35) & 0xFFFFFFFF + h1 ^= h1 >> 16 + return -(h1 & 0x80000000) | (h1 & 0x7FFFFFFF) -def hash_to_index(hash, length): - if hash == 0x80000000: +def hash_to_index(mm_hash, length): + if mm_hash == 0x80000000: return 0 else: - return int(abs(math.fmod(hash, length))) + return abs(mm_hash) % length diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py index bbce691283..a127db8097 100644 --- a/hazelcast/serialization/data.py +++ b/hazelcast/serialization/data.py @@ -1,3 +1,4 @@ +from hazelcast import six from hazelcast.hash import murmur_hash3_x86_32 from hazelcast.serialization import BE_INT from hazelcast.serialization.serialization_const import * @@ -92,10 +93,19 @@ def hash_code(self): Returns: int: The murmur hash of the internal data. """ - return murmur_hash3_x86_32(self._buffer, DATA_OFFSET, self.data_size()) + return murmur_hash3_x86_32(self._buffer) def __hash__(self): - return self.hash_code() + # Data objects are used in NearCache as keys. + # When this method is called on Data objects + # received from the members, buffer is type + # of bytes instead of bytearray. Since bytes + # is an alias of str in Python2, we cannot + # use murmur hash directly on it. The + # conversion is necessary only on Python2 + if isinstance(self._buffer, bytearray) or six.PY3: + return murmur_hash3_x86_32(self._buffer) + return murmur_hash3_x86_32(bytearray(self._buffer)) def __eq__(self, other): return isinstance(other, Data) and self.total_size() == other.total_size() \ diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py index f8285eb5f7..860676afa0 100644 --- a/tests/murmur_hash_test.py +++ b/tests/murmur_hash_test.py @@ -5,20 +5,29 @@ class HashTest(unittest.TestCase): def test_hash(self): - expected = [ - (b"key-1", 1228513025, 107), - (b"key-2", 1503416236, 105), - (b"key-3", 1876349747, 218), - (b"key-4", -914632498, 181), - (b"key-5", -803210507, 111), - (b"key-6", -847942313, 115), - (b"key-7", 1196747334, 223), - (b"key-8", -1444149994, 208), - (b"key-9", 1182720020, 140), + expected = [ # Expected values are from the Java implementation + # 00000000 -> HEAP_DATA_OVERHEAD + (b"00000000key-1", 1228513025, 107), + (b"12345678key-1", 1228513025, 107), # Heap data overhead should not matter + (b"00000000key-2", 1503416236, 105), + (b"00000000key-3", 1876349747, 218), + (b"00000000key-4", -914632498, 181), + (b"00000000key-5", -803210507, 111), + (b"00000000key-6", -847942313, 115), + (b"00000000key-7", 1196747334, 223), + (b"00000000key-8", -1444149994, 208), + (b"00000000key-9", 1182720020, 140), + # Test with different lengths + (b"00000000", -1585187909, 238), + (b"00000000a", -1686100800, 46), + (b"00000000ab", 312914265, 50), + (b"00000000abc", -2068121803, 208), + (b"00000000abcd", -973615161, 236), + (b"", -1585187909, 238), ] - for key, hash, partition_id in expected: - h = murmur_hash3_x86_32(key, 0, len(key)) + for key, mm_hash, partition_id in expected: + h = murmur_hash3_x86_32(bytearray(key)) p = hash_to_index(h, 271) - self.assertEqual(h, hash) + self.assertEqual(h, mm_hash) self.assertEqual(p, partition_id)