hazelcast · mdumandag · Nov 11, 2020 · Nov 3, 2020 · Nov 10, 2020
diff --git a/hazelcast/hash.py b/hazelcast/hash.py
@@ -1,44 +1,28 @@
-import math
+from hazelcast.serialization import LE_UINT
 from hazelcast.six.moves import range
 
 
-def _fmix(h):
-    h ^= h >> 16
-    h = (h * 0x85ebca6b) & 0xFFFFFFFF
-    h ^= h >> 13
-    h = (h * 0xc2b2ae35) & 0xFFFFFFFF
-    h ^= h >> 16
-    return h
-
-
-def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
+def murmur_hash3_x86_32(data):
     """murmur3 hash function to determine partition
 
     Args:
-        data (bytearray): Input byte array
-        offset (int): Offset.
-        size (int): Byte length.
-        seed (int): Murmur hash seed hazelcast uses 0x01000193.
+        data (bytearray or bytes): Input byte array
 
     Returns:
         int: Calculated hash value.
     """
-    key = bytearray(data[offset: offset + size])
-    length = len(key)
-    nblocks = int(length / 4)
+    length = max(len(data) - 8, 0)  # Heap data overhead
+    nblocks = length // 4
 
-    h1 = seed
+    h1 = 0x01000193
 
     c1 = 0xcc9e2d51
     c2 = 0x1b873593
 
     # body
     for block_start in range(0, nblocks * 4, 4):
         # ??? big endian?
-        k1 = key[block_start + 3] << 24 | \
-             key[block_start + 2] << 16 | \
-             key[block_start + 1] << 8 | \
-             key[block_start + 0]
+        k1 = LE_UINT.unpack_from(data, block_start + 8)[0]
 
         k1 = c1 * k1 & 0xFFFFFFFF
         k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF  # inlined ROTL32
@@ -53,25 +37,31 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
     k1 = 0
     tail_size = length & 3
 
+    # Offsets below are shifted according to heap data overhead
     if tail_size >= 3:
-        k1 ^= key[tail_index + 2] << 16
+        k1 ^= data[tail_index + 10] << 16
     if tail_size >= 2:
-        k1 ^= key[tail_index + 1] << 8
+        k1 ^= data[tail_index + 9] << 8
     if tail_size >= 1:
-        k1 ^= key[tail_index + 0]
+        k1 ^= data[tail_index + 8]
 
     if tail_size != 0:
         k1 = (k1 * c1) & 0xFFFFFFFF
         k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF  # _ROTL32
         k1 = (k1 * c2) & 0xFFFFFFFF
         h1 ^= k1
 
-    result = _fmix(h1 ^ length)
-    return -(result & 0x80000000) | (result & 0x7FFFFFFF)
+    h1 ^= length
+    h1 ^= h1 >> 16
+    h1 = (h1 * 0x85ebca6b) & 0xFFFFFFFF
+    h1 ^= h1 >> 13
+    h1 = (h1 * 0xc2b2ae35) & 0xFFFFFFFF
+    h1 ^= h1 >> 16
+    return -(h1 & 0x80000000) | (h1 & 0x7FFFFFFF)
 
 
-def hash_to_index(hash, length):
-    if hash == 0x80000000:
+def hash_to_index(mm_hash, length):
+    if abs(mm_hash) == 0x80000000:  # 32-bit minimum; the murmur hash returns it signed, as -0x80000000
         return 0
     else:
-        return int(abs(math.fmod(hash, length)))
+        return abs(mm_hash) % length
diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py
@@ -1,3 +1,4 @@
+from hazelcast import six
 from hazelcast.hash import murmur_hash3_x86_32
 from hazelcast.serialization import BE_INT
 from hazelcast.serialization.serialization_const import *
@@ -92,10 +93,19 @@ def hash_code(self):
         Returns:
             int: The murmur hash of the internal data.
         """
-        return murmur_hash3_x86_32(self._buffer, DATA_OFFSET, self.data_size())
+        return murmur_hash3_x86_32(self._buffer)
 
     def __hash__(self):
-        return self.hash_code()
+        # Data instances serve as NearCache keys. A
+        # Data received from a member wraps a bytes
+        # buffer, not a bytearray, and on Python 2
+        # bytes is just str, which the murmur hash
+        # cannot process directly. Only that case
+        # needs a bytearray copy before hashing.
+        buf = self._buffer
+        if not (six.PY3 or isinstance(buf, bytearray)):
+            buf = bytearray(buf)
+        return murmur_hash3_x86_32(buf)
 
     def __eq__(self, other):
         return isinstance(other, Data) and self.total_size() == other.total_size() \

diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py
@@ -5,20 +5,29 @@
 
 class HashTest(unittest.TestCase):
     def test_hash(self):
-        expected = [
-            (b"key-1", 1228513025, 107),
-            (b"key-2", 1503416236, 105),
-            (b"key-3", 1876349747, 218),
-            (b"key-4", -914632498, 181),
-            (b"key-5", -803210507, 111),
-            (b"key-6", -847942313, 115),
-            (b"key-7", 1196747334, 223),
-            (b"key-8", -1444149994, 208),
-            (b"key-9", 1182720020, 140),
+        expected = [  # (data, hash, partition id); values are from the Java implementation
+            # The leading "00000000" is a placeholder for the 8-byte heap data overhead
+            (b"00000000key-1", 1228513025, 107),
+            (b"12345678key-1", 1228513025, 107),  # Contents of the overhead bytes must not affect the hash
+            (b"00000000key-2", 1503416236, 105),
+            (b"00000000key-3", 1876349747, 218),
+            (b"00000000key-4", -914632498, 181),
+            (b"00000000key-5", -803210507, 111),
+            (b"00000000key-6", -847942313, 115),
+            (b"00000000key-7", 1196747334, 223),
+            (b"00000000key-8", -1444149994, 208),
+            (b"00000000key-9", 1182720020, 140),
+            # Payload lengths 0 to 4 cover every tail size and one full 4-byte block
+            (b"00000000", -1585187909, 238),
+            (b"00000000a", -1686100800, 46),
+            (b"00000000ab", 312914265, 50),
+            (b"00000000abc", -2068121803, 208),
+            (b"00000000abcd", -973615161, 236),
+            (b"", -1585187909, 238),  # Shorter than the overhead: hashed as an empty payload
         ]
 
-        for key, hash, partition_id in expected:
-            h = murmur_hash3_x86_32(key, 0, len(key))
+        for key, mm_hash, partition_id in expected:
+            h = murmur_hash3_x86_32(bytearray(key))
             p = hash_to_index(h, 271)
-            self.assertEqual(h, hash)
+            self.assertEqual(h, mm_hash)
             self.assertEqual(p, partition_id)