From 7c9da66c5dfda378850afc70ab5970fb1aeb7fcc Mon Sep 17 00:00:00 2001
From: mdumandag
Date: Tue, 3 Nov 2020 11:27:03 +0300
Subject: [PATCH 1/2] Optimize murmur hash calculation

We were always passing a bytes-like object to the murmur hash function,
but we were copying it starting from the HEAP_DATA_OVERHEAD (8) offset,
and that copy can be huge. I removed that copy and adjusted the offsets
according to our HEAP_DATA_OVERHEAD.

Another parameter of the function was the data size. For valid Data
objects, this is always equal to `len(buffer) - HEAP_DATA_OVERHEAD`.
This parameter is removed, and the length is now calculated inside the
murmur hash function.

Also, inlined the _fmix function call and removed the unnecessary
method calls in the hash_to_index function.
---
 hazelcast/hash.py               | 50 +++++++++++++--------------------
 hazelcast/serialization/data.py |  2 +-
 tests/murmur_hash_test.py       | 24 ++++++++--------
 3 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/hazelcast/hash.py b/hazelcast/hash.py
index 8b7e49cdf3..07cf43fb4f 100644
--- a/hazelcast/hash.py
+++ b/hazelcast/hash.py
@@ -1,33 +1,20 @@
-import math
+from hazelcast.serialization import LE_UINT
 from hazelcast.six.moves import range
 
 
-def _fmix(h):
-    h ^= h >> 16
-    h = (h * 0x85ebca6b) & 0xFFFFFFFF
-    h ^= h >> 13
-    h = (h * 0xc2b2ae35) & 0xFFFFFFFF
-    h ^= h >> 16
-    return h
-
-
-def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
+def murmur_hash3_x86_32(data):
     """murmur3 hash function to determine partition
 
     Args:
-        data (bytearray): Input byte array
-        offset (int): Offset.
-        size (int): Byte length.
-        seed (int): Murmur hash seed hazelcast uses 0x01000193.
+        data (bytearray or bytes): Input byte array
 
     Returns:
         int: Calculated hash value.
     """
-    key = bytearray(data[offset: offset + size])
-    length = len(key)
+    length = len(data) - 8  # Heap data overhead
     nblocks = int(length / 4)
 
-    h1 = seed
+    h1 = 0x01000193
 
     c1 = 0xcc9e2d51
     c2 = 0x1b873593
@@ -35,10 +22,7 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
     # body
     for block_start in range(0, nblocks * 4, 4):
         # ??? big endian?
-        k1 = key[block_start + 3] << 24 | \
-             key[block_start + 2] << 16 | \
-             key[block_start + 1] << 8 | \
-             key[block_start + 0]
+        k1 = LE_UINT.unpack_from(data, block_start + 8)[0]
 
         k1 = c1 * k1 & 0xFFFFFFFF
         k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF  # inlined ROTL32
@@ -53,12 +37,13 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
     k1 = 0
     tail_size = length & 3
 
+    # offsets below are shifted according to Heap data offset
     if tail_size >= 3:
-        k1 ^= key[tail_index + 2] << 16
+        k1 ^= data[tail_index + 10] << 16
     if tail_size >= 2:
-        k1 ^= key[tail_index + 1] << 8
+        k1 ^= data[tail_index + 9] << 8
     if tail_size >= 1:
-        k1 ^= key[tail_index + 0]
+        k1 ^= data[tail_index + 8]
 
     if tail_size != 0:
         k1 = (k1 * c1) & 0xFFFFFFFF
@@ -66,12 +51,17 @@ def murmur_hash3_x86_32(data, offset, size, seed=0x01000193):
         k1 = (k1 * c2) & 0xFFFFFFFF
         h1 ^= k1
 
-    result = _fmix(h1 ^ length)
-    return -(result & 0x80000000) | (result & 0x7FFFFFFF)
+    h1 ^= length
+    h1 ^= h1 >> 16
+    h1 = (h1 * 0x85ebca6b) & 0xFFFFFFFF
+    h1 ^= h1 >> 13
+    h1 = (h1 * 0xc2b2ae35) & 0xFFFFFFFF
+    h1 ^= h1 >> 16
+    return -(h1 & 0x80000000) | (h1 & 0x7FFFFFFF)
 
 
-def hash_to_index(hash, length):
-    if hash == 0x80000000:
+def hash_to_index(mm_hash, length):
+    if mm_hash == 0x80000000:
         return 0
     else:
-        return int(abs(math.fmod(hash, length)))
+        return abs(mm_hash) % length
diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py
index bbce691283..03e277e76e 100644
--- a/hazelcast/serialization/data.py
+++ b/hazelcast/serialization/data.py
@@ -92,7 +92,7 @@ def hash_code(self):
         Returns:
             int: The murmur hash of the internal data.
         """
-        return murmur_hash3_x86_32(self._buffer, DATA_OFFSET, self.data_size())
+        return murmur_hash3_x86_32(self._buffer)
 
     def __hash__(self):
         return self.hash_code()
diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py
index f8285eb5f7..1fcdcc647b 100644
--- a/tests/murmur_hash_test.py
+++ b/tests/murmur_hash_test.py
@@ -6,19 +6,19 @@ class HashTest(unittest.TestCase):
     def test_hash(self):
         expected = [
-            (b"key-1", 1228513025, 107),
-            (b"key-2", 1503416236, 105),
-            (b"key-3", 1876349747, 218),
-            (b"key-4", -914632498, 181),
-            (b"key-5", -803210507, 111),
-            (b"key-6", -847942313, 115),
-            (b"key-7", 1196747334, 223),
-            (b"key-8", -1444149994, 208),
-            (b"key-9", 1182720020, 140),
+            (b"00000000key-1", 1228513025, 107),
+            (b"00000000key-2", 1503416236, 105),
+            (b"00000000key-3", 1876349747, 218),
+            (b"00000000key-4", -914632498, 181),
+            (b"00000000key-5", -803210507, 111),
+            (b"00000000key-6", -847942313, 115),
+            (b"00000000key-7", 1196747334, 223),
+            (b"00000000key-8", -1444149994, 208),
+            (b"00000000key-9", 1182720020, 140),
         ]
 
-        for key, hash, partition_id in expected:
-            h = murmur_hash3_x86_32(key, 0, len(key))
+        for key, mm_hash, partition_id in expected:
+            h = murmur_hash3_x86_32(key)
             p = hash_to_index(h, 271)
-            self.assertEqual(h, hash)
+            self.assertEqual(h, mm_hash)
             self.assertEqual(p, partition_id)

From 1429b01bc2729d3bc14ebea6e0755b9a766627c6 Mon Sep 17 00:00:00 2001
From: mdumandag
Date: Tue, 10 Nov 2020 13:57:36 +0300
Subject: [PATCH 2/2] Python2 related fixes

Fix the `__hash__` method of the `Data` class to convert `bytes` to
`bytearray` if it is executing on Python2, since we cannot use the
murmur hash on `str` (alias of `bytes` on Python2) objects. This extra
if check is not necessary in the `hash_code` method since it is only
called on the Data objects to be sent by the client, which will have a
bytearray buffer.
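To illustrate the difference, here is a minimal sketch (not part of the
patch; the variable names are made up for the example):

    buf = b"\x0b\x16\x2c"

    # Python 2: bytes is an alias of str, so buf[0] is the one-character
    # string '\x0b' and expressions such as buf[0] << 16 in the hash body
    # raise a TypeError.
    # Python 3: buf[0] is the int 11, so the shifts and XORs work directly.
    first = buf[0]

    # Indexing a bytearray yields ints on both versions, which is why the
    # buffer is converted before hashing on Python 2.
    first_compat = bytearray(buf)[0]  # 11 on both Python 2 and Python 3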
Also, extra test cases are added.
---
 hazelcast/hash.py               |  6 +++---
 hazelcast/serialization/data.py | 12 +++++++++++-
 tests/murmur_hash_test.py       | 13 +++++++++++--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/hazelcast/hash.py b/hazelcast/hash.py
index 07cf43fb4f..06b2167aa9 100644
--- a/hazelcast/hash.py
+++ b/hazelcast/hash.py
@@ -11,8 +11,8 @@ def murmur_hash3_x86_32(data):
     Returns:
         int: Calculated hash value.
     """
-    length = len(data) - 8  # Heap data overhead
-    nblocks = int(length / 4)
+    length = max(len(data) - 8, 0)  # Heap data overhead
+    nblocks = length // 4
 
     h1 = 0x01000193
 
@@ -37,7 +37,7 @@ def murmur_hash3_x86_32(data):
     k1 = 0
     tail_size = length & 3
 
-    # offsets below are shifted according to Heap data offset
+    # Offsets below are shifted according to heap data overhead
     if tail_size >= 3:
         k1 ^= data[tail_index + 10] << 16
     if tail_size >= 2:
diff --git a/hazelcast/serialization/data.py b/hazelcast/serialization/data.py
index 03e277e76e..a127db8097 100644
--- a/hazelcast/serialization/data.py
+++ b/hazelcast/serialization/data.py
@@ -1,3 +1,4 @@
+from hazelcast import six
 from hazelcast.hash import murmur_hash3_x86_32
 from hazelcast.serialization import BE_INT
 from hazelcast.serialization.serialization_const import *
@@ -95,7 +96,16 @@ def hash_code(self):
         return murmur_hash3_x86_32(self._buffer)
 
     def __hash__(self):
-        return self.hash_code()
+        # Data objects are used in the NearCache as keys.
+        # When this method is called on Data objects
+        # received from the members, the buffer is of
+        # type bytes instead of bytearray. Since bytes
+        # is an alias of str in Python2, we cannot
+        # use murmur hash directly on it. The
+        # conversion is necessary only on Python2.
+        if isinstance(self._buffer, bytearray) or six.PY3:
+            return murmur_hash3_x86_32(self._buffer)
+        return murmur_hash3_x86_32(bytearray(self._buffer))
 
     def __eq__(self, other):
         return isinstance(other, Data) and self.total_size() == other.total_size() \
diff --git a/tests/murmur_hash_test.py b/tests/murmur_hash_test.py
index 1fcdcc647b..860676afa0 100644
--- a/tests/murmur_hash_test.py
+++ b/tests/murmur_hash_test.py
@@ -5,8 +5,10 @@ class HashTest(unittest.TestCase):
     def test_hash(self):
-        expected = [
+        expected = [  # Expected values are from the Java implementation
+            # 00000000 -> HEAP_DATA_OVERHEAD
             (b"00000000key-1", 1228513025, 107),
+            (b"12345678key-1", 1228513025, 107),  # Heap data overhead should not matter
             (b"00000000key-2", 1503416236, 105),
             (b"00000000key-3", 1876349747, 218),
             (b"00000000key-4", -914632498, 181),
@@ -15,10 +17,17 @@ def test_hash(self):
             (b"00000000key-7", 1196747334, 223),
             (b"00000000key-8", -1444149994, 208),
             (b"00000000key-9", 1182720020, 140),
+            # Test with different lengths
+            (b"00000000", -1585187909, 238),
+            (b"00000000a", -1686100800, 46),
+            (b"00000000ab", 312914265, 50),
+            (b"00000000abc", -2068121803, 208),
+            (b"00000000abcd", -973615161, 236),
+            (b"", -1585187909, 238),
         ]
 
         for key, mm_hash, partition_id in expected:
-            h = murmur_hash3_x86_32(key)
+            h = murmur_hash3_x86_32(bytearray(key))
             p = hash_to_index(h, 271)
             self.assertEqual(h, mm_hash)
             self.assertEqual(p, partition_id)
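For reference, a minimal usage sketch of the optimized functions after both
patches; the expected values are taken from the test data above, with the 271
partitions used in the tests:

    from hazelcast.hash import murmur_hash3_x86_32, hash_to_index

    # The hash function now receives the whole Data buffer: the 8-byte heap
    # data overhead followed by the serialized payload. Only the bytes after
    # the overhead influence the result, so b"00000000" merely stands in for
    # the real header bytes here.
    buf = bytearray(b"00000000key-1")

    mm_hash = murmur_hash3_x86_32(buf)          # 1228513025
    partition_id = hash_to_index(mm_hash, 271)  # 107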