Fix numpy.dtype hashing for numpy >= 1.20 (#1136)
pierreglaser committed Dec 11, 2020
1 parent 87b6e94 commit 69f2b09
Showing 4 changed files with 145 additions and 64 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,15 @@ Latest changes
In development
--------------

- Make the `joblib.hash` and `joblib.Memory` caching system compatible with `numpy
>= 1.20.0`. Also make it explicit in the documentation that users should now
expect their `joblib.Memory` cache to be invalidated when either `joblib` or a
third party library involved in the definition of the cached values is
upgraded. In particular, users updating `joblib` to a release that includes
this fix will see their previous cache invalidated if it contained references
to `numpy` objects.
https://github.com/joblib/joblib/pull/1136

- Remove deprecated `check_pickle` argument in `delayed`.
https://github.com/joblib/joblib/pull/903

9 changes: 9 additions & 0 deletions doc/memory.rst
@@ -385,6 +385,15 @@ Gotchas
``self.method`` does not depend on ``self`` you can use
``self.method = memory.cache(self.method, ignore=['self'])``.

* **joblib cache entries may be invalidated after environment updates**.
Values returned by ``joblib.hash`` are not guaranteed to stay
constant across ``joblib`` versions. This means that **all** entries of a
``joblib.Memory`` cache can get invalidated when upgrading ``joblib``.
Invalidation can also happen when upgrading a third party library (such as
``numpy``): in that case, only the cached function calls whose parameters
are constructs (or contain references to constructs) defined in the
upgraded library may be invalidated after the upgrade, as sketched below.
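
A minimal sketch of this gotcha (the cache location ``./joblib_cache`` and the
``double`` function are purely illustrative, not part of joblib)::

    from joblib import Memory
    import numpy as np

    memory = Memory('./joblib_cache', verbose=0)

    @memory.cache
    def double(a):
        return 2 * a

    # The cache entry for this call is keyed on the hash of np.arange(3).
    # If a joblib or numpy upgrade changes that hash value, the call is
    # recomputed instead of reusing the entry stored before the upgrade.
    double(np.arange(3))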


Ignoring some arguments
-----------------------
31 changes: 18 additions & 13 deletions joblib/hashing.py
@@ -220,19 +220,24 @@ def save(self, obj):
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
# numpy.dtype consistent hashing is tricky to get right. This comes
# from the fact that atomic np.dtype objects are interned:
# ``np.dtype('f4') is np.dtype('f4')``. The situation is
# complicated by the fact that this interning does not resist a
# simple pickle.load/dump roundtrip:
# ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not np.dtype('f4')``.
# Because pickle relies on memoization during pickling, it is easy to
# produce different hashes for seemingly identical objects, such as
# ``[np.dtype('f4'), np.dtype('f4')]`` and
# ``[np.dtype('f4'), pickle.loads(pickle.dumps(np.dtype('f4')))]``.
# To prevent memoization from interfering with hashing, we isolate
# the serialization (and thus the pickle memoization) of each dtype
# using each time a different ``pickle.dumps`` call unrelated to
# the current Hasher instance.
self._hash.update("_HASHED_DTYPE".encode('utf-8'))
self._hash.update(pickle.dumps(obj))
return
Hasher.save(self, obj)
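
A minimal standalone sketch of the dtype interning and pickling behavior
described in the comment above (illustrative only, using nothing beyond the
standard ``pickle`` module and ``numpy``):

import pickle
import numpy as np

dt = np.dtype('f4')
dt_roundtripped = pickle.loads(pickle.dumps(dt))

# Atomic dtypes are interned by their constructor...
assert dt is np.dtype('f4')
# ...but interning does not survive a pickle round trip.
assert dt_roundtripped is not dt

# Serializing each dtype with its own pickle.dumps call sidesteps the
# pickler's internal memo, so both objects produce identical bytes and
# therefore identical joblib hashes.
assert pickle.dumps(dt) == pickle.dumps(dt_roundtripped)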


160 changes: 109 additions & 51 deletions joblib/test/test_hashing.py
@@ -16,6 +16,7 @@
import itertools
import pickle
import random
from concurrent.futures import ProcessPoolExecutor
from decimal import Decimal
import pytest

@@ -326,14 +327,48 @@ def test_string():


@with_numpy
def test_dtype():
# Test that we obtain the same hash for objects owning several dtypes,
# whatever the past of these dtypes. Cater for cache invalidation with
# complex dtypes.
a = np.dtype([('f1', np.uint), ('f2', np.int32)])
b = a
c = pickle.loads(pickle.dumps(a))
assert hash([a, c]) == hash([a, b])
def test_numpy_dtype_pickling():
# numpy dtype hashing is tricky to get right: see #231, #239, #251, #1080,
# #1082, and the explanatory comments inside
# ``joblib.hashing.NumpyHasher.save``.

# In this test, we make sure that the hashing of numpy dtypes is robust to
# object identity and object copy.

dt1 = np.dtype('f4')
dt2 = np.dtype('f4')

# simple dtype objects are interned
assert dt1 is dt2
assert hash(dt1) == hash(dt2)

dt1_roundtripped = pickle.loads(pickle.dumps(dt1))
assert dt1 is not dt1_roundtripped
assert hash(dt1) == hash(dt1_roundtripped)

assert hash([dt1, dt1]) == hash([dt1_roundtripped, dt1_roundtripped])
assert hash([dt1, dt1]) == hash([dt1, dt1_roundtripped])

complex_dt1 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)
complex_dt2 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)

# complex dtype objects are not interned, but must hash to the same value
assert hash(complex_dt1) == hash(complex_dt2)

complex_dt1_roundtripped = pickle.loads(pickle.dumps(complex_dt1))
assert complex_dt1_roundtripped is not complex_dt1
assert hash(complex_dt1) == hash(complex_dt1_roundtripped)

assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1_roundtripped]
)
assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1]
)


@parametrize('to_hash,expected',
@@ -378,49 +413,72 @@ def test_0d_and_1d_array_hashing_is_different():

@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
# We want to make sure that hashes don't change with joblib
# version. For end users, that would mean that they have to
# regenerate their cache from scratch, which potentially means
# lengthy recomputations.
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for https://github.com/joblib/joblib/issues/308.
# Generated with joblib 0.9.4.
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]

# These expected results have been generated with joblib 0.9.0
expected_hashes = [
'10a6afc379ca2708acfbaef0ab676eab',
'988a7114f337f381393025911ebc823b',
'c6809f4b97e35f2fa0ee8d653cbd025c',
'b3ad17348e32728a7eb9cda1e7ede438',
'927b3e6b0b6a037e8e035bda134e0b05',
'108f6ee98e7db19ea2006ffd208f4bf1',
'bd48ccaaff28e16e6badee81041b7180'
]

for to_hash, expected in zip(to_hash_list, expected_hashes):
assert hash(to_hash) == expected
# Note: joblib used to test numpy objects hashing by comparing the produced
# hash of an object with some hard-coded target value to guarantee that
# hashing remains the same across joblib versions. However, since numpy
# 1.20 and joblib 1.0, joblib relies on potentially unstable implementation
# details of numpy to hash np.dtype objects, which makes the stability of
# hash values across different environments hard to guarantee and to test.
# As a result, hashing stability across joblib versions becomes best-effort
# only, and we only test the consistency within a single environment by
# making sure:
# - the hash of two copies of the same objects is the same
# - hashing some object in two different python processes produces the same
# value. This should be viewed as a proxy for testing hash consistency
# through time between Python sessions (provided the environment is not
# changed between sessions).

def create_objects_to_hash():
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for
# https://github.com/joblib/joblib/issues/308
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]
return to_hash_list

# Create two lists containing copies of the same objects. joblib.hash
# should return the same hash for to_hash_list_one[i] and
# to_hash_list_two[i]
to_hash_list_one = create_objects_to_hash()
to_hash_list_two = create_objects_to_hash()

e1 = ProcessPoolExecutor(max_workers=1)
e2 = ProcessPoolExecutor(max_workers=1)

try:
for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
# testing consistency of hashes across python processes
hash_1 = e1.submit(hash, obj_1).result()
hash_2 = e2.submit(hash, obj_1).result()
assert hash_1 == hash_2

# testing consistency when hashing two copies of the same objects.
hash_3 = e1.submit(hash, obj_2).result()
assert hash_1 == hash_3

finally:
e1.shutdown()
e2.shutdown()


def test_hashing_pickling_error():
