Fix numpy.dtype hashing for numpy >= 1.20 (#1136)
pierreglaser committed Dec 11, 2020
1 parent 87b6e94 commit 69f2b09
Showing 4 changed files with 145 additions and 64 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,15 @@ Latest changes
In development
--------------

- Make the `joblib.hash` and `joblib.Memory` caching system compatible with `numpy
>= 1.20.0`. Also make it explicit in the documentation that users should now
expect their `joblib.Memory` cache to be invalidated when either `joblib` or a
third party library involved in the definition of the cached values is
upgraded. In particular, users updating `joblib` to a release that includes
this fix will see their previous cache invalidated if it contained references
to `numpy` objects.
https://github.com/joblib/joblib/pull/1136

- Remove deprecated `check_pickle` argument in `delayed`.
https://github.com/joblib/joblib/pull/903

9 changes: 9 additions & 0 deletions doc/memory.rst
@@ -385,6 +385,15 @@ Gotchas
``self.method`` does not depend on ``self`` you can use
``self.method = memory.cache(self.method, ignore=['self'])``.

* **joblib cache entries may be invalidated after environment updates**.
Values returned by ``joblib.hash`` are not guaranteed to stay
constant across ``joblib`` versions. This means that **all** entries of a
``joblib.Memory`` cache can get invalidated when upgrading ``joblib``.
Invalidation can also happen when upgrading a third party library (such as
``numpy``): in that case, only the cached function calls whose parameters
are constructs (or contain references to constructs) defined in the
upgraded library may be invalidated after the upgrade, as sketched below.
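
A minimal sketch of this gotcha (the cache location ``./joblib_cache`` and the
``double`` function are purely illustrative, not part of joblib)::

    from joblib import Memory
    import numpy as np

    memory = Memory('./joblib_cache', verbose=0)

    @memory.cache
    def double(a):
        return 2 * a

    # The cache entry for this call is keyed on the hash of np.arange(3).
    # If a joblib or numpy upgrade changes that hash value, the call is
    # recomputed instead of reusing the entry stored before the upgrade.
    double(np.arange(3))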


Ignoring some arguments
-----------------------
31 changes: 18 additions & 13 deletions joblib/hashing.py
@@ -220,19 +220,24 @@ def save(self, obj):
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
# numpy.dtype consistent hashing is tricky to get right. This comes
# from the fact that atomic np.dtype objects are interned:
# ``np.dtype('f4') is np.dtype('f4')``. The situation is
# complicated by the fact that this interning does not resist a
# simple pickle.load/dump roundtrip:
# ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not np.dtype('f4')``.
# Because pickle relies on memoization during pickling, it is easy to
# produce different hashes for seemingly identical objects, such as
# ``[np.dtype('f4'), np.dtype('f4')]`` and
# ``[np.dtype('f4'), pickle.loads(pickle.dumps(np.dtype('f4')))]``.
# To prevent memoization from interfering with hashing, we isolate
# the serialization (and thus the pickle memoization) of each dtype
# using each time a different ``pickle.dumps`` call unrelated to
# the current Hasher instance.
self._hash.update("_HASHED_DTYPE".encode('utf-8'))
self._hash.update(pickle.dumps(obj))
return
Hasher.save(self, obj)
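
A minimal standalone sketch of the dtype interning and pickling behavior
described in the comment above (illustrative only, using nothing beyond the
standard ``pickle`` module and ``numpy``):

import pickle
import numpy as np

dt = np.dtype('f4')
dt_roundtripped = pickle.loads(pickle.dumps(dt))

# Atomic dtypes are interned by their constructor...
assert dt is np.dtype('f4')
# ...but interning does not survive a pickle round trip.
assert dt_roundtripped is not dt

# Serializing each dtype with its own pickle.dumps call sidesteps the
# pickler's internal memo, so both objects produce identical bytes and
# therefore identical joblib hashes.
assert pickle.dumps(dt) == pickle.dumps(dt_roundtripped)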


160 changes: 109 additions & 51 deletions joblib/test/test_hashing.py
@@ -16,6 +16,7 @@
import itertools
import pickle
import random
from concurrent.futures import ProcessPoolExecutor
from decimal import Decimal
import pytest

@@ -326,14 +327,48 @@ def test_string():


@with_numpy
def test_dtype():
# Test that we obtain the same hash for objects owning several dtypes,
# whatever the past of these dtypes. Cater for cache invalidation with
# complex dtypes.
a = np.dtype([('f1', np.uint), ('f2', np.int32)])
b = a
c = pickle.loads(pickle.dumps(a))
assert hash([a, c]) == hash([a, b])
def test_numpy_dtype_pickling():
# numpy dtype hashing is tricky to get right: see #231, #239, #251, #1080,
# #1082, and the explanatory comments inside
# ``joblib.hashing.NumpyHasher.save``.

# In this test, we make sure that the hashing of numpy dtypes is robust to
# object identity and object copy.

dt1 = np.dtype('f4')
dt2 = np.dtype('f4')

# simple dtype objects are interned
assert dt1 is dt2
assert hash(dt1) == hash(dt2)

dt1_roundtripped = pickle.loads(pickle.dumps(dt1))
assert dt1 is not dt1_roundtripped
assert hash(dt1) == hash(dt1_roundtripped)

assert hash([dt1, dt1]) == hash([dt1_roundtripped, dt1_roundtripped])
assert hash([dt1, dt1]) == hash([dt1, dt1_roundtripped])

complex_dt1 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)
complex_dt2 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)

# complex dtype objects are not interned, but must hash to the same value
assert hash(complex_dt1) == hash(complex_dt2)

complex_dt1_roundtripped = pickle.loads(pickle.dumps(complex_dt1))
assert complex_dt1_roundtripped is not complex_dt1
assert hash(complex_dt1) == hash(complex_dt1_roundtripped)

assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1_roundtripped]
)
assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1]
)


@parametrize('to_hash,expected',
@@ -378,49 +413,72 @@ def test_0d_and_1d_array_hashing_is_different():

@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
# We want to make sure that hashes don't change with joblib
# version. For end users, that would mean that they have to
# regenerate their cache from scratch, which potentially means
# lengthy recomputations.
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for https://github.com/joblib/joblib/issues/308.
# Generated with joblib 0.9.4.
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]

# These expected results have been generated with joblib 0.9.0
expected_hashes = [
'10a6afc379ca2708acfbaef0ab676eab',
'988a7114f337f381393025911ebc823b',
'c6809f4b97e35f2fa0ee8d653cbd025c',
'b3ad17348e32728a7eb9cda1e7ede438',
'927b3e6b0b6a037e8e035bda134e0b05',
'108f6ee98e7db19ea2006ffd208f4bf1',
'bd48ccaaff28e16e6badee81041b7180'
]

for to_hash, expected in zip(to_hash_list, expected_hashes):
assert hash(to_hash) == expected
# Note: joblib used to test numpy objects hashing by comparing the produced
# hash of an object with some hard-coded target value to guarantee that
# hashing remains the same across joblib versions. However, since numpy
# 1.20 and joblib 1.0, joblib relies on potentially unstable implementation
# details of numpy to hash np.dtype objects, which makes the stability of
# hash values across different environments hard to guarantee and to test.
# As a result, hashing stability across joblib versions becomes best-effort
# only, and we only test the consistency within a single environment by
# making sure:
# - the hash of two copies of the same objects is the same
# - hashing some object in two different python processes produces the same
# value. This should be viewed as a proxy for testing hash consistency
# through time between Python sessions (provided the environment is not
# changed between sessions).

def create_objects_to_hash():
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for
# https://github.com/joblib/joblib/issues/308
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]
return to_hash_list

# Create two lists containing copies of the same objects. joblib.hash
# should return the same hash for to_hash_list_one[i] and
# to_hash_list_two[i]
to_hash_list_one = create_objects_to_hash()
to_hash_list_two = create_objects_to_hash()

e1 = ProcessPoolExecutor(max_workers=1)
e2 = ProcessPoolExecutor(max_workers=1)

try:
for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
# testing consistency of hashes across python processes
hash_1 = e1.submit(hash, obj_1).result()
hash_2 = e2.submit(hash, obj_1).result()
assert hash_1 == hash_2

# testing consistency when hashing two copies of the same objects.
hash_3 = e1.submit(hash, obj_2).result()
assert hash_1 == hash_3

finally:
e1.shutdown()
e2.shutdown()


def test_hashing_pickling_error():
