Generalized hashing of keys for memoization #1074

Merged: 14 commits, merged on Jan 18, 2017
11 changes: 6 additions & 5 deletions holoviews/core/spaces.py
@@ -426,12 +426,13 @@ def __call__(self, *args, **kwargs):
         values = tuple(tuple(sorted(s.contents.items())) for s in streams)
         key = args + tuple(sorted(kwargs.items())) + values

-        if key in self._memoized:
-            return self._memoized[key]
-        else:
+
+        hashed_key = util.deephash(key)
+        ret = self._memoized.get(hashed_key, None)
+        if hashed_key and ret is None:
             ret = self.callable_function(*args, **kwargs)
-            self._memoized = {key : ret}
-            return ret
+            self._memoized = {hashed_key : ret}
+        return ret


def get_nested_streams(dmap):
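The hunk above replaces the direct `key in self._memoized` lookup, which raises a `TypeError` as soon as the key tuple contains an unhashable value (for example a numpy array carried in a stream's contents), with a lookup on `util.deephash(key)`. Below is a minimal, self-contained sketch of that pattern. `MemoizedCallable` is a hypothetical wrapper, not HoloViews code, and the example assumes this branch is installed so that `holoviews.core.util.deephash` can be imported; unlike the hunk above, it also falls back to calling the function without caching when `deephash` returns `None`.

```python
import numpy as np
from holoviews.core import util  # provides deephash after this PR


class MemoizedCallable(object):
    """Hypothetical wrapper illustrating hashed-key memoization."""

    def __init__(self, fn):
        self.fn = fn
        self._memoized = {}

    def __call__(self, *args, **kwargs):
        # Build a key from the call signature, as Callable.__call__ does
        key = args + tuple(sorted(kwargs.items()))
        # deephash returns None if the key cannot be serialized and hashed
        hashed_key = util.deephash(key)
        ret = self._memoized.get(hashed_key, None) if hashed_key else None
        if ret is None:
            ret = self.fn(*args, **kwargs)
            if hashed_key:
                # Only the most recent result is kept, mirroring the hunk above
                self._memoized = {hashed_key: ret}
        return ret


# A plain `key in dict` test would raise TypeError here (ndarray is unhashable);
# hashing the key with deephash first makes memoization possible.
doubled = MemoizedCallable(lambda arr: arr * 2)
print(doubled(np.array([1, 2, 3])))   # computed
print(doubled(np.array([1, 2, 3])))   # served from the memoized result
```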
59 changes: 59 additions & 0 deletions holoviews/core/util.py
@@ -9,6 +9,8 @@
import numpy as np
import param

import json

try:
from cyordereddict import OrderedDict
except:
@@ -24,6 +26,63 @@
except ImportError:
dd = None




class HashableJSON(json.JSONEncoder):
"""
Extends JSONEncoder to generate a hashable string for as many types
of object as possible including nested objects and objects that are
not normally hashable. The purpose of this class is to generate
unique strings that once hashed are suitable for use in memoization
and other cases where deep equality must be tested without storing
the entire object.

By default JSONEncoder supports booleans, numbers, strings, lists,
tuples and dictionaries. In order to support other types such as
sets, datetime objects and mutable objects such as pandas Dataframes
or numpy arrays, HashableJSON has to convert these types to
datastructures that can normally be represented as JSON.

Support for other object types may need to be introduced in
future. By default, unrecognized object types are represented by
their id.

One limitation of this approach is that dictionaries with composite
keys (e.g tuples) are not supported due to the JSON spec.
"""
string_hashable = (dt.datetime,)
repr_hashable = ()

def default(self, obj):
if isinstance(obj, set):
return hash(frozenset(obj))
elif isinstance(obj, np.ndarray):
return obj.tolist()
if pd and isinstance(obj, (pd.Series, pd.DataFrame)):
return repr(sorted(list(obj.to_dict().items())))
elif isinstance(obj, self.string_hashable):
return str(obj)
elif isinstance(obj, self.repr_hashable):
return repr(obj)
try:
return hash(obj)
except:
return id(obj)



def deephash(obj):
"""
Given an object, return a hash using HashableJSON. This hash is not
architecture, Python version or platform independent.
"""
try:
return hash(json.dumps(obj, cls=HashableJSON, sort_keys=True))
except:
return None


# Python3 compatibility
import types
if sys.version_info.major == 3:
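For reference, here is a short usage sketch of the new `deephash` helper (again assuming this branch is installed so that `holoviews.core.util.deephash` is importable). Structurally equal objects hash equal even when their members are not normally hashable, while objects the JSON encoder cannot serialize, such as the composite-key dictionaries mentioned in the docstring, fall back to `None`.

```python
import numpy as np
from holoviews.core.util import deephash

# Deep equality of nested, normally unhashable structures maps to equal hashes.
assert deephash([np.array([1, 2, 3]), {'a': 1}]) == deephash([np.array([1, 2, 3]), {'a': 1}])

# Sets hash by value, independent of insertion order.
assert deephash(set([1, 2, 3])) == deephash(set([3, 2, 1]))

# Limitation noted in the HashableJSON docstring: dictionaries with composite
# (e.g. tuple) keys cannot be serialized to JSON, so deephash returns None.
assert deephash({('a', 1): 'value'}) is None
```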
116 changes: 115 additions & 1 deletion tests/testutils.py
@@ -6,9 +6,15 @@
import unittest
from unittest import SkipTest

import datetime
import numpy as np
from collections import OrderedDict
try:
import pandas as pd
except:
pd = None

-from holoviews.core.util import sanitize_identifier_fn, find_range, max_range, wrap_tuple_streams
+from holoviews.core.util import sanitize_identifier_fn, find_range, max_range, wrap_tuple_streams, deephash
from holoviews import Dimension
from holoviews.streams import PositionXY
from holoviews.element.comparison import ComparisonTestCase
@@ -17,6 +23,114 @@

sanitize_identifier = sanitize_identifier_fn.instance()


class TestDeepHash(ComparisonTestCase):
"""
Tests of deephash function used for memoization.
"""

def test_deephash_list_equality(self):
self.assertEqual(deephash([1,2,3]), deephash([1,2,3]))

def test_deephash_list_inequality(self):
obj1 = [1,2,3]
obj2 = [1,2,3,4]
self.assertNotEqual(deephash(obj1), deephash(obj2))

def test_deephash_set_equality(self):
self.assertEqual(deephash(set([1,2,3])), deephash(set([1,3,2])))

def test_deephash_set_inequality(self):
self.assertNotEqual(deephash(set([1,2,3])), deephash(set([1,3,4])))

def test_deephash_dict_equality(self):
self.assertEqual(deephash({1:'a',2:'b'}), deephash({2:'b', 1:'a'}))

def test_deephash_dict_inequality(self):
self.assertNotEqual(deephash({1:'a',2:'b'}), deephash({2:'b', 1:'c'}))

def test_deephash_odict_equality(self):
odict1 = OrderedDict([(1,'a'), (2,'b')])
odict2 = OrderedDict([(1,'a'), (2,'b')])
self.assertEqual(deephash(odict1), deephash(odict2))

def test_deephash_odict_inequality(self):
odict1 = OrderedDict([(1,'a'), (2,'b')])
odict2 = OrderedDict([(1,'a'), (2,'c')])
self.assertNotEqual(deephash(odict1), deephash(odict2))

def test_deephash_numpy_equality(self):
self.assertEqual(deephash(np.array([1,2,3])),
deephash(np.array([1,2,3])))

def test_deephash_numpy_inequality(self):
arr1 = np.array([1,2,3])
arr2 = np.array([1,2,4])
self.assertNotEqual(deephash(arr1), deephash(arr2))

def test_deephash_dataframe_equality(self):
if pd is None: raise SkipTest
self.assertEqual(deephash(pd.DataFrame({'a':[1,2,3],'b':[4,5,6]})),
deephash(pd.DataFrame({'a':[1,2,3],'b':[4,5,6]})))

def test_deephash_dataframe_inequality(self):
if pd is None: raise SkipTest
self.assertNotEqual(deephash(pd.DataFrame({'a':[1,2,3],'b':[4,5,6]})),
deephash(pd.DataFrame({'a':[1,2,3],'b':[4,5,8]})))

def test_deephash_series_equality(self):
if pd is None: raise SkipTest
self.assertEqual(deephash(pd.Series([1,2,3])),
deephash(pd.Series([1,2,3])))

def test_deephash_series_inequality(self):
if pd is None: raise SkipTest
self.assertNotEqual(deephash(pd.Series([1,2,3])),
deephash(pd.Series([1,2,7])))

def test_deephash_datetime_equality(self):
dt1 = datetime.datetime(1,2,3)
dt2 = datetime.datetime(1,2,3)
self.assertEqual(deephash(dt1), deephash(dt2))

def test_deephash_datetime_inequality(self):
dt1 = datetime.datetime(1,2,3)
dt2 = datetime.datetime(1,2,5)
self.assertNotEqual(deephash(dt1), deephash(dt2))

def test_deephash_nested_native_equality(self):
obj1 = [[1,2], (3,6,7, [True]), 'a', 9.2, 42, {1:3,2:'c'}]
obj2 = [[1,2], (3,6,7, [True]), 'a', 9.2, 42, {1:3,2:'c'}]
self.assertEqual(deephash(obj1), deephash(obj2))

def test_deephash_nested_native_inequality(self):
obj1 = [[1,2], (3,6,7, [False]), 'a', 9.2, 42, {1:3,2:'c'}]
obj2 = [[1,2], (3,6,7, [True]), 'a', 9.2, 42, {1:3,2:'c'}]
self.assertNotEqual(deephash(obj1), deephash(obj2))

def test_deephash_nested_mixed_equality(self):
if pd is None: raise SkipTest
obj1 = [datetime.datetime(1,2,3), set([1,2,3]),
pd.DataFrame({'a':[1,2],'b':[3,4]}),
np.array([1,2,3]), {'a':'b', '1':True},
OrderedDict([(1,'a'),(2,'b')]), np.int64(34)]
obj2 = [datetime.datetime(1,2,3), set([1,2,3]),
pd.DataFrame({'a':[1,2],'b':[3,4]}),
np.array([1,2,3]), {'a':'b', '1':True},
OrderedDict([(1,'a'),(2,'b')]), np.int64(34)]
self.assertEqual(deephash(obj1), deephash(obj2))

def test_deephash_nested_mixed_inequality(self):
if pd is None: raise SkipTest
obj1 = [datetime.datetime(1,2,3), set([1,2,3]),
pd.DataFrame({'a':[1,2],'b':[3,4]}),
np.array([1,2,3]), {'a':'b', '2':True},
OrderedDict([(1,'a'),(2,'b')]), np.int64(34)]
obj2 = [datetime.datetime(1,2,3), set([1,2,3]),
pd.DataFrame({'a':[1,2],'b':[3,4]}),
np.array([1,2,3]), {'a':'b', '1':True},
OrderedDict([(1,'a'),(2,'b')]), np.int64(34)]
self.assertNotEqual(deephash(obj1), deephash(obj2))


class TestAllowablePrefix(ComparisonTestCase):
"""
Tests of allowable and hasprefix method.