From b8d3263134857da841dcb267cd2aa8340c9d31d4 Mon Sep 17 00:00:00 2001
From: Philipp Rudiger
Date: Sat, 13 Aug 2022 12:18:16 +0200
Subject: [PATCH] Implement cache function for memoization support (#2411)

---
 .../Performance_and_Debugging.ipynb |  43 +-
 panel/__init__.py                   |   2 +-
 panel/io/__init__.py                |   1 +
 panel/io/cache.py                   | 395 ++++++++++++++++++
 panel/io/state.py                   |  31 ++
 panel/tests/conftest.py             |   4 +
 panel/tests/io/test_cache.py        | 226 ++++++++++
 setup.py                            |   7 +-
 8 files changed, 696 insertions(+), 13 deletions(-)
 create mode 100644 panel/io/cache.py
 create mode 100644 panel/tests/io/test_cache.py

diff --git a/examples/user_guide/Performance_and_Debugging.ipynb b/examples/user_guide/Performance_and_Debugging.ipynb
index baffd3a8e6..fed26ec87c 100644
--- a/examples/user_guide/Performance_and_Debugging.ipynb
+++ b/examples/user_guide/Performance_and_Debugging.ipynb
@@ -14,16 +14,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "When developing applications that are to be used by multiple users and which may process a lot of data it is important to ensure the application is well optimized. Additionally complex applications may have very complex callbacks which are difficult to trace and debug. In this user guide section we will walk you some of the best practices to debug your applications and profile your application to maximize performance."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "When developing applications that are to be used by multiple users and which may process a lot of data, it is important to ensure the application is well optimized. Additionally, complex applications may have very complex callbacks which are difficult to trace and debug. In this user guide section we will walk you through some of the best practices for debugging your applications and for profiling them to maximize performance.\n",
+    "\n",
     "## Caching\n",
     "\n",
-    "The Panel architecture ensures that multiple user sessions can run in the same process and therefore have access to the same global state. This means that we can cache data in Panel's global `state` object, either by directly assigning to the `pn.state.cache` dictionary object or by using the `pn.state.as_cached` helper function.\n",
+    "Caching data and computation is one of the most effective ways to speed up your applications. Common examples of scenarios that benefit from caching include working with large datasets that you have to load from disk or over a network connection, and performing expensive computations that don't depend on any extraneous state. Panel makes it easy for you to add caching to your applications using a few approaches. Panel's architecture is also very well suited to caching, since multiple user sessions can run in the same process and therefore have access to the same global state. This means that we can cache data in Panel's global `state` object, either by directly assigning to the `pn.state.cache` dictionary object, by using the `pn.state.as_cached` helper function, or with the `pn.cache` decorator. Once cached, all current and subsequent sessions will be sped up by having access to the cache.\n",
+    "\n",
+    "### Manual usage\n",
+    "\n",
     "To assign to the cache manually, simply put the data load or expensive calculation in an `if`/`else` block which checks whether the custom key is already present: \n",
     "\n",
     "```python\n",
     "if 'data' in pn.state.cache:\n",
     "    data = pn.state.cache['data']\n",
     "else:\n",
     "    pn.state.cache['data'] = data = ...
# Load some data or perform an expensive computation\n",
     "```\n",
+    "\n",
+    "### `pn.cache` decorator\n",
+    "\n",
+    "The `pn.cache` decorator provides an easy way to cache the outputs of a function depending on its inputs (in other words, to memoize it). If you've ever used the Python `@lru_cache` decorator, you will be familiar with this concept. However, the `pn.cache` function supports additional cache policies apart from `'LRU'` (least-recently-used), including `'LFU'` (least-frequently-used) and `'FIFO'` (first-in-first-out). This means that if the specified number of `max_items` is reached, Panel will automatically evict items from the cache based on this `policy`. Additionally, items can be deleted from the cache based on a `ttl` (time-to-live) value given in seconds.\n",
+    "\n",
+    "#### Caching in memory\n",
+    "\n",
+    "The `pn.cache` decorator can easily be combined with the different Panel APIs, including `pn.bind` and `pn.depends`, providing a powerful way to speed up your applications.\n",
+    "\n",
+    "```python\n",
+    "@pn.cache(max_items=10, policy='LRU')\n",
+    "def load_data(path):\n",
+    "    return ... # Load some data\n",
+    "```\n",
+    "\n",
+    "Once you have decorated your function with `pn.cache`, any call to `load_data` will be cached in memory until the `max_items` value is reached (i.e. you have loaded 10 different `path` values). At that point the `policy` will determine which item is evicted.\n",
+    "\n",
+    "#### Disk caching\n",
+    "\n",
+    "If you have `diskcache` installed, you can also cache the results to disk by setting `to_disk=True`. The `diskcache` library will then cache the value to the supplied `cache_path` (defaulting to `./cache`). Making use of disk caching allows cached items to persist even if the server is restarted.\n",
+    "\n",
+    "#### Clearing the cache\n",
+    "\n",
+    "Once a function has been decorated with `pn.cache`, you can easily clear the cache by calling `.clear()` on that function, e.g. in the example above you could call `load_data.clear()`. If you want to clear all caches you may also call `pn.state.clear_caches()`.\n",
+    "\n",
+    "### `pn.state.as_cached`\n",
+    "\n",
     "The `as_cached` helper function on the other hand allows providing a custom key and a function and automatically caching the return value. If provided the `args` and `kwargs` will also be hashed making it easy to cache (or memoize) on the arguments to the function: \n",
     "\n",
     "```python\n",
     "def load_data(*args, **kwargs):\n",
-    "   return ... # Load some data\n",
+    "    return ... # Load some data\n",
     "\n",
     "data = pn.state.as_cached('data', load_data, *args, **kwargs)\n",
     "```\n",
     "\n",
-    "The first time the app is loaded the data will be cached and subsequent sessions will simply look up the data in the cache, speeding up the process of rendering. If you want to warm up the cache before the first user visits the application you can also provide the `--warm` argument to the `panel serve` command, which will ensure the application is initialized as soon as it is launched. If you want to populate the cache in a separate script from your main application you may also provide the path to a setup script using the `--setup` argument to `panel serve`. If you want to periodically update the cache look into the ability to [schedule tasks](Deploy_and_Export.ipynb#Scheduling-task-with-pn.state.schedule_task)."
+    "The first time the app is loaded the data will be cached and subsequent sessions will simply look up the data in the cache, speeding up the process of rendering. 
If you want to warm up the cache before the first user visits the application you can also provide the `--warm` argument to the `panel serve` command, which will ensure the application is initialized as soon as it is launched. If you want to populate the cache in a separate script from your main application you may also provide the path to a setup script using the `--setup` argument to `panel serve`. If you want to periodically update the cache look into the ability to [schedule tasks](Deploy_and_Export.ipynb#Scheduling-task-with-pn.state.schedule_task).\n" ] }, { diff --git a/panel/__init__.py b/panel/__init__.py index cf0c925474..e042167828 100644 --- a/panel/__init__.py +++ b/panel/__init__.py @@ -57,7 +57,7 @@ from .depends import bind, depends # noqa from .interact import interact # noqa from .io import ( # noqa - _jupyter_server_extension_paths, ipywidget, serve, state, + _jupyter_server_extension_paths, cache, ipywidget, serve, state, ) from .layout import ( # noqa Accordion, Card, Column, FlexBox, GridBox, GridSpec, Row, Spacer, Tabs, diff --git a/panel/io/__init__.py b/panel/io/__init__.py index a540af54e6..03a3682f9f 100644 --- a/panel/io/__init__.py +++ b/panel/io/__init__.py @@ -4,6 +4,7 @@ """ import sys +from .cache import cache # noqa from .callbacks import PeriodicCallback # noqa from .document import init_doc, unlocked, with_lock # noqa from .embed import embed_state # noqa diff --git a/panel/io/cache.py b/panel/io/cache.py new file mode 100644 index 0000000000..06b10896fc --- /dev/null +++ b/panel/io/cache.py @@ -0,0 +1,395 @@ +""" +Implements memoization for functions with arbitrary arguments +""" +import functools +import hashlib +import inspect +import io +import os +import pickle +import sys +import threading +import time +import unittest +import unittest.mock +import weakref + +import param + +from .state import state + +#--------------------------------------------------------------------- +# Private API +#--------------------------------------------------------------------- + +_CYCLE_PLACEHOLDER = b"panel-93KZ39Q-floatingdangeroushomechose-CYCLE" + +_FFI_TYPE_NAMES = ("_cffi_backend.FFI", "builtins.CompiledFFI",) + +_HASH_MAP = {} + +_HASH_STACKS = weakref.WeakKeyDictionary() + +_INDETERMINATE = type('INDETERMINATE', (object,), {})() + +_NATIVE_TYPES = ( + bytes, str, float, int, bool, bytearray, type(None) +) + +_NP_SIZE_LARGE = 100_000 + +_NP_SAMPLE_SIZE = 100_000 + +_PANDAS_ROWS_LARGE = 100_000 + +_PANDAS_SAMPLE_SIZE = 100_000 + +if sys.platform == 'win32': + _TIME_FN = time.perf_counter +else: + _TIME_FN = time.monotonic + +class _Stack(object): + + def __init__(self): + self._stack = {} + + def push(self, val): + self._stack[id(val)] = val + + def pop(self): + self._stack.popitem() + + def __contains__(self, val): + return id(val) in self._stack + +def _get_fqn(obj): + """Get module.type_name for a given type.""" + the_type = type(obj) + module = the_type.__module__ + name = the_type.__qualname__ + return "%s.%s" % (module, name) + +def _int_to_bytes(i): + num_bytes = (i.bit_length() + 8) // 8 + return i.to_bytes(num_bytes, "little", signed=True) + +def _is_native(obj): + return isinstance(obj, _NATIVE_TYPES) + +def _is_native_tuple(obj): + return isinstance(obj, tuple) and all(_is_native_tuple(v) for v in obj) + +def _container_hash(obj): + h = hashlib.new("md5") + h.update(_generate_hash(f'__{type(obj).__name__}')) + for item in (obj.items() if isinstance(obj, dict) else obj): + h.update(_generate_hash(item)) + return h.digest() + +def _partial_hash(obj): + 
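+    # A functools.partial is hashed by combining the hashes of the
+    # wrapped function, its positional args and its keyword args.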
h = hashlib.new("md5")
+    h.update(_generate_hash(obj.args))
+    h.update(_generate_hash(obj.func))
+    h.update(_generate_hash(obj.keywords))
+    return h.digest()
+
+def _pandas_hash(obj):
+    import pandas as pd
+
+    if len(obj) >= _PANDAS_ROWS_LARGE:
+        obj = obj.sample(n=_PANDAS_SAMPLE_SIZE, random_state=0)
+    try:
+        return b"%s" % pd.util.hash_pandas_object(obj).sum()
+    except TypeError:
+        # Use pickle if pandas cannot hash the object, for example if
+        # it contains unhashable objects.
+        return b"%s" % pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
+
+def _numpy_hash(obj):
+    h = hashlib.new("md5")
+    h.update(_generate_hash(obj.shape))
+    if obj.size >= _NP_SIZE_LARGE:
+        import numpy as np
+        state = np.random.RandomState(0)
+        obj = state.choice(obj.flat, size=_NP_SAMPLE_SIZE)
+    h.update(obj.tobytes())
+    return h.digest()
+
+def _io_hash(obj):
+    h = hashlib.new("md5")
+    h.update(_generate_hash(obj.tell()))
+    h.update(_generate_hash(obj.getvalue()))
+    return h.digest()
+
+_hash_funcs = {
+    # Types
+    int                : _int_to_bytes,
+    str                : lambda obj: obj.encode(),
+    float              : lambda obj: _int_to_bytes(hash(obj)),
+    bool               : lambda obj: b'1' if obj is True else b'0',
+    type(None)         : lambda obj: b'0',
+    (bytes, bytearray) : lambda obj: obj,
+    (list, tuple, dict): _container_hash,
+    functools.partial  : _partial_hash,
+    unittest.mock.Mock : lambda obj: _int_to_bytes(id(obj)),
+    (io.StringIO, io.BytesIO): _io_hash,
+    # Fully qualified type strings
+    'numpy.ndarray'              : _numpy_hash,
+    'pandas.core.series.Series'  : _pandas_hash,
+    'pandas.core.frame.DataFrame': _pandas_hash,
+    'builtins.mappingproxy'      : lambda obj: _container_hash(dict(obj)),
+    'builtins.dict_items'        : lambda obj: _container_hash(dict(obj)),
+    'builtins.getset_descriptor' : lambda obj: obj.__qualname__.encode(),
+    'numpy.ufunc'                : lambda obj: obj.__name__.encode(),
+    # Functions
+    inspect.isbuiltin  : lambda obj: obj.__name__.encode(),
+    inspect.ismodule   : lambda obj: obj.__name__.encode()
+}
+
+# FFI objects cannot be introspected so they hash to a constant.
+for name in _FFI_TYPE_NAMES:
+    _hash_funcs[name] = lambda obj: b'0'
+
+def _find_hash_func(obj, hash_funcs={}):
+    fqn_type = _get_fqn(obj)
+    if fqn_type in hash_funcs:
+        return hash_funcs[fqn_type]
+    elif fqn_type in _hash_funcs:
+        return _hash_funcs[fqn_type]
+    for otype, hash_func in _hash_funcs.items():
+        if isinstance(otype, str):
+            if otype == fqn_type:
+                return hash_func
+        elif inspect.isfunction(otype):
+            if otype(obj):
+                return hash_func
+        elif isinstance(obj, otype):
+            return hash_func
+
+def _generate_hash_inner(obj, hash_funcs={}):
+    hash_func = _find_hash_func(obj, hash_funcs)
+    if hash_func is not None:
+        try:
+            output = hash_func(obj)
+        except BaseException as e:
+            raise ValueError(
+                f'User hash function {hash_func!r} failed for input '
+                f'{obj!r} with following error: {type(e).__name__}("{e}").'
+            )
+        return output
+    if hasattr(obj, '__reduce__'):
+        h = hashlib.new("md5")
+        try:
+            reduce_data = obj.__reduce__()
+        except BaseException:
+            raise ValueError(f'Could not hash object of type {type(obj).__name__}')
+        for item in reduce_data:
+            h.update(_generate_hash(item))
+        return h.digest()
+    return _int_to_bytes(id(obj))
+
+def _generate_hash(obj, hash_funcs={}):
+    # Break recursive cycles.
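+    # Self-referential containers (e.g. l = [0]; l.append(l)) would
+    # otherwise recurse forever, so any object already on the current
+    # thread's hash stack is hashed as a fixed placeholder instead.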
+    hash_stack = state._current_stack
+    if obj in hash_stack:
+        return _CYCLE_PLACEHOLDER
+    hash_stack.push(obj)
+    try:
+        hash_value = _generate_hash_inner(obj, hash_funcs)
+    finally:
+        hash_stack.pop()
+    return hash_value
+
+def _key(obj):
+    if obj is None:
+        return None
+    elif _is_native(obj) or _is_native_tuple(obj):
+        return obj
+    elif isinstance(obj, list):
+        if all(_is_native(item) for item in obj):
+            return ('__list', *obj)
+    elif (
+        _get_fqn(obj) == "pandas.core.frame.DataFrame"
+        or _get_fqn(obj) == "numpy.ndarray"
+        or inspect.isbuiltin(obj)
+        or inspect.isroutine(obj)
+        or inspect.iscode(obj)
+    ):
+        return id(obj)
+    return _INDETERMINATE
+
+def _cleanup_cache(cache, policy, max_items, time):
+    """
+    Deletes items from the cache if it exceeds the allowed number of
+    items, evicting items according to the configured policy.
+    """
+    while len(cache) >= max_items:
+        if policy.lower() == 'lifo':
+            key = list(cache.keys())[0]
+        elif policy.lower() == 'lru':
+            key = sorted(((k, time-t) for k, (_, _, _, t) in cache.items()),
+                         key=lambda o: o[1])[-1][0]
+        elif policy.lower() == 'lfu':
+            key = sorted(cache.items(), key=lambda o: o[1][2])[0][0]
+        del cache[key]
+
+def _cleanup_ttl(cache, ttl, time):
+    """
+    Deletes items in the cache if their TTL (time-to-live) has expired.
+    """
+    for key, (_, ts, _, _) in list(cache.items()):
+        if (time-ts) > ttl:
+            del cache[key]
+
+#---------------------------------------------------------------------
+# Public API
+#---------------------------------------------------------------------
+
+def compute_hash(func, hash_funcs, args, kwargs):
+    """
+    Computes a hash given a function and its arguments.
+
+    Arguments
+    ---------
+    func: callable
+        The function to cache.
+    hash_funcs: dict
+        A dictionary of custom hash functions indexed by type.
+    args: tuple
+        Arguments to hash.
+    kwargs: dict
+        Keyword arguments to hash.
+    """
+    key = (func, _key(args), _key(kwargs))
+    if _INDETERMINATE not in key and key in _HASH_MAP:
+        return _HASH_MAP[key]
+    hasher = hashlib.new("md5")
+    if args:
+        hasher.update(_generate_hash(args, hash_funcs))
+    if kwargs:
+        hasher.update(_generate_hash(kwargs, hash_funcs))
+    hash_value = hasher.hexdigest()
+    if _INDETERMINATE not in key:
+        _HASH_MAP[key] = hash_value
+    return hash_value
+
+
+def cache(
+    func=None, hash_funcs=None, max_items=None, policy='LRU',
+    ttl=None, to_disk=False, cache_path='./cache'
+):
+    """
+    Decorator to memoize functions with options to configure the
+    caching behavior.
+
+    Arguments
+    ---------
+    func: callable
+        The function to cache.
+    hash_funcs: dict or None
+        A dictionary mapping from a type to a function which returns
+        a hash for an object of that type. If provided this will
+        override the default hashing function provided by Panel.
+    max_items: int or None
+        The maximum number of items to keep in the cache. The default
+        of None does not limit the number of items.
+    policy: str
+        A caching policy when max_items is set, must be one of:
+          - FIFO: First in - First out
+          - LRU: Least recently used
+          - LFU: Least frequently used
+    ttl: float or None
+        The number of seconds to keep an item in the cache, or None if
+        the cache should not expire. The default is None.
+    to_disk: bool
+        Whether to cache to disk using diskcache.
+    cache_path: str
+        Directory to cache to on disk.
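+
+    Example
+    -------
+    A minimal sketch of intended usage (the body of the decorated
+    function is illustrative):
+
+    >>> @cache(max_items=10, policy='LRU')
+    ... def load_data(path):
+    ...     return ...  # Load some data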
+    """
+
+    hash_funcs = hash_funcs or {}
+    if func is None:
+        return lambda f: cache(
+            func=f,
+            hash_funcs=hash_funcs,
+            max_items=max_items,
+            policy=policy,
+            ttl=ttl,
+            to_disk=to_disk,
+            cache_path=cache_path
+        )
+    func_hash = None
+
+    lock = threading.RLock()
+
+    @functools.wraps(func)
+    def wrapped_func(*args, **kwargs):
+        nonlocal func_hash
+        # Handle param.depends method by adding parameters to arguments
+        func_name = func.__name__
+        is_method = (
+            args and isinstance(args[0], object) and
+            getattr(type(args[0]), func_name, None) is wrapped_func
+        )
+        hash_args, hash_kwargs = args, kwargs
+        if (is_method and isinstance(args[0], param.Parameterized)):
+            dinfo = getattr(wrapped_func, '_dinfo')
+            hash_args = tuple(getattr(args[0], d) for d in dinfo['dependencies']) + args[1:]
+            hash_kwargs = dict(dinfo['kw'], **kwargs)
+        hash_value = compute_hash(func, hash_funcs, hash_args, hash_kwargs)
+
+        time = _TIME_FN()
+
+        # If the function is defined inside a bokeh/panel application
+        # it is recreated for each session, therefore we cache by
+        # file, class and function name
+        fname = sys.modules[func.__module__].__file__
+        if is_method:
+            func_hash = (fname, type(args[0]).__name__, func.__name__)
+        else:
+            func_hash = (fname, func.__name__)
+        func_hash = hashlib.sha256(_generate_hash(func_hash)).hexdigest()
+
+        func_cache = state._memoize_cache.get(func_hash)
+
+        empty = func_cache is None
+        if empty:
+            if to_disk:
+                from diskcache import Index
+                cache = Index(os.path.join(cache_path, func_hash))
+            else:
+                cache = {}
+            state._memoize_cache[func_hash] = func_cache = cache
+
+        if ttl is not None:
+            _cleanup_ttl(func_cache, ttl, time)
+
+        if not empty and hash_value in func_cache:
+            with lock:
+                ret, ts, count, _ = func_cache[hash_value]
+                func_cache[hash_value] = (ret, ts, count+1, time)
+            return ret
+
+        if max_items is not None:
+            _cleanup_cache(func_cache, policy, max_items, time)
+
+        ret = func(*args, **kwargs)
+        with lock:
+            func_cache[hash_value] = (ret, time, 0, time)
+        return ret
+
+    def clear():
+        nonlocal func_hash
+        if func_hash is None:
+            return
+        if to_disk:
+            from diskcache import Index
+            cache = Index(os.path.join(cache_path, func_hash))
+            cache.clear()
+        else:
+            cache = state._memoize_cache.get(func_hash, {})
+            cache.clear()
+    wrapped_func.clear = clear
+
+    try:
+        wrapped_func.__dict__.update(func.__dict__)
+    except AttributeError:
+        pass
+
+    return wrapped_func
diff --git a/panel/io/state.py b/panel/io/state.py
index 6b30b17563..d81af53316 100644
--- a/panel/io/state.py
+++ b/panel/io/state.py
@@ -9,6 +9,7 @@
 import inspect
 import json
 import logging
+import shutil
 import sys
 import threading
 import time
@@ -111,6 +112,9 @@ class _state(param.Parameterized):
         The bokeh Document for which a server event is currently
         being processed.""")
 
+    _memoize_cache = param.Dict(default={}, doc="""
+       A dictionary used by the cache decorator.""")
+
     # Whether to hold comm events
     _hold: ClassVar[bool] = False
 
@@ -145,6 +149,9 @@ class _state(param.Parameterized):
     # Jupyter display handles
     _handles: ClassVar[Dict[str, [DisplayHandle, List[str]]]] = {}
 
+    # Stacks for hashing
+    _stacks = WeakKeyDictionary()
+
     # Dictionary of callbacks to be triggered on app load
     _onload: ClassVar[Dict[Document, Callable[[], None]]] = WeakKeyDictionary()
     _on_session_created: ClassVar[List[Callable[[BokehSessionContext], []]]] = []
@@ -260,6 +267,16 @@ def _destroy_session(self, session_context):
         if doc in self._templates:
             del self._templates[doc]
 
+    @property
+    def _current_stack(self):
+        current_thread = threading.current_thread()
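+        # Hash stacks are tracked per thread so that concurrent
+        # sessions hashing objects do not interfere with each
+        # other's cycle detection.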
stack = self._stacks.get(current_thread, None) + if stack is None: + from .cache import _Stack + stack = _Stack() + self._stacks[current_thread] = stack + return stack + def _get_callback(self, endpoint: str): _updating: Dict[int, bool] = {} def link(*events): @@ -450,6 +467,20 @@ def cancel_task(self, name: str, wait: bool=False): else: del self._scheduled[name] + def clear_caches(self): + """ + Clears caches generated by panel.io.cache function. + """ + for cache in self._memoize_cache.values(): + cache.clear() + if hasattr(cache, 'directory'): + cache.cache.close() + try: + shutil.rmtree(cache.directory) + except OSError: # Windows wonkiness + pass + self._memoize_cache.clear() + def execute(self, callback: Callable([], None)) -> None: """ Executes both synchronous and asynchronous callbacks diff --git a/panel/tests/conftest.py b/panel/tests/conftest.py index 54027cec1c..348270ba31 100644 --- a/panel/tests/conftest.py +++ b/panel/tests/conftest.py @@ -230,6 +230,10 @@ def server_cleanup(): state._thread_pool.shutdown(wait=False) state._thread_pool = None +@pytest.fixture(autouse=True) +def cache_cleanup(): + state.clear_caches() + @pytest.fixture def py_file(): tf = tempfile.NamedTemporaryFile(mode='w', suffix='.py') diff --git a/panel/tests/io/test_cache.py b/panel/tests/io/test_cache.py new file mode 100644 index 0000000000..4153021bee --- /dev/null +++ b/panel/tests/io/test_cache.py @@ -0,0 +1,226 @@ +import io +import pathlib +import time + +import numpy as np +import pytest + +from panel.io.cache import _find_hash_func, cache +from panel.tests.util import pd_available + +################ +# Test hashing # +################ + +def hashes_equal(v1, v2): + a, b = _find_hash_func(v1)(v1), _find_hash_func(v2)(v2) + return a == b + +def test_str_hash(): + assert hashes_equal('foo', 'foo') + assert not hashes_equal('foo', 'bar') + +def test_int_hash(): + assert hashes_equal(12, 12) + assert not hashes_equal(1, 2) + +def test_float_hash(): + assert hashes_equal(3.14, 3.14) + assert not hashes_equal(1.2, 3.14) + +def test_bool_hash(): + assert hashes_equal(True, True) + assert hashes_equal(False, False) + assert not hashes_equal(True, False) + +def test_none_hash(): + assert hashes_equal(None, None) + assert not hashes_equal(None, False) + +def test_bytes_hash(): + assert hashes_equal(b'0', b'0') + assert not hashes_equal(b'0', b'1') + +def test_list_hash(): + assert hashes_equal([0], [0]) + assert hashes_equal(['a', ['b']], ['a', ['b']]) + assert not hashes_equal([0], [1]) + assert not hashes_equal(['a', ['b']], ['a', ['c']]) + + # Recursion + l = [0] + l.append(l) + assert hashes_equal(l, list(l)) + +def test_tuple_hash(): + assert hashes_equal((0,), (0,)) + assert hashes_equal(('a', ('b',)), ('a', ('b',))) + assert not hashes_equal((0,), (1,)) + assert not hashes_equal(('a', ('b',)), ('a', ('c',))) + +def test_dict_hash(): + assert hashes_equal({'a': 0}, {'a': 0}) + assert hashes_equal({'a': {'b': 0}}, {'a': {'b': 0}}) + assert not hashes_equal({'a': 0}, {'b': 0}) + assert not hashes_equal({'a': 0}, {'a': 1}) + assert not hashes_equal({'a': {'b': 0}}, {'a': {'b': 1}}) + + # Recursion + d = {'a': {}} + d['a'] = d + assert hashes_equal(d, dict(d)) + +def test_stringio_hash(): + sio1, sio2 = io.StringIO(), io.StringIO() + sio1.write('foo') + sio2.write('foo') + sio1.seek(0) + sio2.seek(0) + assert hashes_equal(sio1, sio2) + sio3 = io.StringIO() + sio3.write('bar') + sio3.seek(0) + assert not hashes_equal(sio1, sio3) + +def test_bytesio_hash(): + bio1, bio2 = io.BytesIO(), io.BytesIO() + 
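+    # _io_hash covers both the buffer contents and the current seek
+    # position, so both buffers are rewound before being compared.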
bio1.write(b'foo') + bio2.write(b'foo') + bio1.seek(0) + bio2.seek(0) + assert hashes_equal(bio1, bio2) + bio3 = io.BytesIO() + bio3.write(b'bar') + bio3.seek(0) + assert not hashes_equal(bio1, bio3) + +def test_ndarray_hash(): + assert hashes_equal(np.array([0, 1, 2]), np.array([0, 1, 2])) + assert not hashes_equal( + np.array([0, 1, 2], dtype='uint32'), + np.array([0, 1, 2], dtype='float64') + ) + assert not hashes_equal( + np.array([0, 1, 2]), + np.array([2, 1, 0]) + ) + +@pd_available +def test_dataframe_hash(): + import pandas as pd + df1, df2 = pd._testing.makeMixedDataFrame(), pd._testing.makeMixedDataFrame() + assert hashes_equal(df1, df2) + df2['A'] = df2['A'].values[::-1] + assert not hashes_equal(df1, df2) + +@pd_available +def test_series_hash(): + import pandas as pd + series1 = pd._testing.makeStringSeries() + series2 = series1.copy() + assert hashes_equal(series1, series2) + series2.iloc[0] = 3.14 + assert not hashes_equal(series1, series2) + +def test_ufunc_hash(): + assert hashes_equal(np.absolute, np.absolute) + assert not hashes_equal(np.sin, np.cos) + +def test_builtin_hash(): + assert hashes_equal(max, max) + assert not hashes_equal(max, min) + +def test_module_hash(): + assert hashes_equal(np, np) + assert not hashes_equal(np, io) + +################ +# Test caching # +################ + +OFFSET = {} + +def function_with_args(a, b): + global OFFSET + offset = OFFSET.get((a, b), 0) + result = a + b + offset + OFFSET[(a, b)] = offset + 1 + return result + +def test_cache_with_args(): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args) + assert fn(0, 0) == 0 + assert fn(0, 0) == 0 + +def test_cache_with_kwargs(): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args) + assert fn(a=0, b=0) == 0 + assert fn(a=0, b=0) == 0 + +def test_cache_clear(): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args) + assert fn(0, 0) == 0 + fn.clear() + assert fn(0, 0) == 1 + +def test_disk_cache(): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args, to_disk=True) + + assert fn(0, 0) == 0 + assert pathlib.Path('./cache').exists() + assert list(pathlib.Path('./cache').glob('*')) + assert fn(0, 0) == 0 + fn.clear() + assert fn(0, 0) == 1 + +@pytest.mark.parametrize('to_disk', (True, False)) +def test_cache_lifo(to_disk): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args, max_items=2, policy='lifo', to_disk=to_disk) + assert fn(0, 0) == 0 + assert fn(0, 1) == 1 + assert fn(0, 0) == 0 + assert fn(0, 2) == 2 # (0, 0) should be evicted + assert fn(0, 0) == 1 + +@pytest.mark.parametrize('to_disk', (True, False)) +def test_cache_lfu(to_disk): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args, max_items=2, policy='lfu', to_disk=to_disk) + assert fn(0, 0) == 0 + assert fn(0, 0) == 0 + assert fn(0, 1) == 1 + assert fn(0, 2) == 2 # (0, 1) should be evicted + assert fn(0, 1) == 2 + +@pytest.mark.parametrize('to_disk', (True, False)) +def test_cache_lru(to_disk): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args, max_items=3, policy='lru', to_disk=to_disk) + assert fn(0, 0) == 0 + assert fn(0, 1) == 1 + assert fn(0, 2) == 2 + assert fn(0, 0) == 0 + assert fn(0, 3) == 3 # (0, 1) should be evicted + assert fn(0, 0) == 0 + assert fn(0, 1) == 2 + +@pytest.mark.parametrize('to_disk', (True, False)) +def test_cache_ttl(to_disk): + global OFFSET + OFFSET.clear() + fn = cache(function_with_args, ttl=0.1, to_disk=to_disk) + assert fn(0, 0) == 0 + time.sleep(0.11) + assert fn(0, 0) == 1 diff --git a/setup.py 
b/setup.py index 952d06a6e3..257e14cc4c 100644 --- a/setup.py +++ b/setup.py @@ -119,12 +119,16 @@ def run(self): ] _tests = [ + # Test dependencies 'flake8', 'parameterized', 'pytest', 'nbval', + 'flaky', + 'pytest-xdist', 'pytest-cov', 'pre-commit', + # Libraries tested in unit tests 'folium', 'ipympl', 'scipy', @@ -132,8 +136,7 @@ def run(self): 'pandas >=1.3', 'ipython >=7.0', 'holoviews', - 'flaky', - 'pytest-xdist', + 'diskcache', ] _ui = [
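Below is a minimal end-to-end sketch of how the new `pn.cache` decorator composes with `pn.bind`, as described in the user guide changes above. The widget, function body and parameter values are illustrative only and are not part of this patch:

```python
import panel as pn

pn.extension()

# Cache up to 10 results in memory, evicting least-recently-used
# entries and expiring entries after 300 seconds.
@pn.cache(max_items=10, policy='LRU', ttl=300)
def model(n):
    # Stand-in for an expensive computation
    return sum(i ** 2 for i in range(n))

slider = pn.widgets.IntSlider(name='n', start=1, end=10000, value=10)
pn.Row(slider, pn.bind(model, slider)).servable()
```

Passing `to_disk=True` (with `diskcache` installed) would additionally persist the cached results across server restarts.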