diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 9210f1f2878d40..70eb15392710cb 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * -from pandas.util.decorators import cache_readonly +from pandas.util._decorators import cache_readonly class DataFrameAttributes(object): diff --git a/doc/source/merging.rst b/doc/source/merging.rst index fb020727d077ed..170dde87c83638 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -13,7 +13,7 @@ import matplotlib.pyplot as plt plt.close('all') - import pandas.util.doctools as doctools + import pandas.util._doctools as doctools p = doctools.TablePlotter() diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 230c7c0b90ac0f..69863d2227f2c9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1238,10 +1238,10 @@ If indicated, a deprecation warning will be issued if you reference theses modul "pandas.types", "pandas.core.dtypes", "" "pandas.io.sas.saslib", "pandas.io.sas.libsas", "" "pandas._join", "pandas._libs.join", "" - "pandas._hash", "pandas.util.libhashing", "" + "pandas._hash", "pandas._libs.hashing", "" "pandas._period", "pandas._libs.period", "" "pandas._sparse", "pandas.core.sparse.libsparse", "" - "pandas._testing", "pandas.util.libtesting", "" + "pandas._testing", "pandas.util._testing", "" "pandas._window", "pandas.core.libwindow", "" @@ -1254,6 +1254,8 @@ these are now the public subpackages. - The function :func:`~pandas.api.types.union_categoricals` is now importable from ``pandas.api.types``, formerly from ``pandas.types.concat`` (:issue:`15998`) - The type import ``pandas.tslib.NaTType`` is deprecated and can be replaced by using ``type(pandas.NaT)`` (:issue:`16146`) +- The public functions in ``pandas.util.hashing`` are deprecated from that location, but are now importable from ``pandas.util`` (:issue:`16223`) +- The modules in ``pandas.util``: ``decorators``, ``print_versions``, ``doctools``, ``validators``, ``depr_module`` are now private (:issue:`16223`) .. _whatsnew_0200.privacy.errors: @@ -1278,7 +1280,7 @@ The following are now part of this API: 'UnsupportedFunctionCall'] -.. _whatsnew_0200.privay.testing: +.. _whatsnew_0200.privacy.testing: ``pandas.testing`` ^^^^^^^^^^^^^^^^^^ @@ -1292,14 +1294,13 @@ The following testing functions are now part of this API: - :func:`testing.assert_index_equal` -.. _whatsnew_0200.privay.plotting: +.. _whatsnew_0200.privacy.plotting: ``pandas.plotting`` ^^^^^^^^^^^^^^^^^^^ A new public ``pandas.plotting`` module has been added that holds plotting functionality that was previously in either ``pandas.tools.plotting`` or in the top-level namespace. See the :ref:`deprecations sections ` for more details. - .. 
_whatsnew_0200.privacy.development: Other Development Changes diff --git a/pandas/__init__.py b/pandas/__init__.py index 20c7e0d9d59939..48ac9d173559da 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -50,17 +50,17 @@ import pandas.tools.plotting plot_params = pandas.plotting._style._Options(deprecated=True) # do not import deprecate to top namespace -scatter_matrix = pandas.util.decorators.deprecate( +scatter_matrix = pandas.util._decorators.deprecate( 'pandas.scatter_matrix', pandas.plotting.scatter_matrix, 'pandas.plotting.scatter_matrix') -from pandas.util.print_versions import show_versions +from pandas.util._print_versions import show_versions from pandas.io.api import * from pandas.util._tester import test import pandas.testing # extension module deprecations -from pandas.util.depr_module import _DeprecatedModule +from pandas.util._depr_module import _DeprecatedModule json = _DeprecatedModule(deprmod='pandas.json', moved={'dumps': 'pandas.io.json.dumps', diff --git a/pandas/util/hashing.pyx b/pandas/_libs/hashing.pyx similarity index 100% rename from pandas/util/hashing.pyx rename to pandas/_libs/hashing.pyx diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index d707ac66c4eabd..a324bf94171ce5 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -19,8 +19,8 @@ """ from numpy import ndarray -from pandas.util.validators import (validate_args, validate_kwargs, - validate_args_and_kwargs) +from pandas.util._validators import (validate_args, validate_kwargs, + validate_args_and_kwargs) from pandas.errors import UnsupportedFunctionCall from pandas.core.dtypes.common import is_integer, is_bool from pandas.compat import OrderedDict diff --git a/pandas/core/api.py b/pandas/core/api.py index 3e84720c32a1c9..265fb4004d9973 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -35,7 +35,7 @@ from pandas.core.resample import TimeGrouper # see gh-14094. 
-from pandas.util.depr_module import _DeprecatedModule +from pandas.util._depr_module import _DeprecatedModule _removals = ['day', 'bday', 'businessDay', 'cday', 'customBusinessDay', 'customBusinessMonthEnd', 'customBusinessMonthBegin', diff --git a/pandas/core/base.py b/pandas/core/base.py index fd0846b0ad33c8..a3ef24c80f8839 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -9,14 +9,14 @@ from pandas.core.dtypes.missing import isnull from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar -from pandas.util.validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg from pandas.core import common as com import pandas.core.nanops as nanops import pandas._libs.lib as lib from pandas.compat.numpy import function as nv -from pandas.util.decorators import (Appender, cache_readonly, - deprecate_kwarg, Substitution) +from pandas.util._decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError _shared_docs = dict() diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a3667e9322959e..7eb86232cbb073 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -34,11 +34,11 @@ import pandas.core.common as com from pandas.core.missing import interpolate_2d from pandas.compat.numpy import function as nv -from pandas.util.decorators import (Appender, cache_readonly, - deprecate_kwarg, Substitution) +from pandas.util._decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) -from pandas.util.terminal import get_terminal_size -from pandas.util.validators import validate_bool_kwarg +from pandas.io.formats.terminal import get_terminal_size +from pandas.util._validators import validate_bool_kwarg from pandas.core.config import get_option diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 15e13025a7c53c..22e376306280ac 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -11,7 +11,7 @@ from pandas.core.computation.scope import _ensure_scope from pandas.compat import string_types from pandas.core.computation.engines import _engines -from pandas.util.validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg def _check_engine(engine): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 67966374fcf9ad..e6ea58e7e05be1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -80,8 +80,8 @@ OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import Appender, Substitution -from pandas.util.validators import validate_bool_kwarg +from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_bool_kwarg from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.datetimes import DatetimeIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2bc64795b5f208..27a489293db8ff 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -51,8 +51,8 @@ from pandas.compat import (map, zip, lzip, lrange, string_types, isidentifier, set_function_name) import pandas.core.nanops as nanops -from pandas.util.decorators import Appender, Substitution, deprecate_kwarg -from pandas.util.validators import validate_bool_kwarg +from pandas.util._decorators import Appender, 
Substitution, deprecate_kwarg +from pandas.util._validators import validate_bool_kwarg from pandas.core import config # goal is to be able to define the docs close to function, while still being @@ -1382,7 +1382,7 @@ def to_clipboard(self, excel=None, sep=None, **kwargs): - Windows: none - OS X: none """ - from pandas.io import clipboard + from pandas.io.clipboard import clipboard clipboard.to_clipboard(self, excel=excel, sep=sep, **kwargs) def to_xarray(self): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 479d2f7d26eb65..91b55c414b507d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -54,10 +54,10 @@ from pandas.core.sorting import (get_group_index_sorter, get_group_index, compress_group_index, get_flattened_iterator, decons_obs_group_ids, get_indexer_dict) -from pandas.util.decorators import (cache_readonly, Substitution, - Appender, make_signature) +from pandas.util._decorators import (cache_readonly, Substitution, + Appender, make_signature) from pandas.io.formats.printing import pprint_thing -from pandas.util.validators import validate_kwargs +from pandas.util._validators import validate_kwargs import pandas.core.algorithms as algorithms import pandas.core.common as com diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4345c74664bf5c..82f3bf3b154629 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -41,8 +41,8 @@ from pandas.core.base import PandasObject, IndexOpsMixin import pandas.core.base as base -from pandas.util.decorators import (Appender, Substitution, cache_readonly, - deprecate, deprecate_kwarg) +from pandas.util._decorators import (Appender, Substitution, cache_readonly, + deprecate, deprecate_kwarg) from pandas.core.indexes.frozen import FrozenList import pandas.core.common as com import pandas.core.dtypes.concat as _concat diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 760db4ba206757..395513d7b9b810 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -16,7 +16,7 @@ from pandas.core.algorithms import take_1d -from pandas.util.decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.core.config import get_option from pandas.core.indexes.base import Index, _index_shared_docs import pandas.core.base as base diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 387209ceb038f1..cd8559bcca03cb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -28,7 +28,7 @@ from pandas._libs.period import Period from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util.decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly import pandas.core.dtypes.concat as _concat import pandas.tseries.frequencies as frequencies diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index b0264759f2f8d3..ec678b1577d81b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -41,8 +41,8 @@ from pandas.core.tools.datetimes import ( parse_time_string, normalize_date, to_time) from pandas.core.tools.timedeltas import to_timedelta -from pandas.util.decorators import (Appender, cache_readonly, - deprecate_kwarg, Substitution) +from pandas.util._decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) import pandas.core.common as com import pandas.tseries.offsets 
as offsets import pandas.core.tools.datetimes as tools diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ccd0d8bee4abc2..039346cba56c8c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -28,7 +28,7 @@ from pandas.core.indexes.multi import MultiIndex from pandas.compat.numpy import function as nv from pandas.core import common as com -from pandas.util.decorators import cache_readonly, Appender +from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option import pandas.core.indexes.base as ibase diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c760d2943b823e..7ef037d8f3536f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -26,8 +26,8 @@ is_null_slice) import pandas.core.base as base -from pandas.util.decorators import (Appender, cache_readonly, - deprecate, deprecate_kwarg) +from pandas.util._decorators import (Appender, cache_readonly, + deprecate, deprecate_kwarg) import pandas.core.common as com import pandas.core.missing as missing import pandas.core.algorithms as algos @@ -718,7 +718,7 @@ def _inferred_type_levels(self): @cache_readonly def _hashed_values(self): """ return a uint64 ndarray of my hashed values """ - from pandas.util.hashing import hash_tuples + from pandas.core.util.hashing import hash_tuples return hash_tuples(self) def _hashed_indexing_key(self, key): @@ -740,7 +740,7 @@ def _hashed_indexing_key(self, key): we need to stringify if we have mixed levels """ - from pandas.util.hashing import hash_tuples + from pandas.core.util.hashing import hash_tuples if not isinstance(key, tuple): return hash_tuples(key) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 21ba2a386d96a3..bdae0ac7ac5e93 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -11,7 +11,7 @@ from pandas.core import algorithms from pandas.core.indexes.base import ( Index, InvalidIndexError, _index_shared_docs) -from pandas.util.decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly import pandas.core.indexes.base as ibase diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 378661a49e20dc..15fd9b7dc2b6a4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -40,8 +40,8 @@ from pandas.core.indexes.base import _index_shared_docs, _ensure_index from pandas import compat -from pandas.util.decorators import (Appender, Substitution, cache_readonly, - deprecate_kwarg) +from pandas.util._decorators import (Appender, Substitution, cache_readonly, + deprecate_kwarg) from pandas.compat import zip, u import pandas.core.indexes.base as ibase diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index acd040693af2e9..b7a8e0b54a1289 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -13,7 +13,7 @@ from pandas.compat import lrange, range from pandas.compat.numpy import function as nv from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util.decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly import pandas.core.indexes.base as ibase from pandas.core.indexes.numeric import Int64Index diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 1081787b2c0b02..ab94a5bffb4f94 100644 --- a/pandas/core/indexes/timedeltas.py +++ 
b/pandas/core/indexes/timedeltas.py @@ -27,7 +27,7 @@ from pandas.core.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.core.dtypes.concat as _concat -from pandas.util.decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg from pandas.core.indexes.datetimelike import TimelikeOps, DatetimeIndexOpsMixin from pandas.core.tools.timedeltas import ( to_timedelta, _coerce_scalar_to_timedelta_type) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 840206977cf301..15851a17274cac 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -64,8 +64,8 @@ from pandas._libs.lib import BlockPlacement import pandas.core.computation.expressions as expressions -from pandas.util.decorators import cache_readonly -from pandas.util.validators import validate_bool_kwarg +from pandas.util._decorators import cache_readonly +from pandas.util._validators import validate_bool_kwarg from pandas import compat, _np_version_under1p9 from pandas.compat import range, map, zip, u diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 41a17a0957cbf8..e7cfbdb0fc9c6f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -15,7 +15,7 @@ tslib as libts, algos as libalgos, iNaT) from pandas import compat -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import pandas.core.computation.expressions as expressions from pandas.compat import bind_method diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 39d2ebdeec3ace..d1f5b4587059cf 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -34,7 +34,7 @@ from pandas.core.ops import _op_descriptions from pandas.core.series import Series from pandas.core.reshape.util import cartesian_product -from pandas.util.decorators import (deprecate, Appender) +from pandas.util._decorators import (deprecate, Appender) _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', diff --git a/pandas/core/resample.py b/pandas/core/resample.py index cbb2f6a93c2fd3..631b91c3aad11e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -25,7 +25,7 @@ from pandas._libs.lib import Timestamp from pandas._libs.period import IncompatibleFrequency -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from pandas.core.generic import _shared_docs _shared_docs_kwargs = dict() diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1ca3786ecc1743..c55f4b5bf935f1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -34,7 +34,7 @@ from pandas.core.dtypes.missing import na_value_for_dtype from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) -from pandas.util.decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a3cf80d758b7b0..00b52e75f66fb3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -30,7 +30,7 @@ from pandas._libs import algos as _algos, reshape as _reshape from pandas.core.frame import _shared_docs -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from pandas.core.index import MultiIndex, _get_na_value diff --git a/pandas/core/series.py 
b/pandas/core/series.py index e5f1d91eedfec8..6ec163bbaa73da 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -60,7 +60,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexes.period import PeriodIndex from pandas import compat -from pandas.util.terminal import get_terminal_size +from pandas.io.formats.terminal import get_terminal_size from pandas.compat import zip, u, OrderedDict, StringIO from pandas.compat.numpy import function as nv @@ -70,8 +70,8 @@ import pandas.core.common as com import pandas.core.nanops as nanops import pandas.io.formats.format as fmt -from pandas.util.decorators import Appender, deprecate_kwarg, Substitution -from pandas.util.validators import validate_bool_kwarg +from pandas.util._decorators import Appender, deprecate_kwarg, Substitution +from pandas.util._validators import validate_bool_kwarg from pandas._libs import index as libindex, tslib as libts, lib, iNaT from pandas.core.config import get_option diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index ef3600266c0370..dc34edd607ffb3 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -35,7 +35,7 @@ import pandas.core.algorithms as algos import pandas.core.ops as ops import pandas.io.formats.printing as printing -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from pandas.core.indexes.base import _index_shared_docs diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 05c97fac4b53a2..378d4ed4514a9b 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -26,7 +26,7 @@ import pandas.core.generic as generic from pandas.core.sparse.series import SparseSeries, SparseArray from pandas.core.sparse.libsparse import BlockIndex, get_blocks -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import pandas.core.ops as ops diff --git a/pandas/core/sparse/list.py b/pandas/core/sparse/list.py index e69ad6d0ab7adc..e40cb30d37fda4 100644 --- a/pandas/core/sparse/list.py +++ b/pandas/core/sparse/list.py @@ -5,7 +5,7 @@ from pandas.core.dtypes.common import is_scalar from pandas.core.sparse.array import SparseArray -from pandas.util.validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg from pandas.core.sparse import libsparse as splib diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index a77bce8f06783a..52a07f340336e1 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -21,7 +21,7 @@ import pandas.core.common as com import pandas.core.ops as ops import pandas._libs.index as _index -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from pandas.core.sparse.array import ( make_sparse, _sparse_array_op, SparseArray, diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5082ac7f80fbfe..c57d7a93624900 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -17,7 +17,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import re import pandas._libs.lib as lib import warnings diff --git a/pandas/util/importing.py b/pandas/core/util/__init__.py similarity index 100% rename from pandas/util/importing.py rename to pandas/core/util/__init__.py diff --git 
a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py new file mode 100644 index 00000000000000..6a5343e8a8e252 --- /dev/null +++ b/pandas/core/util/hashing.py @@ -0,0 +1,282 @@ +""" +data hash pandas / numpy objects +""" +import itertools + +import numpy as np +from pandas._libs import hashing +from pandas._libs.lib import is_bool_array +from pandas.core.dtypes.generic import ( + ABCMultiIndex, + ABCIndexClass, + ABCSeries, + ABCDataFrame) +from pandas.core.dtypes.common import ( + is_categorical_dtype, is_numeric_dtype, + is_datetime64_dtype, is_timedelta64_dtype, + is_list_like) + +# 16 byte long hashing key +_default_hash_key = '0123456789123456' + + +def _combine_hash_arrays(arrays, num_items): + """ + Parameters + ---------- + arrays : generator + num_items : int + + Should be the same as CPython's tupleobject.c + """ + try: + first = next(arrays) + except StopIteration: + return np.array([], dtype=np.uint64) + + arrays = itertools.chain([first], arrays) + + mult = np.uint64(1000003) + out = np.zeros_like(first) + np.uint64(0x345678) + for i, a in enumerate(arrays): + inverse_i = num_items - i + out ^= a + out *= mult + mult += np.uint64(82520 + inverse_i + inverse_i) + assert i + 1 == num_items, 'Fed in wrong num_items' + out += np.uint64(97531) + return out + + +def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, + categorize=True): + """ + Return a data hash of the Index/Series/DataFrame + + .. versionadded:: 0.19.2 + + Parameters + ---------- + index : boolean, default True + include the index in the hash (if Series/DataFrame) + encoding : string, default 'utf8' + encoding for data & key when strings + hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 + + Returns + ------- + Series of uint64, same length as the object + + """ + from pandas import Series + if hash_key is None: + hash_key = _default_hash_key + + if isinstance(obj, ABCMultiIndex): + return Series(hash_tuples(obj, encoding, hash_key), + dtype='uint64', copy=False) + + if isinstance(obj, ABCIndexClass): + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64', copy=False) + h = Series(h, index=obj, dtype='uint64', copy=False) + elif isinstance(obj, ABCSeries): + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64', copy=False) + if index: + index_iter = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values + for _ in [None]) + arrays = itertools.chain([h], index_iter) + h = _combine_hash_arrays(arrays, 2) + + h = Series(h, index=obj.index, dtype='uint64', copy=False) + + elif isinstance(obj, ABCDataFrame): + hashes = (hash_array(series.values) for _, series in obj.iteritems()) + num_items = len(obj.columns) + if index: + index_hash_generator = (hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize).values # noqa + for _ in [None]) + num_items += 1 + hashes = itertools.chain(hashes, index_hash_generator) + h = _combine_hash_arrays(hashes, num_items) + + h = Series(h, index=obj.index, dtype='uint64', copy=False) + else: + raise TypeError("Unexpected type for hashing %s" % type(obj)) + return h + + +def hash_tuples(vals, encoding='utf8', hash_key=None): + """ + Hash an MultiIndex / list-of-tuples efficiently + + .. 
versionadded:: 0.20.0 + + Parameters + ---------- + vals : MultiIndex, list-of-tuples, or single tuple + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + ndarray of hashed values array + """ + + is_tuple = False + if isinstance(vals, tuple): + vals = [vals] + is_tuple = True + elif not is_list_like(vals): + raise TypeError("must be convertible to a list-of-tuples") + + from pandas import Categorical, MultiIndex + + if not isinstance(vals, ABCMultiIndex): + vals = MultiIndex.from_tuples(vals) + + # create a list-of-Categoricals + vals = [Categorical(vals.labels[level], + vals.levels[level], + ordered=False, + fastpath=True) + for level in range(vals.nlevels)] + + # hash the list-of-ndarrays + hashes = (_hash_categorical(cat, + encoding=encoding, + hash_key=hash_key) + for cat in vals) + h = _combine_hash_arrays(hashes, len(vals)) + if is_tuple: + h = h[0] + + return h + + +def _hash_categorical(c, encoding, hash_key): + """ + Hash a Categorical by hashing its categories, and then mapping the codes + to the hashes + + Parameters + ---------- + c : Categorical + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + ndarray of hashed values array, same size as len(c) + """ + hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construt the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH 15362 + + mask = c.isnull() + if len(hashed): + result = hashed.take(c.codes) + else: + result = np.zeros(len(mask), dtype='uint64') + + if mask.any(): + result[mask] = np.iinfo(np.uint64).max + + return result + + +def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): + """ + Given a 1d array, return an array of deterministic integers. + + .. versionadded:: 0.19.2 + + Parameters + ---------- + vals : ndarray, Categorical + encoding : string, default 'utf8' + encoding for data & key when strings + hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 + + Returns + ------- + 1d uint64 numpy array of hash values, same length as the vals + + """ + + if not hasattr(vals, 'dtype'): + raise TypeError("must pass a ndarray-like") + + if hash_key is None: + hash_key = _default_hash_key + + # For categoricals, we hash the categories, then remap the codes to the + # hash values. (This check is above the complex check so that we don't ask + # numpy if categorical is a subdtype of complex, as it will choke. + if is_categorical_dtype(vals.dtype): + return _hash_categorical(vals, encoding, hash_key) + + # we'll be working with everything as 64-bit values, so handle this + # 128-bit value early + if np.issubdtype(vals.dtype, np.complex128): + return hash_array(vals.real) + 23 * hash_array(vals.imag) + + # First, turn whatever array this is into unsigned 64-bit ints, if we can + # manage it. 
+ if is_bool_array(vals): + vals = vals.astype('u8') + elif (is_datetime64_dtype(vals) or + is_timedelta64_dtype(vals)): + vals = vals.view('i8').astype('u8', copy=False) + elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): + vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') + else: + # With repeated values, its MUCH faster to categorize object dtypes, + # then hash and rename categories. We allow skipping the categorization + # when the values are known/likely to be unique. + if categorize: + from pandas import factorize, Categorical, Index + codes, categories = factorize(vals, sort=False) + cat = Categorical(codes, Index(categories), + ordered=False, fastpath=True) + return _hash_categorical(cat, encoding, hash_key) + + try: + vals = hashing.hash_object_array(vals, hash_key, encoding) + except TypeError: + # we have mixed types + vals = hashing.hash_object_array(vals.astype(str).astype(object), + hash_key, encoding) + + # Then, redistribute these 64-bit ints within the space of 64-bit ints + vals ^= vals >> 30 + vals *= np.uint64(0xbf58476d1ce4e5b9) + vals ^= vals >> 27 + vals *= np.uint64(0x94d049bb133111eb) + vals ^= vals >> 31 + return vals diff --git a/pandas/core/window.py b/pandas/core/window.py index 6d8f12e982f12b..772e7422b27a95 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -37,8 +37,8 @@ from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import (Substitution, Appender, - cache_readonly) +from pandas.util._decorators import (Substitution, Appender, + cache_readonly) from pandas.core.generic import _shared_docs from textwrap import dedent diff --git a/pandas/io/api.py b/pandas/io/api.py index e312e7bc2f300f..7f0d3c3631f639 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -5,7 +5,7 @@ # flake8: noqa from pandas.io.parsers import read_csv, read_table, read_fwf -from pandas.io.clipboard import read_clipboard +from pandas.io.clipboard.clipboard import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, get_store, read_hdf from pandas.io.json import read_json diff --git a/pandas/util/clipboard/__init__.py b/pandas/io/clipboard/__init__.py similarity index 100% rename from pandas/util/clipboard/__init__.py rename to pandas/io/clipboard/__init__.py diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard/clipboard.py similarity index 97% rename from pandas/io/clipboard.py rename to pandas/io/clipboard/clipboard.py index 3c7ac528d83fdd..6252a02b0d63da 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard/clipboard.py @@ -26,7 +26,7 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover raise NotImplementedError( 'reading from clipboard only supports utf-8 encoding') - from pandas.util.clipboard import clipboard_get + from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_table text = clipboard_get() @@ -92,7 +92,7 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover if encoding is not None and encoding.lower().replace('-', '') != 'utf8': raise ValueError('clipboard only supports utf-8 encoding') - from pandas.util.clipboard import clipboard_set + from pandas.io.clipboard import clipboard_set if excel is None: excel = True diff --git a/pandas/util/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py similarity index 100% rename from pandas/util/clipboard/clipboards.py rename to 
pandas/io/clipboard/clipboards.py diff --git a/pandas/util/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py similarity index 100% rename from pandas/util/clipboard/exceptions.py rename to pandas/io/clipboard/exceptions.py diff --git a/pandas/util/clipboard/windows.py b/pandas/io/clipboard/windows.py similarity index 100% rename from pandas/util/clipboard/windows.py rename to pandas/io/clipboard/windows.py diff --git a/pandas/io/excel.py b/pandas/io/excel.py index fbb10ebdfc56de..f56b555163d5ab 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -29,7 +29,7 @@ import pandas.compat.openpyxl_compat as openpyxl_compat from warnings import warn from distutils.version import LooseVersion -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from textwrap import fill __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index 0e46b0073a53de..ab75e3fa253ced 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -4,7 +4,7 @@ import sys import locale -from pandas.util.terminal import get_terminal_size +from pandas.io.formats.terminal import get_terminal_size # ----------------------------------------------------------------------------- # Global formatting options diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 43b0b5fbeee906..65098bb2aa4045 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -30,7 +30,7 @@ from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, u, OrderedDict, unichr) -from pandas.util.terminal import get_terminal_size +from pandas.io.formats.terminal import get_terminal_size from pandas.core.config import get_option, set_option from pandas.io.common import _get_handle, UnicodeWriter, _expand_user from pandas.io.formats.printing import adjoin, justify, pprint_thing diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 71c61998be0924..eac82ddde23187 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -29,7 +29,7 @@ from pandas.core.generic import _shared_docs import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender try: import matplotlib.pyplot as plt from matplotlib import colors diff --git a/pandas/util/terminal.py b/pandas/io/formats/terminal.py similarity index 100% rename from pandas/util/terminal.py rename to pandas/io/formats/terminal.py diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 95b1394c88ac2c..d9e163cb50695e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -37,7 +37,7 @@ _NA_VALUES, _infer_compression) from pandas.core.tools import datetimes as tools -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import pandas._libs.lib as lib import pandas.io.libparsers as libparsers diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 76fc55154bc497..a43a5988a2ade8 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -14,7 +14,7 @@ from pandas import compat import struct import numpy as np -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import warnings _correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" 
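The renames above are mechanical, but any third-party code that reached into the old homes of these helpers (``pandas.util.terminal``, ``pandas.util.clipboard``, ``pandas.util.decorators``) will fail to import once this lands. A minimal compatibility sketch for downstream code that has to run on both layouts — the try/except shim below is illustrative and not part of this PR:

```python
# Hypothetical downstream compatibility shim (not part of this change):
# get_terminal_size moves from pandas.util.terminal to
# pandas.io.formats.terminal in this PR, so try the new home first
# and fall back to the old one on earlier pandas releases.
try:
    from pandas.io.formats.terminal import get_terminal_size  # pandas >= 0.20.0
except ImportError:
    from pandas.util.terminal import get_terminal_size  # older pandas

width, height = get_terminal_size()
print(width, height)
```

The same pattern covers the clipboard backends, which move from ``pandas.util.clipboard`` to ``pandas.io.clipboard``; the decorators and validators, by contrast, become private (``pandas.util._decorators``, ``pandas.util._validators``) and are not meant to be imported by external code at all.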
diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 691582629251a1..55cac83804cd90 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -27,7 +27,7 @@ from pandas import compat, to_timedelta, to_datetime, isnull, DatetimeIndex from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ zip, BytesIO -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender import pandas as pd from pandas.io.common import get_filepath_or_buffer, BaseIterator diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c3476d1443fc3a..e88979b14c8af2 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -9,7 +9,7 @@ import numpy as np -from pandas.util.decorators import cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.base import PandasObject from pandas.core.dtypes.common import ( is_list_like, @@ -25,7 +25,7 @@ from pandas.compat import range, lrange, map, zip, string_types import pandas.compat as compat from pandas.io.formats.printing import pprint_thing -from pandas.util.decorators import Appender +from pandas.util._decorators import Appender from pandas.plotting._compat import (_mpl_ge_1_3_1, _mpl_ge_1_5_0) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 93eceba9a3f026..20ada033c0f58a 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -4,7 +4,7 @@ import numpy as np -from pandas.util.decorators import deprecate_kwarg +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.missing import notnull from pandas.compat import range, lrange, lmap, zip from pandas.io.formats.printing import pprint_thing diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index f98ffa26e0c2bb..f6c3a08c6721ac 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -8,7 +8,7 @@ import numpy as np from pandas.core.dtypes.common import is_scalar from pandas.core.api import DataFrame, Series -from pandas.util.decorators import Substitution, Appender +from pandas.util._decorators import Substitution, Appender __all__ = ['rolling_count', 'rolling_max', 'rolling_min', 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index b9cd764c8704c7..b475d25eb5dac6 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -1,7 +1,7 @@ import numpy as np from pandas import compat -from pandas.util.decorators import cache_readonly +from pandas.util._decorators import cache_readonly import pandas.util.testing as tm import pandas as pd diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 3cea731cfd4404..648ff9387b5d87 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -29,7 +29,7 @@ import pandas.io.formats.printing as printing import pandas.util.testing as tm -from pandas.util.terminal import get_terminal_size +from pandas.io.formats.terminal import get_terminal_size from pandas.core.config import (set_option, get_option, option_context, reset_option) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index e9ffb2dca7ae5d..5ae4a18e8108c8 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -10,7 +10,7 @@ from pandas import get_option from pandas.util import testing as tm from pandas.util.testing import makeCustomDataframe as mkdf -from pandas.util.clipboard.exceptions 
import PyperclipException +from pandas.io.clipboard.exceptions import PyperclipException try: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 9a24e4ae2dad00..444e5b12f212fe 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -7,7 +7,7 @@ from pandas import DataFrame, Series from pandas.compat import zip, iteritems -from pandas.util.decorators import cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.api import is_list_like import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, diff --git a/pandas/tests/series/common.py b/pandas/tests/series/common.py index 613961e1c670f0..0c25dcb29c3b2a 100644 --- a/pandas/tests/series/common.py +++ b/pandas/tests/series/common.py @@ -1,4 +1,4 @@ -from pandas.util.decorators import cache_readonly +from pandas.util._decorators import cache_readonly import pandas.util.testing as tm import pandas as pd diff --git a/pandas/tests/util/__init__.py b/pandas/tests/util/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/pandas/tests/reshape/test_hashing.py b/pandas/tests/util/test_hashing.py similarity index 94% rename from pandas/tests/reshape/test_hashing.py rename to pandas/tests/util/test_hashing.py index 622768353dd50f..0d2122ff14221c 100644 --- a/pandas/tests/reshape/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,7 +5,8 @@ import pandas as pd from pandas import DataFrame, Series, Index, MultiIndex -from pandas.util.hashing import hash_array, hash_tuples, hash_pandas_object +from pandas.util import hash_array, hash_pandas_object +from pandas.core.util.hashing import hash_tuples import pandas.util.testing as tm @@ -267,3 +268,18 @@ def test_hash_collisions(self): result = hash_array(np.asarray(L, dtype=object), 'utf8') tm.assert_numpy_array_equal( result, np.concatenate([expected1, expected2], axis=0)) + + +def test_deprecation(): + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + from pandas.util.hashing import hash_pandas_object + obj = Series(list('abc')) + hash_pandas_object(obj, hash_key='9876543210123456') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + from pandas.util.hashing import hash_array + obj = np.array([1, 2, 3]) + hash_array(obj, hash_key='9876543210123456') diff --git a/pandas/tests/test_testing.py b/pandas/tests/util/test_testing.py similarity index 100% rename from pandas/tests/test_testing.py rename to pandas/tests/util/test_testing.py diff --git a/pandas/tests/test_util.py b/pandas/tests/util/test_util.py similarity index 98% rename from pandas/tests/test_util.py rename to pandas/tests/util/test_util.py index e9e04f76704f2d..65e2cf7006ca8d 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/util/test_util.py @@ -9,10 +9,10 @@ import pytest from pandas.compat import intern from pandas.util._move import move_into_mutable_buffer, BadMove, stolenbuf -from pandas.util.decorators import deprecate_kwarg -from pandas.util.validators import (validate_args, validate_kwargs, - validate_args_and_kwargs, - validate_bool_kwarg) +from pandas.util._decorators import deprecate_kwarg +from pandas.util._validators import (validate_args, validate_kwargs, + validate_args_and_kwargs, + validate_bool_kwarg) import pandas.util.testing as tm diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 06d70f14565184..dddf835424f67e 100644 --- a/pandas/tseries/frequencies.py +++ 
b/pandas/tseries/frequencies.py @@ -16,7 +16,7 @@ import pandas.core.algorithms as algos from pandas.core.algorithms import unique from pandas.tseries.offsets import DateOffset -from pandas.util.decorators import cache_readonly, deprecate_kwarg +from pandas.util._decorators import cache_readonly, deprecate_kwarg import pandas.tseries.offsets as offsets from pandas._libs import lib, tslib diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index e69de29bb2d1d6..e86af930fef7c3 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -0,0 +1,2 @@ +from pandas.core.util.hashing import hash_pandas_object, hash_array # noqa +from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa diff --git a/pandas/util/decorators.py b/pandas/util/_decorators.py similarity index 100% rename from pandas/util/decorators.py rename to pandas/util/_decorators.py diff --git a/pandas/util/depr_module.py b/pandas/util/_depr_module.py similarity index 100% rename from pandas/util/depr_module.py rename to pandas/util/_depr_module.py diff --git a/pandas/util/doctools.py b/pandas/util/_doctools.py similarity index 100% rename from pandas/util/doctools.py rename to pandas/util/_doctools.py diff --git a/pandas/util/print_versions.py b/pandas/util/_print_versions.py similarity index 100% rename from pandas/util/print_versions.py rename to pandas/util/_print_versions.py diff --git a/pandas/util/testing.pyx b/pandas/util/_testing.pyx similarity index 100% rename from pandas/util/testing.pyx rename to pandas/util/_testing.pyx diff --git a/pandas/util/validators.py b/pandas/util/_validators.py similarity index 100% rename from pandas/util/validators.py rename to pandas/util/_validators.py diff --git a/pandas/util/hashing.py b/pandas/util/hashing.py index 3046c62a03f48c..de2f91f9f6b4a0 100644 --- a/pandas/util/hashing.py +++ b/pandas/util/hashing.py @@ -1,278 +1,18 @@ -""" -data hash pandas / numpy objects -""" -import itertools +import warnings +import sys -import numpy as np -from pandas import Series, factorize, Categorical, Index, MultiIndex -from pandas.util import libhashing as _hash -from pandas._libs.lib import is_bool_array -from pandas.core.dtypes.generic import ( - ABCIndexClass, - ABCSeries, - ABCDataFrame) -from pandas.core.dtypes.common import ( - is_categorical_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_list_like) +m = sys.modules['pandas.util.hashing'] +for t in ['hash_pandas_object', 'hash_array']: -# 16 byte long hashing key -_default_hash_key = '0123456789123456' + def outer(t=t): + def wrapper(*args, **kwargs): + from pandas import util + warnings.warn("pandas.util.hashing is deprecated and will be " + "removed in a future version, import " + "from pandas.util", + FutureWarning, stacklevel=3) + return getattr(util, t)(*args, **kwargs) + return wrapper -def _combine_hash_arrays(arrays, num_items): - """ - Parameters - ---------- - arrays : generator - num_items : int - - Should be the same as CPython's tupleobject.c - """ - try: - first = next(arrays) - except StopIteration: - return np.array([], dtype=np.uint64) - - arrays = itertools.chain([first], arrays) - - mult = np.uint64(1000003) - out = np.zeros_like(first) + np.uint64(0x345678) - for i, a in enumerate(arrays): - inverse_i = num_items - i - out ^= a - out *= mult - mult += np.uint64(82520 + inverse_i + inverse_i) - assert i + 1 == num_items, 'Fed in wrong num_items' - out += np.uint64(97531) - return out - - -def hash_pandas_object(obj, index=True, encoding='utf8', 
hash_key=None, - categorize=True): - """ - Return a data hash of the Index/Series/DataFrame - - .. versionadded:: 0.19.2 - - Parameters - ---------- - index : boolean, default True - include the index in the hash (if Series/DataFrame) - encoding : string, default 'utf8' - encoding for data & key when strings - hash_key : string key to encode, default to _default_hash_key - categorize : bool, default True - Whether to first categorize object arrays before hashing. This is more - efficient when the array contains duplicate values. - - .. versionadded:: 0.20.0 - - Returns - ------- - Series of uint64, same length as the object - - """ - if hash_key is None: - hash_key = _default_hash_key - - if isinstance(obj, MultiIndex): - return Series(hash_tuples(obj, encoding, hash_key), - dtype='uint64', copy=False) - - if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) - h = Series(h, index=obj, dtype='uint64', copy=False) - elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) - if index: - index_iter = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values - for _ in [None]) - arrays = itertools.chain([h], index_iter) - h = _combine_hash_arrays(arrays, 2) - - h = Series(h, index=obj.index, dtype='uint64', copy=False) - - elif isinstance(obj, ABCDataFrame): - hashes = (hash_array(series.values) for _, series in obj.iteritems()) - num_items = len(obj.columns) - if index: - index_hash_generator = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values # noqa - for _ in [None]) - num_items += 1 - hashes = itertools.chain(hashes, index_hash_generator) - h = _combine_hash_arrays(hashes, num_items) - - h = Series(h, index=obj.index, dtype='uint64', copy=False) - else: - raise TypeError("Unexpected type for hashing %s" % type(obj)) - return h - - -def hash_tuples(vals, encoding='utf8', hash_key=None): - """ - Hash an MultiIndex / list-of-tuples efficiently - - .. 
versionadded:: 0.20.0 - - Parameters - ---------- - vals : MultiIndex, list-of-tuples, or single tuple - encoding : string, default 'utf8' - hash_key : string key to encode, default to _default_hash_key - - Returns - ------- - ndarray of hashed values array - """ - - is_tuple = False - if isinstance(vals, tuple): - vals = [vals] - is_tuple = True - elif not is_list_like(vals): - raise TypeError("must be convertible to a list-of-tuples") - - if not isinstance(vals, MultiIndex): - vals = MultiIndex.from_tuples(vals) - - # create a list-of-Categoricals - vals = [Categorical(vals.labels[level], - vals.levels[level], - ordered=False, - fastpath=True) - for level in range(vals.nlevels)] - - # hash the list-of-ndarrays - hashes = (_hash_categorical(cat, - encoding=encoding, - hash_key=hash_key) - for cat in vals) - h = _combine_hash_arrays(hashes, len(vals)) - if is_tuple: - h = h[0] - - return h - - -def _hash_categorical(c, encoding, hash_key): - """ - Hash a Categorical by hashing its categories, and then mapping the codes - to the hashes - - Parameters - ---------- - c : Categorical - encoding : string, default 'utf8' - hash_key : string key to encode, default to _default_hash_key - - Returns - ------- - ndarray of hashed values array, same size as len(c) - """ - hashed = hash_array(c.categories.values, encoding, hash_key, - categorize=False) - - # we have uint64, as we don't directly support missing values - # we don't want to use take_nd which will coerce to float - # instead, directly construt the result with a - # max(np.uint64) as the missing value indicator - # - # TODO: GH 15362 - - mask = c.isnull() - if len(hashed): - result = hashed.take(c.codes) - else: - result = np.zeros(len(mask), dtype='uint64') - - if mask.any(): - result[mask] = np.iinfo(np.uint64).max - - return result - - -def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): - """ - Given a 1d array, return an array of deterministic integers. - - .. versionadded:: 0.19.2 - - Parameters - ---------- - vals : ndarray, Categorical - encoding : string, default 'utf8' - encoding for data & key when strings - hash_key : string key to encode, default to _default_hash_key - categorize : bool, default True - Whether to first categorize object arrays before hashing. This is more - efficient when the array contains duplicate values. - - .. versionadded:: 0.20.0 - - Returns - ------- - 1d uint64 numpy array of hash values, same length as the vals - - """ - - if not hasattr(vals, 'dtype'): - raise TypeError("must pass a ndarray-like") - - if hash_key is None: - hash_key = _default_hash_key - - # For categoricals, we hash the categories, then remap the codes to the - # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. - if is_categorical_dtype(vals.dtype): - return _hash_categorical(vals, encoding, hash_key) - - # we'll be working with everything as 64-bit values, so handle this - # 128-bit value early - if np.issubdtype(vals.dtype, np.complex128): - return hash_array(vals.real) + 23 * hash_array(vals.imag) - - # First, turn whatever array this is into unsigned 64-bit ints, if we can - # manage it. 
- if is_bool_array(vals): - vals = vals.astype('u8') - elif (is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals)): - vals = vals.view('i8').astype('u8', copy=False) - elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): - vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') - else: - # With repeated values, its MUCH faster to categorize object dtypes, - # then hash and rename categories. We allow skipping the categorization - # when the values are known/likely to be unique. - if categorize: - codes, categories = factorize(vals, sort=False) - cat = Categorical(codes, Index(categories), - ordered=False, fastpath=True) - return _hash_categorical(cat, encoding, hash_key) - - try: - vals = _hash.hash_object_array(vals, hash_key, encoding) - except TypeError: - # we have mixed types - vals = _hash.hash_object_array(vals.astype(str).astype(object), - hash_key, encoding) - - # Then, redistribute these 64-bit ints within the space of 64-bit ints - vals ^= vals >> 30 - vals *= np.uint64(0xbf58476d1ce4e5b9) - vals ^= vals >> 27 - vals *= np.uint64(0x94d049bb133111eb) - vals ^= vals >> 31 - return vals + setattr(m, t, outer(t)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 354e11ce0133a5..307dbeb0665e41 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -48,7 +48,7 @@ Index, MultiIndex, Series, DataFrame, Panel, Panel4D) -from pandas.util import libtesting +from pandas.util import _testing from pandas.io.common import urlopen try: import pytest @@ -184,7 +184,7 @@ def assert_almost_equal(left, right, check_exact=False, else: obj = 'Input' assert_class_equal(left, right, obj=obj) - return libtesting.assert_almost_equal( + return _testing.assert_almost_equal( left, right, check_dtype=check_dtype, check_less_precise=check_less_precise, @@ -220,7 +220,7 @@ def _check_isinstance(left, right, cls): def assert_dict_equal(left, right, compare_keys=True): _check_isinstance(left, right, dict) - return libtesting.assert_dict_equal(left, right, compare_keys=compare_keys) + return _testing.assert_dict_equal(left, right, compare_keys=compare_keys) def randbool(size=(), p=0.5): @@ -937,10 +937,10 @@ def _get_ilevel_values(index, level): .format(obj, np.round(diff, 5)) raise_assert_detail(obj, msg, left, right) else: - libtesting.assert_almost_equal(left.values, right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, lobj=left, robj=right) + _testing.assert_almost_equal(left.values, right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, lobj=left, robj=right) # metadata comparison if check_names: @@ -1273,10 +1273,10 @@ def assert_series_equal(left, right, check_dtype=True, assert_index_equal(l, r, obj='{0}.index'.format(obj)) else: - libtesting.assert_almost_equal(left.get_values(), right.get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj='{0}'.format(obj)) + _testing.assert_almost_equal(left.get_values(), right.get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj='{0}'.format(obj)) # metadata comparison if check_names: diff --git a/setup.py b/setup.py index 6f3ddbe2ad9d0e..51aacf5e0b8455 100755 --- a/setup.py +++ b/setup.py @@ -116,7 +116,7 @@ def is_platform_mac(): 'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'], 'reshape': ['_libs/reshape_helper.pxi.in'], 'hashtable': ['_libs/hashtable_class_helper.pxi.in', - '_libs/hashtable_func_helper.pxi.in'], + '_libs/hashtable_func_helper.pxi.in'], 'index': 
['_libs/index_class_helper.pxi.in'], 'sparse': ['core/sparse/sparse_op_helper.pxi.in'], 'interval': ['_libs/intervaltree.pxi.in'] @@ -337,10 +337,10 @@ class CheckSDist(sdist_class): 'pandas/_libs/algos.pyx', 'pandas/_libs/join.pyx', 'pandas/_libs/interval.pyx', + 'pandas/_libs/hashing.pyx', 'pandas/core/window.pyx', 'pandas/core/sparse/sparse.pyx', - 'pandas/util/testing.pyx', - 'pandas/tools/hash.pyx', + 'pandas/util/_testing.pyx', 'pandas/io/parsers.pyx', 'pandas/io/sas/sas.pyx'] @@ -526,10 +526,10 @@ def pxd(name): 'core.sparse.libsparse': {'pyxfile': 'core/sparse/sparse', 'depends': (['pandas/core/sparse/sparse.pyx'] + _pxi_dep['sparse'])}, - 'util.libtesting': {'pyxfile': 'util/testing', - 'depends': ['pandas/util/testing.pyx']}, - 'util.libhashing': {'pyxfile': 'util/hashing', - 'depends': ['pandas/util/hashing.pyx']}, + 'util._testing': {'pyxfile': 'util/_testing', + 'depends': ['pandas/util/_testing.pyx']}, + '_libs.hashing': {'pyxfile': '_libs/hashing', + 'depends': ['pandas/_libs/hashing.pyx']}, 'io.sas.libsas': {'pyxfile': 'io/sas/sas'}, } @@ -645,6 +645,7 @@ def pxd(name): 'pandas.core.reshape', 'pandas.core.sparse', 'pandas.core.tools', + 'pandas.core.util', 'pandas.computation', 'pandas.errors', 'pandas.io', @@ -652,6 +653,7 @@ def pxd(name): 'pandas.io.sas', 'pandas.io.msgpack', 'pandas.io.formats', + 'pandas.io.clipboard', 'pandas._libs', 'pandas.plotting', 'pandas.stats', @@ -679,9 +681,9 @@ def pxd(name): 'pandas.tests.tseries', 'pandas.tests.plotting', 'pandas.tests.tools', + 'pandas.tests.util', 'pandas.tools', 'pandas.tseries', - 'pandas.util.clipboard' ], package_data={'pandas.tests': ['data/*.csv'], 'pandas.tests.indexes': ['data/*.pickle'],
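To round out the picture, here is a short sketch of how the hashing API is meant to be consumed after this reorganization: ``hash_pandas_object`` and ``hash_array`` are re-exported from ``pandas.util`` (see the new ``pandas/util/__init__.py`` above), while the legacy ``pandas.util.hashing`` module survives only as a shim whose functions emit a ``FutureWarning`` when called. The snippet assumes a pandas build containing this change; the sample data and key are illustrative only.

```python
import warnings

import numpy as np
import pandas as pd
from pandas.util import hash_array, hash_pandas_object  # new public entry points

s = pd.Series(list('abc'))

# One uint64 hash per row; index=True also folds the index into each hash.
print(hash_pandas_object(s, index=True))

# hash_array takes a bare 1d ndarray and an optional 16-byte hash_key
# ('0123456789123456' is the module default).
print(hash_array(np.array([1, 2, 3]), hash_key='0123456789123456'))

# The legacy module still imports, but calling through it now warns.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    from pandas.util.hashing import hash_pandas_object as legacy_hash
    legacy_hash(s)
assert any(issubclass(w.category, FutureWarning) for w in caught)
```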