From baeb1bf92b763861212d0dee951bd8ea658deadb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Jun 2019 12:57:41 -0500 Subject: [PATCH 01/34] BUG: modf(SparseArray) (#26947) Closes #26946 (cherry picked from commit 430f664ddbb4dab542b34b2c75b6d086fdef4934) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/sparse.py | 11 +++++++++++ pandas/tests/arrays/sparse/test_array.py | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b458b0f998255..e6bc422b52e89 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -769,6 +769,7 @@ Sparse - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) +- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned. Other ^^^^^ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 5e636b5105e56..3dda6868a80da 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1697,6 +1697,17 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # No alignment necessary. sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) + + if isinstance(sp_values, tuple): + # multiple outputs. e.g. 
modf + arrays = tuple( + self._simple_new(sp_value, + self.sp_index, + SparseDtype(sp_value.dtype, fv)) + for sp_value, fv in zip(sp_values, fill_value) + ) + return arrays + return self._simple_new(sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c0a1b32079044..231b5a92dbb3a 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1071,6 +1071,16 @@ def test_ufunc_args(self): result = SparseArray([2, 0, 1, -1], fill_value=1) tm.assert_sp_array_equal(np.add(sparse, 1), result) + @pytest.mark.parametrize('fill_value', [0.0, np.nan]) + def test_modf(self, fill_value): + # https://github.com/pandas-dev/pandas/issues/26946 + sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], + fill_value=fill_value) + r1, r2 = np.modf(sparse) + e1, e2 = np.modf(np.asarray(sparse)) + tm.assert_sp_array_equal(r1, pd.SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) + def test_nbytes_integer(self): arr = SparseArray([1, 0, 0, 0, 2], kind='integer') result = arr.nbytes From d150f17384f53c3269189daccf2276d1aded7936 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 20 Jun 2019 08:58:52 +0200 Subject: [PATCH 02/34] TST: fix class method of test BoolArray (#26957) --- pandas/tests/extension/arrow/bool.py | 1 + pandas/tests/extension/arrow/test_bool.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index 435ea4e3ec2b5..2263f53544e41 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/bool.py @@ -114,6 +114,7 @@ def copy(self, deep=False): else: return type(self)(copy.copy(self._data)) + @classmethod def _concat_same_type(cls, to_concat): chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) 
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 01163064b0918..a7f28310b7554 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -36,8 +36,7 @@ def test_array_type_with_arg(self, data, dtype): class TestInterface(BaseArrowTests, base.BaseInterfaceTests): - def test_repr(self, data): - raise pytest.skip("TODO") + pass class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): From cfd65e98e694b2ad40e97d06ffdd9096a3dea909 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 06:48:13 -0500 Subject: [PATCH 03/34] TST: Fix flaky import test (#26953) * TST: Fix flaky import test I'm not sure what, but the missing dependency test is causing issues. Now we check that things work by running it in a subprocess with site-packages disabled. Closes https://github.com/pandas-dev/pandas/issues/26952 --- pandas/tests/test_downstream.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 14d3ee5ac4fe2..9fe8b0f9563ef 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -1,7 +1,6 @@ """ Testing that we work in the downstream packages """ -import builtins import importlib import subprocess import sys @@ -134,30 +133,13 @@ def test_pyarrow(df): tm.assert_frame_equal(result, df) -def test_missing_required_dependency(monkeypatch): +def test_missing_required_dependency(): # GH 23868 - original_import = __import__ - - def mock_import_fail(name, *args, **kwargs): - if name == "numpy": - raise ImportError("cannot import name numpy") - elif name == "pytz": - raise ImportError("cannot import name some_dependency") - elif name == "dateutil": - raise ImportError("cannot import name some_other_dependency") - else: - return original_import(name, *args, **kwargs) - - expected_msg = ( - "Unable to import required 
dependencies:" - "\nnumpy: cannot import name numpy" - "\npytz: cannot import name some_dependency" - "\ndateutil: cannot import name some_other_dependency" - ) - - import pandas as pd - - with monkeypatch.context() as m: - m.setattr(builtins, "__import__", mock_import_fail) - with pytest.raises(ImportError, match=expected_msg): - importlib.reload(pd) + # use the -S flag to disable site-packages + call = ['python', '-S', '-c', 'import pandas'] + + with pytest.raises(subprocess.CalledProcessError) as exc: + subprocess.check_output(call, stderr=subprocess.STDOUT) + + output = exc.value.stdout.decode() + assert all(x in output for x in ['numpy', 'pytz', 'dateutil']) From a4a18a9a694ba2641ec3ba98afc20615b2d39ad7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 20 Jun 2019 17:51:18 -0700 Subject: [PATCH 04/34] Assorted cleanups (#26975) --- pandas/core/internals/managers.py | 19 ------------------- pandas/io/formats/format.py | 2 +- pandas/io/sql.py | 2 +- pandas/tests/frame/test_constructors.py | 5 +++-- pandas/tests/frame/test_missing.py | 15 ++++++++++----- 5 files changed, 15 insertions(+), 28 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 907498c7ff350..7fe34279c0482 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -23,7 +23,6 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import _maybe_to_sparse from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.indexing import maybe_convert_indices @@ -1727,10 +1726,6 @@ def form_blocks(arrays, names, axes): object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) blocks.extend(object_blocks) - if len(items_dict['SparseBlock']) > 0: - sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) - blocks.extend(sparse_blocks) - if len(items_dict['CategoricalBlock']) > 0: 
cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) for i, _, array in items_dict['CategoricalBlock']] @@ -1797,20 +1792,6 @@ def _multi_blockify(tuples, dtype=None): return new_blocks -def _sparse_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes (and - are sparse) - """ - - new_blocks = [] - for i, names, array in tuples: - array = _maybe_to_sparse(array) - block = make_block(array, placement=[i]) - new_blocks.append(block) - - return new_blocks - - def _stack_arrays(tuples, dtype): # fml diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8655fb05f34e2..b2ef45b15e549 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1567,7 +1567,7 @@ def __call__(self, num): formatted = format_str.format(mant=mant, prefix=prefix) - return formatted # .strip() + return formatted def set_eng_float_format(accuracy=3, use_eng_prefix=False): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 1e3fe2ade6ab7..6cb57077be76a 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -623,7 +623,7 @@ def insert_data(self): # GH 9086: Ensure we return datetimes with timezone info # Need to return 2-D data; DatetimeIndex is 1D d = b.values.to_pydatetime() - d = np.expand_dims(d, axis=0) + d = np.atleast_2d(d) else: # convert to microsecond resolution for datetime.datetime d = b.values.astype('M8[us]').astype(object) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 68017786eb6a6..7dc74961a2adc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -5,6 +5,7 @@ import numpy as np import numpy.ma as ma +import numpy.ma.mrecords as mrecords import pytest from pandas.compat import PY36, is_platform_little_endian @@ -839,7 +840,7 @@ def test_constructor_maskedrecarray_dtype(self): data = np.ma.array( np.ma.zeros(5, dtype=[('date', ' Date: Thu, 20 Jun 2019 20:11:55 -0500 
Subject: [PATCH 05/34] Surface NumPy FutureWarning about comparisons (#26966) --- pandas/core/indexes/base.py | 9 ++------- pandas/tests/indexes/test_numpy_compat.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 68faa3eb3e883..73abd708415a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -76,13 +76,8 @@ def cmp_method(self, other): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) else: - - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(record=True): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all='ignore'): - result = op(self.values, np.asarray(other)) + with np.errstate(all='ignore'): + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 460faaaf092ec..349d10f5079e8 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -80,3 +80,16 @@ def test_numpy_ufuncs_other(indices, func): else: with pytest.raises(Exception): func(idx) + + +def test_elementwise_comparison_warning(): + # https://github.com/pandas-dev/pandas/issues/22698#issuecomment-458968300 + # np.array([1, 2]) == 'a' returns False, and produces a + # FutureWarning that it'll be [False, False] in the future. + # We just want to ensure that comes through. + # When NumPy dev actually enforces this change, we'll need to skip + # this test. 
+ idx = Index([1, 2]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + idx == 'a' From 58cbf81f472932d5190a88141fd2e8079fa6b021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Fri, 21 Jun 2019 04:19:27 +0300 Subject: [PATCH 06/34] BUG: Fix skiplist init error with empty window (#26940) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 9 +++++++++ pandas/tests/test_window.py | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e6bc422b52e89..8767a0c2d5ea1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -742,6 +742,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) +- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 48b554ca02a9d..3305fea06f003 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1099,6 +1099,10 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, use_mock=False) output = np.empty(N, dtype=float) + if win == 0: + output[:] = NaN + return output + sl = skiplist_init(win) if sl == NULL: raise MemoryError("skiplist_init failed") @@ -1486,6 +1490,11 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) + + if win == 0: + output[:] = NaN + return output + skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index bc6946cbade4c..31baf4475214f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -608,6 +608,17 @@ def tests_empty_df_rolling(self, roller): result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() tm.assert_frame_equal(result, expected) + def test_empty_window_median_quantile(self): + # GH 26005 + expected = pd.Series([np.nan, np.nan, np.nan]) + roll = pd.Series(np.arange(3)).rolling(0) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.1) + tm.assert_series_equal(result, expected) + def test_missing_minp_zero(self): # https://github.com/pandas-dev/pandas/pull/18921 # minp=0 From fa92585678c0d80a484f5a6e1b561106002fef78 Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Thu, 20 Jun 2019 21:48:23 -0400 Subject: [PATCH 07/34] Add type hint for (core.arrays).ranges (#26936) --- pandas/core/arrays/_ranges.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 
deletions(-) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 4fbb8ae9f9aee..7a83b7960a6e7 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -3,14 +3,19 @@ (and possibly TimedeltaArray/PeriodArray) """ +from typing import Tuple + import numpy as np from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp -from pandas.tseries.offsets import Tick, generate_range +from pandas.tseries.offsets import DateOffset, Tick, generate_range -def generate_regular_range(start, end, periods, freq): +def generate_regular_range(start: Timestamp, + end: Timestamp, + periods: int, + freq: DateOffset) -> Tuple[np.ndarray, str]: """ Generate a range of dates with the spans between dates described by the given `freq` DateOffset. @@ -79,7 +84,10 @@ def generate_regular_range(start, end, periods, freq): return values, tz -def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): +def _generate_range_overflow_safe(endpoint: int, + periods: int, + stride: int, + side: str = 'start') -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -146,7 +154,10 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): return _generate_range_overflow_safe(midpoint, remaining, stride, side) -def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): +def _generate_range_overflow_safe_signed(endpoint: int, + periods: int, + stride: int, + side: str) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. 
From 7f8dd723a594b3b8ea03d6b87d7b031699ba9250 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 20:48:36 -0500 Subject: [PATCH 08/34] CLN: Deduplicate show_versions (#26816) --- doc/source/install.rst | 1 + pandas/compat/_optional.py | 1 + pandas/io/pytables.py | 7 +- pandas/tests/io/test_pytables_missing.py | 14 ++++ pandas/util/_print_versions.py | 102 ++++++++++------------- pandas/util/_test_decorators.py | 17 ++++ 6 files changed, 81 insertions(+), 61 deletions(-) create mode 100644 pandas/tests/io/test_pytables_missing.py diff --git a/doc/source/install.rst b/doc/source/install.rst index 1c1f0c1d4cf8e..ee4b36f898e31 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -286,6 +286,7 @@ psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading +pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 4a7b8c4e88649..875edb3d3f1dd 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -19,6 +19,7 @@ "s3fs": "0.0.8", "scipy": "0.19.0", "sqlalchemy": "1.1.4", + "tables": "3.4.2", "xarray": "0.8.2", "xlrd": "1.1.0", "xlwt": "1.2.0", diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 983b1286eec91..79d6d8563a162 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -19,6 +19,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones +from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( @@ -448,11 +449,7 @@ def __init__(self, path, mode=None, complevel=None, complib=None, if 'format' in kwargs: raise ValueError('format is not a defined argument for HDFStore') - try: - import tables # noqa - except 
ImportError as ex: # pragma: no cover - raise ImportError('HDFStore requires PyTables, "{ex!s}" problem ' - 'importing'.format(ex=ex)) + tables = import_optional_dependency("tables") if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( diff --git a/pandas/tests/io/test_pytables_missing.py b/pandas/tests/io/test_pytables_missing.py new file mode 100644 index 0000000000000..4ceb80889c989 --- /dev/null +++ b/pandas/tests/io/test_pytables_missing.py @@ -0,0 +1,14 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas.util.testing as tm + + +@td.skip_if_installed("tables") +def test_pytables_raises(): + df = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(ImportError, match="tables"): + with tm.ensure_clean("foo.h5") as path: + df.to_hdf(path, "df") diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index a5c86c2cc80b3..5e2e013c4afcc 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -1,5 +1,4 @@ import codecs -import importlib import locale import os import platform @@ -7,6 +6,9 @@ import subprocess import sys +from pandas.compat._optional import ( + VERSIONS, _get_version, import_optional_dependency) + def get_sys_info(): "Returns system information as a dict" @@ -58,60 +60,49 @@ def get_sys_info(): def show_versions(as_json=False): sys_info = get_sys_info() - deps = [ - # (MODULE_NAME, f(mod) -> mod version) - ("pandas", lambda mod: mod.__version__), - ("pytest", lambda mod: mod.__version__), - ("pip", lambda mod: mod.__version__), - ("setuptools", lambda mod: mod.__version__), - ("Cython", lambda mod: mod.__version__), - ("numpy", lambda mod: mod.version.version), - ("scipy", lambda mod: mod.version.version), - ("pyarrow", lambda mod: mod.__version__), - ("xarray", lambda mod: mod.__version__), - ("IPython", lambda mod: mod.__version__), - ("sphinx", lambda mod: mod.__version__), - ("patsy", lambda mod: mod.__version__), - 
("dateutil", lambda mod: mod.__version__), - ("pytz", lambda mod: mod.VERSION), - ("blosc", lambda mod: mod.__version__), - ("bottleneck", lambda mod: mod.__version__), - ("tables", lambda mod: mod.__version__), - ("numexpr", lambda mod: mod.__version__), - ("feather", lambda mod: mod.__version__), - ("matplotlib", lambda mod: mod.__version__), - ("openpyxl", lambda mod: mod.__version__), - ("xlrd", lambda mod: mod.__VERSION__), - ("xlwt", lambda mod: mod.__VERSION__), - ("xlsxwriter", lambda mod: mod.__version__), - ("lxml.etree", lambda mod: mod.__version__), - ("bs4", lambda mod: mod.__version__), - ("html5lib", lambda mod: mod.__version__), - ("sqlalchemy", lambda mod: mod.__version__), - ("pymysql", lambda mod: mod.__version__), - ("psycopg2", lambda mod: mod.__version__), - ("jinja2", lambda mod: mod.__version__), - ("s3fs", lambda mod: mod.__version__), - ("fastparquet", lambda mod: mod.__version__), - ("pandas_gbq", lambda mod: mod.__version__), - ("pandas_datareader", lambda mod: mod.__version__), - ("gcsfs", lambda mod: mod.__version__), + 'pandas', + # required + 'numpy', + 'pytz', + 'dateutil', + # install / build, + 'pip', + 'setuptools', + 'Cython', + # test + 'pytest', + 'hypothesis', + # docs + "sphinx", + # Other, need a min version + "blosc", + "feather", + "xlsxwriter", + "lxml.etree", + "html5lib", + "pymysql", + "psycopg2", + "jinja2", + # Other, not imported. 
+ "IPython", + "pandas_datareader", ] - deps_blob = list() - for (modname, ver_f) in deps: - try: - if modname in sys.modules: - mod = sys.modules[modname] - else: - mod = importlib.import_module(modname) - ver = ver_f(mod) - deps_blob.append((modname, ver)) - except ImportError: - deps_blob.append((modname, None)) + deps.extend(list(VERSIONS)) + deps_blob = [] - if (as_json): + for modname in deps: + mod = import_optional_dependency(modname, + raise_on_missing=False, + on_version="ignore") + if mod: + ver = _get_version(mod) + else: + ver = None + deps_blob.append((modname, ver)) + + if as_json: try: import json except ImportError: @@ -126,16 +117,15 @@ def show_versions(as_json=False): json.dump(j, f, indent=2) else: - + maxlen = max(len(x) for x in deps) + tpl = '{{k:<{maxlen}}}: {{stat}}'.format(maxlen=maxlen) print("\nINSTALLED VERSIONS") print("------------------") - for k, stat in sys_info: - print("{k}: {stat}".format(k=k, stat=stat)) - + print(tpl.format(k=k, stat=stat)) print("") for k, stat in deps_blob: - print("{k}: {stat}".format(k=k, stat=stat)) + print(tpl.format(k=k, stat=stat)) def main(): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0cb82c0028c90..fd9c9d07a974e 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -100,6 +100,23 @@ def _skip_if_no_scipy(): safe_import('scipy.signal')) +def skip_if_installed( + package: str, +) -> MarkDecorator: + """ + Skip a test if a package is installed. + + Parameters + ---------- + package : str + The name of the package. 
+ """ + return pytest.mark.skipif( + safe_import(package), + reason="Skipping because {} is installed.".format(package) + ) + + def skip_if_no( package: str, min_version: Optional[str] = None From 388d22c3d1e6804dbc1390e41db1d7277b1d8c66 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 20 Jun 2019 19:01:59 -0700 Subject: [PATCH 09/34] BUG: avoid overflow in Bday generate_range, closes #24252 (#26651) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 9 +++++++++ pandas/tests/arithmetic/test_timedelta64.py | 9 +++++---- pandas/tests/indexes/datetimes/test_date_range.py | 13 +++++++++++++ pandas/tests/scalar/timestamp/test_timestamp.py | 7 +++++++ pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tseries/offsets.py | 14 +++++++++++++- 7 files changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8767a0c2d5ea1..a6b74865f6619 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -600,6 +600,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. 
(:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) +- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 04bb4454462a7..0a3f4ed3cc91d 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -275,6 +275,10 @@ cdef convert_to_tsobject(object ts, object tz, object unit, - iso8601 string object - python datetime object - another timestamp object + + Raises + ------ + OutOfBoundsDatetime : ts cannot be converted within implementation bounds """ cdef: _TSObject obj @@ -294,6 +298,11 @@ cdef convert_to_tsobject(object ts, object tz, object unit, if obj.value != NPY_NAT: dt64_to_dtstruct(obj.value, &obj.dts) elif is_integer_object(ts): + try: + ts = ts + except OverflowError: + # GH#26651 re-raise as OutOfBoundsDatetime + raise OutOfBoundsDatetime(ts) if ts == NPY_NAT: obj.value = NPY_NAT else: diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ead9876e7c2a8..2dff9a6088de8 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -5,7 +5,8 @@ import numpy as np import pytest -from pandas.errors import NullFrequencyError, PerformanceWarning +from pandas.errors import ( + NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning) import pandas as pd from pandas import ( @@ -479,10 +480,10 @@ def test_tdi_add_timestamp_nat_masking(self): def test_tdi_add_overflow(self): # See GH#14068 - msg = "too (big|large) to convert" - with pytest.raises(OverflowError, match=msg): + # preliminary test scalar analogue of vectorized 
tests below + with pytest.raises(OutOfBoundsDatetime): pd.to_timedelta(106580, 'D') + Timestamp('2000') - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsDatetime): Timestamp('2000') + pd.to_timedelta(106580, 'D') _NaT = int(pd.NaT) + 1 diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 7f03793d880b0..1545cc52eb1f4 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -740,6 +740,19 @@ def test_bdays_and_open_boundaries(self, closed): expected = pd.date_range(bday_start, bday_end, freq='D') tm.assert_index_equal(result, expected) + def test_bday_near_overflow(self): + # GH#24252 avoid doing unnecessary addition that _would_ overflow + start = pd.Timestamp.max.floor("D").to_pydatetime() + rng = pd.date_range(start, end=None, periods=1, freq='B') + expected = pd.DatetimeIndex([start], freq='B') + tm.assert_index_equal(rng, expected) + + def test_bday_overflow_error(self): + # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError + start = pd.Timestamp.max.floor("D").to_pydatetime() + with pytest.raises(OutOfBoundsDatetime): + pd.date_range(start, periods=2, freq='B') + class TestCustomDateRange: diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 773b4e6f21a19..4b6b0dac916c6 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -463,6 +463,13 @@ def test_invalid_date_kwarg_with_string_input(self, arg): with pytest.raises(ValueError): Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + def test_out_of_bounds_integer_value(self): + # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.min.value * 2) + 
def test_out_of_bounds_value(self): one_us = np.timedelta64(1).astype('timedelta64[us]') diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 8c8a2f75c4a47..a1ad792e57bde 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -115,7 +115,7 @@ def test_apply_out_of_range(self, tz_naive_fixture): assert t.tzinfo == result.tzinfo except OutOfBoundsDatetime: - raise + pass except (ValueError, KeyError): # we are creating an invalid offset # so ignore diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index c1764b3845fce..00837d36d9508 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -97,6 +97,8 @@ def wrapper(self, other): if tz is not None and result.tzinfo is None: result = conversion.localize_pydatetime(result, tz) + result = Timestamp(result) + return result return wrapper @@ -2330,7 +2332,7 @@ def apply(self, other): # an exception, when we call using the + operator, # we directly call the known method result = other.__add__(self) - if result == NotImplemented: + if result is NotImplemented: raise OverflowError return result elif isinstance(other, (datetime, np.datetime64, date)): @@ -2467,6 +2469,11 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): while cur <= end: yield cur + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: @@ -2477,6 +2484,11 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): while cur >= end: yield cur + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: From 984514ef76166be37b19a6166c1868fa7d98f904 Mon Sep 17 00:00:00 2001 From: Christopher 
Whelan Date: Fri, 21 Jun 2019 04:03:02 +0200 Subject: [PATCH 10/34] BENCH: fix noisy asv benchmarks that were running on exhausted generators (#26772) --- asv_bench/benchmarks/ctors.py | 7 +++++++ asv_bench/benchmarks/frame_ctor.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 1c6841a296377..42adede631a01 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -55,7 +55,14 @@ class SeriesConstructors: [False, True], ['float', 'int']] + # Generators get exhausted on use, so run setup before every call + number = 1 + repeat = (3, 250, 10) + def setup(self, data_fmt, with_index, dtype): + if data_fmt in (gen_of_str, gen_of_tuples) and with_index: + raise NotImplementedError('Series constructors do not support ' + 'using generators with indexes') N = 10**4 if dtype == 'float': arr = np.random.randn(N) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 19c2a913e8494..9533938b30fac 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -72,6 +72,10 @@ class FromRecords: params = [None, 1000] param_names = ['nrows'] + # Generators get exhausted on use, so run setup before every call + number = 1 + repeat = (3, 250, 10) + def setup(self, nrows): N = 100000 self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) From 4850b287b4134885d0ca8f63650326d3525e274c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 21:06:34 -0500 Subject: [PATCH 11/34] Fix matplotlib converter registering warning (#26770) --- pandas/plotting/_core.py | 17 ++++----- pandas/plotting/_matplotlib/__init__.py | 6 ++++ pandas/plotting/_matplotlib/boxplot.py | 6 +++- pandas/plotting/_matplotlib/core.py | 7 ++-- pandas/plotting/_matplotlib/hist.py | 5 ++- pandas/plotting/_matplotlib/misc.py | 8 ++++- pandas/plotting/_matplotlib/style.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 4 +-- 
pandas/plotting/_matplotlib/tools.py | 3 +- pandas/tests/plotting/test_converter.py | 25 ++++++++++++-- pandas/tests/plotting/test_datetimelike.py | 40 ++++------------------ pandas/util/_test_decorators.py | 2 +- 12 files changed, 69 insertions(+), 56 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 81f5b5cb0f74c..78c7082c69b6b 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -5,19 +5,16 @@ from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas from pandas.core.base import PandasObject from pandas.core.generic import _shared_doc_kwargs, _shared_docs -# Automatically registering converters was deprecated in 0.21, but -# the deprecation warning wasn't showing until 0.24 -# This block will be eventually removed, but it's not clear when -if pandas.get_option('plotting.matplotlib.register_converters'): - try: - from .misc import register - register(explicit=False) - except ImportError: - pass +# Trigger matplotlib import, which implicitly registers our +# converts. Implicit registration is deprecated, and when enforced +# we can lazily import matplotlib. 
+try: + import pandas.plotting._matplotlib # noqa +except ImportError: + pass df_kind = """- 'scatter' : scatter plot - 'hexbin' : hexbin plot""" diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 5cfb6843db9ed..1b775d03349d0 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,3 +1,5 @@ +from pandas._config import get_option + from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, boxplot_frame, boxplot_frame_groupby) from pandas.plotting._matplotlib.converter import deregister, register @@ -11,6 +13,10 @@ from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table +if get_option("plotting.matplotlib.register_converters"): + register(explicit=False) + + __all__ = ['LinePlot', 'BarPlot', 'BarhPlot', 'HistPlot', 'BoxPlot', 'KdePlot', 'AreaPlot', 'PiePlot', 'ScatterPlot', 'HexBinPlot', 'hist_series', 'hist_frame', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index b8a7da5270fc0..f8bc531e3c344 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -1,7 +1,6 @@ from collections import namedtuple import warnings -from matplotlib import pyplot as plt from matplotlib.artist import setp import numpy as np @@ -11,6 +10,7 @@ import pandas as pd from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import _flatten, _subplots @@ -215,6 +215,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): + import matplotlib.pyplot as plt # validate return_type: if return_type not in 
BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") @@ -296,6 +297,8 @@ def plot_group(keys, values, ax): def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): + import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, grid=grid, rot=rot, figsize=figsize, layout=layout, return_type=return_type, **kwds) @@ -306,6 +309,7 @@ def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, rot=0, grid=True, ax=None, figsize=None, layout=None, sharex=False, sharey=True, **kwds): + converter._WARN = False # no warning for pandas plots if subplots is True: naxes = len(grouped) fig, axes = _subplots(naxes=naxes, squeeze=False, diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index a7049afee80b0..5fb4d201223bd 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -2,7 +2,6 @@ from typing import Optional # noqa import warnings -import matplotlib.pyplot as plt import numpy as np from pandas._config import get_option @@ -61,6 +60,8 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, secondary_y=False, colormap=None, table=False, layout=None, **kwds): + import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots self.data = data self.by = by @@ -103,7 +104,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else self.plt.rcParams['axes.grid'] + grid = False if secondary_y else plt.rcParams['axes.grid'] self.grid = grid self.legend = legend @@ -618,6 +619,8 @@ def _get_ax(self, i): @classmethod def get_default_ax(cls, ax): + import 
matplotlib.pyplot as plt + if ax is None and len(plt.get_fignums()) > 0: with plt.rc_context(): ax = plt.gca() diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 585c407e33311..d34c0cb6a3889 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,6 +1,5 @@ import warnings -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -10,6 +9,7 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import ( _flatten, _set_ticks_props, _subplots) @@ -203,6 +203,7 @@ def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) + converter._WARN = False # no warning for pandas plots xrot = xrot or rot fig, axes = _grouped_plot(plot_group, data, column=column, @@ -220,6 +221,7 @@ def plot_group(group, ax): def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds): + import matplotlib.pyplot as plt if by is None: if kwds.get('layout', None) is not None: raise ValueError("The 'layout' keyword is not supported when " @@ -261,6 +263,7 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10, **kwds): + converter._WARN = False # no warning for pandas plots if by is not None: axes = _grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 
dacc9ef04f819..663a3c5153fac 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -2,7 +2,6 @@ import matplotlib.lines as mlines import matplotlib.patches as patches -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.missing import notna @@ -105,6 +104,7 @@ def _get_marker_compat(marker): def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): + import matplotlib.pyplot as plt def normalize(series): a = min(series) @@ -169,6 +169,7 @@ def normalize(series): def andrews_curves(frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds): + import matplotlib.pyplot as plt def function(amplitudes): def f(t): @@ -224,6 +225,7 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): + import matplotlib.pyplot as plt # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] @@ -270,6 +272,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, use_columns=False, xticks=None, colormap=None, axvlines=True, axvlines_kwds=None, sort_labels=False, **kwds): + import matplotlib.pyplot as plt if axvlines_kwds is None: axvlines_kwds = {'linewidth': 1, 'color': 'black'} @@ -336,6 +339,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, def lag_plot(series, lag=1, ax=None, **kwds): # workaround because `c='b'` is hardcoded in matplotlibs scatter method + import matplotlib.pyplot as plt kwds.setdefault('c', plt.rcParams['patch.facecolor']) data = series.values @@ -350,6 +354,8 @@ def lag_plot(series, lag=1, ax=None, **kwds): def autocorrelation_plot(series, ax=None, **kwds): + import matplotlib.pyplot as plt + n = len(series) data = np.asarray(series) if ax is None: diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 80a15942a2867..8c9e3ea330dd3 100644 --- 
a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -3,7 +3,6 @@ import matplotlib.cm as cm import matplotlib.colors -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common import is_list_like @@ -13,6 +12,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): + import matplotlib.pyplot as plt if color is None and colormap is not None: if isinstance(colormap, str): cmap = colormap diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 30038b599a386..e36ffed10d94f 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -3,8 +3,6 @@ import functools import warnings -from matplotlib import pylab -import matplotlib.pyplot as plt import numpy as np from pandas._libs.tslibs.frequencies import ( @@ -42,6 +40,7 @@ def tsplot(series, plotf, ax=None, **kwargs): .. deprecated:: 0.23.0 Use Series.plot() instead """ + import matplotlib.pyplot as plt warnings.warn("'tsplot' is deprecated and will be removed in a " "future version. Please use Series.plot() instead.", FutureWarning, stacklevel=2) @@ -323,6 +322,7 @@ def format_dateaxis(subplot, freq, index): default, changing the limits of the x axis will intelligently change the positions of the ticks. 
""" + from matplotlib import pylab # handle index specific formatting # Note: DatetimeIndex does not use this diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index f6393fc76892f..e491cfc3309a0 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -2,7 +2,6 @@ from math import ceil import warnings -import matplotlib.pyplot as plt import matplotlib.table import matplotlib.ticker as ticker import numpy as np @@ -168,6 +167,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, # Four polar axes plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ + import matplotlib.pyplot as plt if subplot_kw is None: subplot_kw = {} @@ -345,6 +345,7 @@ def _get_xlim(lines): def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): + import matplotlib.pyplot as plt for ax in _flatten(axes): if xlabelsize is not None: plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 39cd48ff35f96..92d207e46b7ab 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -12,11 +12,30 @@ from pandas import Index, Period, Series, Timestamp, date_range import pandas.util.testing as tm +from pandas.plotting import ( + deregister_matplotlib_converters, register_matplotlib_converters) from pandas.tseries.offsets import Day, Micro, Milli, Second -converter = pytest.importorskip('pandas.plotting._converter') -from pandas.plotting import (deregister_matplotlib_converters, # isort:skip - register_matplotlib_converters) +try: + from pandas.plotting._matplotlib import converter +except ImportError: + # try / except, rather than skip, to avoid internal refactoring + # causing an improprer skip + pass + +pytest.importorskip('matplotlib.pyplot') + + +def test_initial_warning(): + code = ( + "import pandas as pd; import matplotlib.pyplot as plt; " + 
"s = pd.Series(1, pd.date_range('2000', periods=12)); " + "fig, ax = plt.subplots(); " + "ax.plot(s.index, s.values)" + ) + call = [sys.executable, '-c', code] + out = subprocess.check_output(call, stderr=subprocess.STDOUT).decode() + assert 'Using an implicitly' in out def test_timtetonum_accepts_unicode(): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 10743ca95e29e..c3d824389aa4d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -374,7 +374,6 @@ def test_axis_limits(self): def _test(ax): xlim = ax.get_xlim() ax.set_xlim(xlim[0] - 5, xlim[1] + 10) - ax.get_figure().canvas.draw() result = ax.get_xlim() assert result[0] == xlim[0] - 5 assert result[1] == xlim[1] + 10 @@ -383,7 +382,6 @@ def _test(ax): expected = (Period('1/1/2000', ax.freq), Period('4/1/2000', ax.freq)) ax.set_xlim('1/1/2000', '4/1/2000') - ax.get_figure().canvas.draw() result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal @@ -392,7 +390,6 @@ def _test(ax): expected = (Period('1/1/2000', ax.freq), Period('4/1/2000', ax.freq)) ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) - ax.get_figure().canvas.draw() result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal @@ -429,12 +426,7 @@ def test_get_finder(self): def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) - else: # 2.2.3, 2.2.4 - xpl1 = [7565, 7564, 7553, 7546, 7518, 7428, 7066] - xpl2 = [7566, 7564, 7554, 7546, 7519, 7429, 7066] - + xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) rs1 = [] rs2 = [] for i, n in enumerate(day_lst): @@ -457,12 +449,7 @@ def test_finder_daily(self): def test_finder_quarterly(self): yrs = [3.5, 11] - if self.mpl_ge_3_0_0 
or not self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) - else: # 2.2.3, 2.2.4 - xpl1 = [68, 68] - xpl2 = [72, 68] - + xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): @@ -485,12 +472,7 @@ def test_finder_quarterly(self): def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) - else: # 2.2.3, 2.2.4 - xpl1 = [216, 216, 204, 204] - xpl2 = [216, 216, 216, 204] - + xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): @@ -521,11 +503,7 @@ def test_finder_monthly_long(self): @pytest.mark.slow def test_finder_annual(self): - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - else: # 2.2.3, 2.2.4 - xp = [1986, 1986, 1990, 1990, 1995, 2020, 1970, 1970] - + xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] xp = [Period(x, freq='A').ordinal for x in xp] rs = [] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): @@ -1093,7 +1071,6 @@ def test_time(self): df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1120,7 +1097,6 @@ def test_time_change_xlim(self): df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1138,7 +1114,6 @@ def test_time_change_xlim(self): ax.set_xlim('1:30', '5:00') # check tick labels again - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1165,7 +1140,6 @@ def test_time_musec(self): ax = df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1432,7 +1406,7 @@ def test_format_timedelta_ticks_narrow(self): df = 
DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) - fig.canvas.draw() + self.plt.draw() labels = ax.get_xticklabels() result_labels = [x.get_text() for x in labels] @@ -1456,7 +1430,7 @@ def test_format_timedelta_ticks_wide(self): df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() ax = df.plot(fontsize=2, ax=ax) - fig.canvas.draw() + self.plt.draw() labels = ax.get_xticklabels() result_labels = [x.get_text() for x in labels] @@ -1529,7 +1503,7 @@ def test_matplotlib_scatter_datetime64(self): df["time"] = date_range("2018-01-01", periods=10, freq="D") fig, ax = self.plt.subplots() ax.scatter(x="time", y="y", data=df) - fig.canvas.draw() + self.plt.draw() label = ax.get_xticklabels()[0] if self.mpl_ge_3_0_0: expected = "2017-12-08" diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index fd9c9d07a974e..ab22539f4530f 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -76,7 +76,7 @@ def safe_import(mod_name, min_version=None): def _skip_if_no_mpl(): mod = safe_import("matplotlib") if mod: - mod.use("Agg", warn=False) + mod.use("Agg", warn=True) else: return True From b9b081dc6b510c8290ded12fe751b1216843527e Mon Sep 17 00:00:00 2001 From: killerontherun1 Date: Fri, 21 Jun 2019 07:40:48 +0530 Subject: [PATCH 12/34] Docstring GL01 GL02 fixes (#26526) --- pandas/core/accessor.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/interval.py | 4 ++-- pandas/core/base.py | 8 ++++---- pandas/core/computation/eval.py | 19 ++++++++++++------- pandas/core/dtypes/dtypes.py | 16 ++++++++++++---- pandas/core/dtypes/inference.py | 3 ++- pandas/core/generic.py | 6 ++++-- pandas/core/groupby/groupby.py | 5 +++-- pandas/core/indexes/interval.py | 3 ++- pandas/core/indexes/multi.py | 10 +++++++--- pandas/core/indexing.py | 3 ++- pandas/core/resample.py | 3 +-- pandas/core/reshape/merge.py | 6 ++++-- 14 files changed, 57 
insertions(+), 33 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 4353e0b3edd08..b092541da93e6 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -196,7 +196,7 @@ def decorator(accessor): return decorator -_doc = """\ +_doc = """ Register a custom accessor on %(klass)s objects. Parameters diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924..155638aca5560 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -196,7 +196,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -_codes_doc = """\ +_codes_doc = """ The category codes of this categorical. Level codes are an array if integer which are the positions of the real diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4f628eff43167..71f4cbae7c58d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -987,7 +987,7 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result - _interval_shared_docs['to_tuples'] = """\ + _interval_shared_docs['to_tuples'] = """ Return an %(return_type)s of tuples of the form (left, right) Parameters @@ -1002,7 +1002,7 @@ def __array__(self, dtype=None): ------- tuples: %(return_type)s %(examples)s\ - """ + """ @Appender(_interval_shared_docs['to_tuples'] % dict( return_type='ndarray', diff --git a/pandas/core/base.py b/pandas/core/base.py index e4274e48d3227..ab9d8b9d778e5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -638,8 +638,8 @@ def _is_builtin_func(self, arg): class IndexOpsMixin: - """ common ops mixin to support a unified interface / docs for Series / - Index + """ + Common ops mixin to support a unified interface / docs for Series / Index """ # ndarray compatibility @@ -656,8 +656,8 @@ def transpose(self, *args, **kwargs): nv.validate_transpose(args, kwargs) return self - T = 
property(transpose, doc="Return the transpose, which is by " - "definition self.") + T = property(transpose, doc="""\nReturn the transpose, which is by + definition self.\n""") @property def _is_homogeneous_type(self): diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 8f6c271af4a58..ef4639a3afe4c 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -"""Top level ``eval`` module. +""" +Top level ``eval`` module. """ import tokenize @@ -15,7 +16,8 @@ def _check_engine(engine): - """Make sure a valid engine is passed. + """ + Make sure a valid engine is passed. Parameters ---------- @@ -31,7 +33,6 @@ def _check_engine(engine): Returns ------- string engine - """ from pandas.core.computation.check import _NUMEXPR_INSTALLED @@ -60,7 +61,8 @@ def _check_engine(engine): def _check_parser(parser): - """Make sure a valid parser is passed. + """ + Make sure a valid parser is passed. Parameters ---------- @@ -88,7 +90,8 @@ def _check_resolvers(resolvers): def _check_expression(expr): - """Make sure an expression is not an empty string + """ + Make sure an expression is not an empty string Parameters ---------- @@ -105,7 +108,8 @@ def _check_expression(expr): def _convert_expression(expr): - """Convert an object to an expression. + """ + Convert an object to an expression. Thus function converts an object to an expression (a unicode string) and checks to make sure it isn't empty after conversion. This is used to @@ -155,7 +159,8 @@ def _check_for_locals(expr, stack_level, parser): def eval(expr, parser='pandas', engine=None, truediv=True, local_dict=None, global_dict=None, resolvers=(), level=0, target=None, inplace=False): - """Evaluate a Python expression as a string using various backends. + """ + Evaluate a Python expression as a string using various backends. 
The following arithmetic operations are supported: ``+``, ``-``, ``*``, ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a56ee72cf1910..7fe8ce7d71683 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -631,12 +631,16 @@ def __init__(self, unit="ns", tz=None): @property def unit(self): - """The precision of the datetime data.""" + """ + The precision of the datetime data. + """ return self._unit @property def tz(self): - """The timezone.""" + """ + The timezone. + """ return self._tz @classmethod @@ -777,7 +781,9 @@ def __new__(cls, freq=None): @property def freq(self): - """The frequency object of this PeriodDtype.""" + """ + The frequency object of this PeriodDtype. + """ return self._freq @classmethod @@ -944,7 +950,9 @@ def __new__(cls, subtype=None): @property def subtype(self): - """The dtype of the Interval bounds.""" + """ + The dtype of the Interval bounds. + """ return self._subtype @classmethod diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 63cb4d85ca308..02ee777bbe7f3 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -427,7 +427,8 @@ def is_named_tuple(obj): def is_hashable(obj): - """Return True if hash(obj) will succeed, False otherwise. + """ + Return True if hash(obj) will succeed, False otherwise. Some types will pass a test against collections.abc.Hashable but fail when they are actually hashed with hash(). diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dba88495d8128..360576ffdb00a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1838,7 +1838,8 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): - """Get the 'info axis' (see Indexing for more) + """ + Get the 'info axis' (see Indexing for more) This is index for Series, columns for DataFrame. 
@@ -1850,7 +1851,8 @@ def keys(self): return self._info_axis def iteritems(self): - """Iterate over (label, values) on info axis + """ + Iterate over (label, values) on info axis This is index for Series, columns for DataFrame and so on. """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2b190c53da53d..43950f2f503c8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -172,7 +172,7 @@ class providing the base-class of operations. {examples} """) -_pipe_template = """\ +_pipe_template = """ Apply a function `func` with arguments to this %(klass)s object and return the function's result. @@ -223,7 +223,8 @@ class providing the base-class of operations. Examples -------- -%(examples)s""" +%(examples)s +""" _transform_template = """ Call function producing a like-indexed %(klass)s on each group and diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 896935fa72adb..577d0221cd8da 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -769,7 +769,8 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc(self, key, method=None): - """Get integer location, slice or boolean mask for requested label. + """ + Get integer location, slice or boolean mask for requested label. 
Parameters ---------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0f457ba799928..0d6e75f95f863 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1149,7 +1149,7 @@ def _set_names(self, names, level=None, validate=True): self.levels[l].rename(name, inplace=True) names = property(fset=_set_names, fget=_get_names, - doc="Names of levels in MultiIndex") + doc="""\nNames of levels in MultiIndex\n""") @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): @@ -1823,12 +1823,16 @@ def remove_unused_levels(self): @property def nlevels(self): - """Integer number of levels in this MultiIndex.""" + """ + Integer number of levels in this MultiIndex. + """ return len(self.levels) @property def levshape(self): - """A tuple with the length of each level.""" + """ + A tuple with the length of each level. + """ return tuple(len(x) for x in self.levels) def __reduce__(self): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7f4827be6dff7..6a21adb1d16ae 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1311,7 +1311,8 @@ def _get_slice_axis(self, slice_obj, axis=None): class _IXIndexer(_NDFrameIndexer): - """A primarily label-location based indexer, with integer position + """ + A primarily label-location based indexer, with integer position fallback. 
Warning: Starting in 0.20.0, the .ix indexer is deprecated, in diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 874973846a006..d1d99d28e59b6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -204,8 +204,7 @@ def _assure_grouper(self): >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) A 2012-08-02 1 - 2012-08-04 1 - """) + 2012-08-04 1""") @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super().pipe(func, *args, **kwargs) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1a80b35629356..d21ad58e752c2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -133,7 +133,8 @@ def merge_ordered(left, right, on=None, left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y'), how='outer'): - """Perform merge with optional filling/interpolation designed for ordered + """ + Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see examples) @@ -240,7 +241,8 @@ def merge_asof(left, right, on=None, tolerance=None, allow_exact_matches=True, direction='backward'): - """Perform an asof merge. This is similar to a left-join except that we + """ + Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. Both DataFrames must be sorted by the key. 
From c275dbfcee0fa9644fa2718ad76fd91ca056069b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Jun 2019 09:05:08 +0200 Subject: [PATCH 13/34] BUG: catch out-of-bounds datetime64 in Series/DataFrame constructor (#26848) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/dtypes/cast.py | 4 ++- pandas/core/internals/construction.py | 5 +++- pandas/tests/test_base.py | 38 +++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6b74865f6619..a897f364d8066 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -600,6 +600,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) +- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`). 
- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) Timedelta diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f66e9ed46aa0..c68d469d291e7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1038,6 +1038,8 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): .tz_convert(dtype.tz)) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values + except OutOfBoundsDatetime: + raise except (AttributeError, ValueError, TypeError): pass @@ -1063,7 +1065,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): dtype = value.dtype if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) + value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == 'm' and dtype != _TD_DTYPE: value = to_timedelta(value) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2616f0aa97d0d..f564ac13dc41d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -8,7 +8,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import IncompatibleFrequency +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( @@ -700,6 +700,9 @@ def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): elif not is_extension_type(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) + except OutOfBoundsDatetime: + # in case of out of bound datetime64 -> always raise + raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3b4f85e680f6e..d24ed9433f4f7 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1341,3 +1341,41 @@ def 
test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) + + +class TestConstruction: + # test certain constructor behaviours on dtype inference across Series, + # Index and DataFrame + + @pytest.mark.parametrize("klass", [ + Series, + lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], + pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], + marks=pytest.mark.xfail), + Index, + ]) + @pytest.mark.parametrize("a", [ + np.array(['2263-01-01'], dtype='datetime64[D]'), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64('2263-01-01', 'D')], dtype=object), + np.array(["2263-01-01"], dtype=object) + ], ids=['datetime64[D]', 'object-datetime.datetime', + 'object-numpy-scalar', 'object-string']) + def test_constructor_datetime_outofbound(self, a, klass): + # GH-26853 (+ bug GH-26206 out of bound non-ns unit) + + # No dtype specified (dtype inference) + # datetime64[non-ns] raise error, other cases result in object dtype + # and preserve original data + if a.dtype.kind == 'M': + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a) + else: + result = klass(a) + assert result.dtype == 'object' + tm.assert_numpy_array_equal(result.to_numpy(), a) + + # Explicit dtype specified + # Forced conversion fails for all -> all cases raise error + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a, dtype='datetime64[ns]') From 224362951942e1f4e05fb8948596620aedac26d9 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 21 Jun 2019 10:37:17 +0100 Subject: [PATCH 14/34] PLOT: Add option to specify the plotting backend (#26753) --- doc/source/user_guide/options.rst | 6 +++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/config_init.py | 36 +++++++++++++++++++++++++++ pandas/plotting/_core.py | 11 ++++---- pandas/plotting/_misc.py | 9 +------ pandas/tests/plotting/test_backend.py | 33 ++++++++++++++++++++++++ 
pandas/tests/plotting/test_misc.py | 2 +- 7 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 pandas/tests/plotting/test_backend.py diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 4b466c2c44d49..4d0def435cb1e 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -431,6 +431,12 @@ compute.use_bottleneck True Use the bottleneck library computation if it is installed. compute.use_numexpr True Use the numexpr library to accelerate computation if it is installed. +plotting.backend matplotlib Change the plotting backend to a different + backend than the current matplotlib one. + Backends can be implemented as third-party + libraries implementing the pandas plotting + API. They can use other plotting libraries + like Bokeh, Altair, etc. plotting.matplotlib.register_converters True Register custom converters with matplotlib. Set to False to de-register. ======================================= ============ ================================== diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a897f364d8066..77b689569d57f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -132,6 +132,7 @@ Other Enhancements - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) +- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. 
Use ``pandas.set_option('plotting.backend', '')`` where `` Date: Fri, 21 Jun 2019 16:19:43 +0200 Subject: [PATCH 15/34] COMPAT: reading generic PyTables Table format fails with sub-selection (#26818) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/pytables.py | 36 ++++++++++-- pandas/tests/io/pytables/test_compat.py | 76 +++++++++++++++++++++++++ pandas/tests/io/test_pytables.py | 2 +- 4 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 pandas/tests/io/pytables/test_compat.py diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 77b689569d57f..467cb5a40213c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -699,6 +699,7 @@ I/O - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) +- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 79d6d8563a162..17d580bae5cf1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1624,7 +1624,8 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, 
errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1813,10 +1814,29 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding, errors): - """ set the values from this selection: take = take ownership """ + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): + """ set the values from this selection: take = take ownership + + Parameters + ---------- + + values : np.ndarray + nan_rep : str + encoding : str + errors : str + start : int, optional + Table row number: the start of the sub-selection. + stop : int, optional + Table row number: the end of the sub-selection. Values larger than + the underlying table's row count are normalized to that. + """ + + start = start if start is not None else 0 + stop = (min(stop, self.table.nrows) + if stop is not None else self.table.nrows) + self.values = Int64Index(np.arange(stop - start)) - self.values = Int64Index(np.arange(self.table.nrows)) return self def get_attr(self): @@ -2159,7 +2179,8 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing " "items dtype in table!") - def convert(self, values, nan_rep, encoding, errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -3431,8 +3452,11 @@ def read_axes(self, where, **kwargs): # convert the data for a in self.axes: a.set_info(self.info) + # `kwargs` may contain `start` and `stop` arguments if passed to + # `store.select()`. If set they determine the index size. 
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=self.errors) + errors=self.errors, start=kwargs.get('start'), + stop=kwargs.get('stop')) return True diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py new file mode 100644 index 0000000000000..d74e1218ebdb0 --- /dev/null +++ b/pandas/tests/io/pytables/test_compat.py @@ -0,0 +1,76 @@ +import pytest + +import pandas as pd +from pandas.tests.io.test_pytables import ensure_clean_path +from pandas.util.testing import assert_frame_equal + +tables = pytest.importorskip('tables') + + +@pytest.fixture +def pytables_hdf5_file(): + """Use PyTables to create a simple HDF5 file.""" + + table_schema = { + 'c0': tables.Time64Col(pos=0), + 'c1': tables.StringCol(5, pos=1), + 'c2': tables.Int64Col(pos=2), + } + + t0 = 1561105000.0 + + testsamples = [ + {'c0': t0, 'c1': 'aaaaa', 'c2': 1}, + {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2}, + {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5}, + {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295}, + ] + + objname = 'pandas_test_timeseries' + + with ensure_clean_path('written_with_pytables.h5') as path: + # The `ensure_clean_path` context mgr removes the temp file upon exit. + with tables.open_file(path, mode='w') as f: + t = f.create_table('/', name=objname, description=table_schema) + for sample in testsamples: + for key, value in sample.items(): + t.row[key] = value + t.row.append() + + yield path, objname, pd.DataFrame(testsamples) + + +class TestReadPyTablesHDF5: + """ + A group of tests which covers reading HDF5 files written by plain PyTables + (not written by pandas). + + Was introduced for regression-testing issue 11188. 
+ """ + + def test_read_complete(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + result = pd.read_hdf(path, key=objname) + expected = df + assert_frame_equal(result, expected) + + def test_read_with_start(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1) + expected = df[1:].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_stop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, stop=1) + expected = df[:1].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_startstop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1, stop=2) + expected = df[1:2].reset_index(drop=True) + assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 299c0feb502be..ef9dbc63d873d 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -105,7 +105,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, def ensure_clean_path(path): """ return essentially a named temporary file that is not opened - and deleted on existing; if path is a list, then create and + and deleted on exiting; if path is a list, then create and return list of filenames """ try: From f2aea09e7ce6fd6beb20d2cdffa44edab6f285cc Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 21 Jun 2019 17:52:24 +0200 Subject: [PATCH 16/34] TST: tests for maybe_promote (precursor to #23982) (#25637) --- pandas/conftest.py | 44 ++ pandas/tests/dtypes/cast/test_promote.py | 677 +++++++++++++++++++++++ 2 
files changed, 721 insertions(+) create mode 100644 pandas/tests/dtypes/cast/test_promote.py diff --git a/pandas/conftest.py b/pandas/conftest.py index c4285e9db038a..4bcd0ea8442e6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -399,6 +399,10 @@ def tz_aware_fixture(request): return request.param +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + # ---------------------------------------------------------------- # Dtypes # ---------------------------------------------------------------- @@ -438,6 +442,46 @@ def string_dtype(request): return request.param +@pytest.fixture(params=BYTES_DTYPES) +def bytes_dtype(request): + """Parametrized fixture for bytes dtypes. + + * bytes + * 'bytes' + """ + return request.param + + +@pytest.fixture(params=OBJECT_DTYPES) +def object_dtype(request): + """Parametrized fixture for object dtypes. + + * object + * 'object' + """ + return request.param + + +@pytest.fixture(params=DATETIME64_DTYPES) +def datetime64_dtype(request): + """Parametrized fixture for datetime64 dtypes. + + * 'datetime64[ns]' + * 'M8[ns]' + """ + return request.param + + +@pytest.fixture(params=TIMEDELTA64_DTYPES) +def timedelta64_dtype(request): + """Parametrized fixture for timedelta64 dtypes. 
+ + * 'timedelta64[ns]' + * 'm8[ns]' + """ + return request.param + + @pytest.fixture(params=FLOAT_DTYPES) def float_dtype(request): """ diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py new file mode 100644 index 0000000000000..5a5b5d47b3ccc --- /dev/null +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -0,0 +1,677 @@ +""" +These test the method maybe_promote from core/dtypes/cast.py +""" + +import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import NaT, iNaT +from pandas.compat import is_platform_windows + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, + is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype + +import pandas as pd + + +@pytest.fixture(params=[bool, 'uint8', 'int32', 'uint64', 'float32', 'float64', + 'complex64', 'complex128', 'M8[ns]', 'm8[ns]', str, + bytes, object]) +def any_numpy_dtype_reduced(request): + """ + Parameterized fixture for numpy dtypes, reduced from any_numpy_dtype. + + * bool + * 'int32' + * 'uint64' + * 'float32' + * 'float64' + * 'complex64' + * 'complex128' + * 'M8[ns]' + * 'm8[ns]' + * str + * bytes + * object + """ + return request.param + + +@pytest.fixture(params=[(True, None), (True, object), (False, None)], + ids=['True-None', 'True-object', 'False-None']) +def box(request): + """ + Parametrized fixture determining whether/how to transform fill_value. + + Since fill_value is defined on a per-test basis, the actual transformation + (based on this fixture) is executed in _check_promote. + + Returns + ------- + boxed : Boolean + Whether fill_value should be wrapped in an np.array. + box_dtype : dtype + The dtype to pass to np.array([fill_value], dtype=box_dtype).
If None, + then this is passed on unmodified, and corresponds to the numpy default + dtype for the given fill_value. + + * (True, None) # fill_value wrapped in array with default dtype + * (True, object) # fill_value wrapped in array with object dtype + * (False, None) # fill_value passed on as scalar + """ + return request.param + + +def _safe_dtype_assert(left_dtype, right_dtype): + """ + Compare two dtypes without raising TypeError. + """ + if isinstance(right_dtype, PandasExtensionDtype): + # switch order of equality check because numpy dtypes (e.g. if + # left_dtype is np.object_) do not know some expected dtypes (e.g. + # DatetimeTZDtype) and would raise a TypeError in their __eq__-method. + assert right_dtype == left_dtype + else: + assert left_dtype == right_dtype + + +def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar=None, exp_val_for_array=None): + """ + Auxiliary function to unify testing of scalar/array promotion. + + Parameters + ---------- + dtype : dtype + The value to pass on as the first argument to maybe_promote. + fill_value : scalar + The value to pass on as the second argument to maybe_promote, either as + a scalar, or boxed into an array (depending on the parameter `boxed`). + boxed : Boolean + Parameter whether fill_value should be passed to maybe_promote + directly, or wrapped in an array (of dtype box_dtype). + box_dtype : dtype + The dtype to enforce when wrapping fill_value into an np.array. + expected_dtype : dtype + The expected dtype returned by maybe_promote (by design this is the + same regardless of whether fill_value was passed as a scalar or in an + array!). + exp_val_for_scalar : scalar + The expected value for the (potentially upcast) fill_value returned by + maybe_promote. + exp_val_for_array : scalar + The expected missing value marker for the expected_dtype (which is + returned by maybe_promote when it receives an array). 
+ """ + assert is_scalar(fill_value) + + if boxed: + # in this case, we pass on fill_value wrapped in an array of specified + # box_dtype; the expected value returned from maybe_promote is the + # missing value marker for the returned dtype. + fill_array = np.array([fill_value], dtype=box_dtype) + result_dtype, result_fill_value = maybe_promote(dtype, fill_array) + expected_fill_value = exp_val_for_array + else: + # here, we pass on fill_value as a scalar directly; the expected value + # returned from maybe_promote is fill_value, potentially upcast to the + # returned dtype. + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) + expected_fill_value = exp_val_for_scalar + + _safe_dtype_assert(result_dtype, expected_dtype) + + # for equal values, also check type (relevant e.g. for int vs float, resp. + # for different datetimes and timedeltas) + match_value = (result_fill_value == expected_fill_value + # disabled type check due to too many xfails; GH 23982/25425 + # and type(result_fill_value) == type(expected_fill_value) + ) + + # for missing values, None == None and iNaT == iNaT (which is checked + # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT + match_missing = ((result_fill_value is np.nan + and expected_fill_value is np.nan) + or (result_fill_value is NaT + and expected_fill_value is NaT)) + + assert match_value or match_missing + + +def test_maybe_promote_int_with_int(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +# override parametrization due to too many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (False, None)]) +def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): + dtype = np.dtype(any_int_dtype) + fill_dtype = np.dtype(float_dtype) + boxed, box_dtype = box # read from parametrized fixture + + if float_dtype == 'float32' and not boxed: + pytest.xfail('falsely upcasts to float64') + + # create array of given dtype; casts "1" to correct
dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling int with float always upcasts to float64 + expected_dtype = np.float64 + # fill_value can be different float type + exp_val_for_scalar = np.float64(fill_value) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to too many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (False, None)]) +def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): + + dtype = np.dtype(float_dtype) + fill_dtype = np.dtype(any_int_dtype) + boxed, box_dtype = box # read from parametrized fixture + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling float with int always keeps float dtype + # because: np.finfo('float32').max > np.iinfo('uint64').max + expected_dtype = dtype + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_float_with_float(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): + dtype = np.dtype(bool) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if boxed and fill_dtype == bool: + pytest.xfail('falsely upcasts to object') + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrongly casts fill_value') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bool with anything but bool casts to object + expected_dtype = np.dtype(object) if
fill_dtype != bool else fill_dtype + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan if fill_dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_value = True + boxed, box_dtype = box # read from parametrized fixture + + if boxed and dtype == bool: + pytest.xfail('falsely upcasts to object') + if boxed and dtype not in (str, object) and box_dtype is None: + pytest.xfail('falsely upcasts to object') + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # filling anything but bool with bool casts to object + expected_dtype = np.dtype(object) if dtype != bool else dtype + # output is not a generic bool, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan if dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_bytes_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_any_with_bytes(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_datetime64_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +# override parametrization of box to add special case for dt_dtype +@pytest.mark.parametrize('box', [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype + # (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), 
np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, + datetime64_dtype, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_datetime64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and (box_dtype == 'dt_dtype' + or (box_dtype is None + and is_datetime64_dtype(type(fill_value))))): + pytest.xfail('mix of lack of upcasting, resp. wrong missing value') + if not boxed and is_timedelta64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' + else box_dtype) + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to too many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, object)]) +def test_maybe_promote_datetimetz_with_any_numpy_dtype( + tz_aware_fixture, any_numpy_dtype_reduced, box): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if box_dtype != object: + pytest.xfail('does not upcast correctly') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1],
dtype=fill_dtype)[0] + + # filling datetimetz with any numpy dtype casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to too many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (True, object)]) +def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, + tz_aware_fixture2, box): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) + boxed, box_dtype = box # read from parametrized fixture + + from dateutil.tz import tzlocal + if is_platform_windows() and tz_aware_fixture2 == tzlocal(): + pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + if dtype.tz == fill_dtype.tz and boxed: + pytest.xfail('falsely upcasts') + if dtype.tz != fill_dtype.tz and not boxed: + pytest.xfail('falsely upcasts') + + # create array of given dtype; casts "1" to correct dtype + fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] + + # filling datetimetz with datetimetz casts to object, unless tz matches + exp_val_for_scalar = fill_value + if dtype.tz == fill_dtype.tz: + expected_dtype = dtype + exp_val_for_array = NaT + else: + expected_dtype = np.dtype(object) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +# override parametrization due to too many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(False, None)]) +def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): + + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and (box_dtype == object + or (box_dtype is
None + and (fill_value is None or fill_value is NaT)))): + pytest.xfail('false upcasts to object') + # takes the opinion that DatetimeTZ should have single na-marker + # using iNaT would lead to errors elsewhere -> NaT + if not boxed and fill_value == iNaT: + pytest.xfail('wrong missing value marker') + + expected_dtype = dtype + # DatetimeTZDtype does not use iNaT as missing value marker + exp_val_for_scalar = NaT + exp_val_for_array = NaT + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +def test_maybe_promote_any_numpy_dtype_with_datetimetz( + any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime_or_timedelta_dtype(dtype) and not boxed: + pytest.xfail('raises error') + + fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] + + # filling any numpy dtype with datetimetz casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_timedelta64_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +@pytest.mark.parametrize('fill_value', [ + pd.Timedelta(days=1), np.timedelta64(24, 'h'), datetime.timedelta(1) +], ids=['pd.Timedelta', 'np.timedelta64', 'datetime.timedelta']) +# override parametrization of box to add special case for td_dtype +@pytest.mark.parametrize('box', [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 
23982 / 25425 + # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +def test_maybe_promote_any_with_timedelta64( + any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if is_timedelta64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_timedelta64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and box_dtype is None + and is_timedelta64_dtype(type(fill_value))): + pytest.xfail('does not upcast correctly') + if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + if box_dtype == 'td_dtype': + pytest.xfail('falsely upcasts') + if not boxed and is_datetime64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' + else box_dtype) + + # filling anything but timedelta with timedelta casts to object + if is_timedelta64_dtype(dtype): + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_string_with_any(string_dtype, + any_numpy_dtype_reduced, box): + dtype = np.dtype(string_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and box_dtype 
is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization of box to add special case for str +@pytest.mark.parametrize('box', [ + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, None), # fill_value wrapped in array with default dtype + # (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, + string_dtype, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_dtype = np.dtype(string_dtype) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: + pytest.xfail('does not upcast or raises') + if (boxed and box_dtype in (None, 'str') and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + + # create array of given dtype + fill_value = 'abc' + + # special case for box_dtype (cannot use fixture in parametrization) + box_dtype = fill_dtype if box_dtype == 'str' else box_dtype + + # filling anything with a string casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_object_with_any(object_dtype, + any_numpy_dtype_reduced, 
box): + dtype = np.dtype(object_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, + object_dtype, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # create array of object dtype from a scalar value (i.e. passing + # dtypes.common.is_scalar), which can however not be cast to int/float etc. 
+ fill_value = pd.DateOffset(1) + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(False, None)]) +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, + fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (dtype == bytes and not boxed + and fill_value is not None and fill_value is not NaT): + pytest.xfail('does not upcast to object') + elif dtype == 'uint64' and not boxed and fill_value == iNaT: + pytest.xfail('does not upcast correctly') + elif is_datetime_or_timedelta_dtype(dtype) and boxed: + pytest.xfail('falsely upcasts to object') + elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype)) + and fill_value is not NaT and dtype != 'uint64'): + pytest.xfail('falsely upcasts to object') + elif (boxed and dtype == 'uint64' + and (fill_value is np.nan or fill_value is None)): + pytest.xfail('falsely upcasts to object') + # below: opinionated that iNaT should be interpreted as missing value + elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) + and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + elif ((is_string_dtype(dtype) or dtype == bool) + and not boxed and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + + if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + # uint64 + negative int casts to object; iNaT is considered as missing + expected_dtype = np.dtype(object) + exp_val_for_scalar = 
np.nan + elif is_integer_dtype(dtype) and fill_value == iNaT: + # other integer + iNaT casts to int64 + expected_dtype = np.int64 + exp_val_for_scalar = iNaT + elif is_integer_dtype(dtype) and fill_value is not NaT: + # integer + other missing value (np.nan / None) casts to float + expected_dtype = np.float64 + exp_val_for_scalar = np.nan + elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT): + # inserting into object does not cast the value + # but *does* cast None to np.nan + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif is_datetime_or_timedelta_dtype(dtype): + # datetime / timedelta cast all missing values to iNaT + expected_dtype = dtype + exp_val_for_scalar = iNaT + elif fill_value is NaT: + # NaT upcasts everything that's not datetime/timedelta to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = NaT + elif is_float_dtype(dtype) or is_complex_dtype(dtype): + # float / complex + missing value (!= NaT) stays the same + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + # all other cases cast to object, and use np.nan as missing value + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + + # array case has same expected_dtype; but returns corresponding na-marker + if is_integer_dtype(expected_dtype): + # integers cannot hold NaNs; maybe_promote_with_array returns None + exp_val_for_array = None + elif is_datetime_or_timedelta_dtype(expected_dtype): + exp_val_for_array = iNaT + else: # expected_dtype = float / complex / object + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('dim', [0, 2, 3]) +def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create 0-dim array of given dtype; casts "1" to correct dtype + fill_array = np.array(1, dtype=dtype) + + # expand to desired dimension: + for _ in 
range(dim): + fill_array = np.expand_dims(fill_array, 0) + + # test against 1-dimensional case + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype)) + + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert ((result_missing_value == expected_missing_value) + or (result_missing_value is np.nan + and expected_missing_value is np.nan)) From ba69f95a88e372e748fb6d7f29aa4b06bad578ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Jun 2019 10:58:28 -0500 Subject: [PATCH 17/34] Additional tests for ufunc(Series) (#26951) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/series/test_ufunc.py | 253 ++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/series/test_ufunc.py diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 467cb5a40213c..05978d500fa82 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -774,7 +774,7 @@ Sparse - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) -- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned. +- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). 
Other ^^^^^ diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py new file mode 100644 index 0000000000000..05d19452b1eac --- /dev/null +++ b/pandas/tests/series/test_ufunc.py @@ -0,0 +1,253 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + +UNARY_UFUNCS = [np.positive, np.floor, np.exp] +BINARY_UFUNCS = [ + np.add, # dunder op + np.logaddexp, +] +SPARSE = [ + pytest.param(True, + marks=pytest.mark.xfail(reason="Series.__array_ufunc__")), + False, +] +SPARSE_IDS = ['sparse', 'dense'] +SHUFFLE = [ + pytest.param(True, marks=pytest.mark.xfail(reason="GH-26945", + strict=False)), + False +] + + +@pytest.fixture +def arrays_for_binary_ufunc(): + """ + A pair of random, length-100 integer-dtype arrays, that are mostly 0. + """ + a1 = np.random.randint(0, 10, 100, dtype='int64') + a2 = np.random.randint(0, 10, 100, dtype='int64') + a1[::3] = 0 + a2[::4] = 0 + return a1, a2 + + +@pytest.mark.parametrize("ufunc", UNARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_unary_ufunc(ufunc, sparse): + # Test that ufunc(Series) == Series(ufunc) + array = np.random.randint(0, 10, 10, dtype='int64') + array[::2] = 0 + if sparse: + array = pd.SparseArray(array, dtype=pd.SparseDtype('int', 0)) + + index = list(string.ascii_letters[:10]) + name = "name" + series = pd.Series(array, index=index, name=name) + + result = ufunc(series) + expected = pd.Series(ufunc(array), index=index, name=name) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) 
+ a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = a2 + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [ + pytest.param(True, marks=pytest.mark.xfail(reason="Index should defer")), + False +], ids=['flipped', 'straight']) +def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # * ufunc(Index, Series) dispatches to Series (returns a Series) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. 
+ series = pd.Series(a1, name=name) + other = pd.Index(a2, name=name).astype("int64") + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", [True, False], ids=['unaligned', + 'aligned']) +@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, + arrays_for_binary_ufunc): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # with alignment between the indices + + if flip and shuffle: + pytest.xfail(reason="Fix with Series.__array_ufunc__") + + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = pd.Series(a2, name=name) + + idx = np.random.permutation(len(a1)) + + if shuffle: + other = other.take(idx) + a2 = a2.take(idx) + # alignment, so the expected index is the first index in the op. 
+ if flip: + index = other.align(series)[0].index + else: + index = series.align(other)[0].index + else: + index = series.index + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = tuple(reversed(array_args)) + series_args = tuple(reversed(series_args)) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), index=index, name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False]) +def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): + # Test that + # * ufunc(Series, scalar) == Series(ufunc(array, scalar)) + # * ufunc(Series, scalar) == ufunc(scalar, Series) + array, _ = arrays_for_binary_ufunc + if sparse: + array = pd.SparseArray(array) + other = 2 + series = pd.Series(array, name="name") + + series_args = (series, other) + array_args = (array, other) + + if flip: + series_args = tuple(reversed(series_args)) + array_args = tuple(reversed(array_args)) + + expected = pd.Series(ufunc(*array_args), name="name") + result = ufunc(*series_args) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.divmod]) # any others? +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", SHUFFLE) +@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") +def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, + arrays_for_binary_ufunc): + # Test that + # the same conditions from binary_ufunc_scalar apply to + # ufuncs with multiple outputs. 
+ if sparse and ufunc is np.divmod: + pytest.skip("sparse divmod not implemented.") + + a1, a2 = arrays_for_binary_ufunc + + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + s1 = pd.Series(a1) + s2 = pd.Series(a2) + + if shuffle: + # ensure we align before applying the ufunc + s2 = s2.sample(frac=1) + + expected = ufunc(a1, a2) + assert isinstance(expected, tuple) + + result = ufunc(s1, s2) + assert isinstance(result, tuple) + tm.assert_series_equal(result[0], pd.Series(expected[0])) + tm.assert_series_equal(result[1], pd.Series(expected[1])) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): + # Test that the same conditions from unary input apply to multi-output + # ufuncs + array, _ = arrays_for_binary_ufunc + + if sparse: + array = pd.SparseArray(array) + + series = pd.Series(array, name="name") + result = np.modf(series) + expected = np.modf(array) + + assert isinstance(result, tuple) + assert isinstance(expected, tuple) + + tm.assert_series_equal(result[0], pd.Series(expected[0], name="name")) + tm.assert_series_equal(result[1], pd.Series(expected[1], name="name")) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.xfail(reason="Series.__array_ufunc__") +def test_binary_ufunc_drops_series_name(ufunc, sparse, + arrays_for_binary_ufunc): + # Drop the names when they differ. 
+ a1, a2 = arrays_for_binary_ufunc + s1 = pd.Series(a1, name='a') + s2 = pd.Series(a2, name='b') + + result = ufunc(s1, s2) + assert result.name is None From dfcd2b2c575d474014908802d9cedf1ac3259635 Mon Sep 17 00:00:00 2001 From: robbuckley <20515024+robbuckley@users.noreply.github.com> Date: Fri, 21 Jun 2019 17:25:32 +0100 Subject: [PATCH 18/34] BLD: fix build error for PyPy on macOS (#26536) (#26862) --- doc/source/whatsnew/v0.25.0.rst | 5 +++++ setup.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 05978d500fa82..275be4ff58e99 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -776,6 +776,11 @@ Sparse - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) - Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). +Build Changes +^^^^^^^^^^^^^ + +- Fix install error with PyPy on macOS (:issue:`26536`) + Other ^^^^^ diff --git a/setup.py b/setup.py index 4579bbfa59797..389e8553eb3a3 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ import pkg_resources import platform -from distutils.sysconfig import get_config_var +from distutils.sysconfig import get_config_vars import sys import shutil from distutils.version import LooseVersion @@ -442,19 +442,19 @@ def run(self): if debugging_symbols_requested: extra_compile_args.append('-g') -# For mac, ensure extensions are built for macos 10.9 when compiling on a -# 10.9 system or above, overriding distuitls behaviour which is to target -# the version that python was built for. This may be overridden by setting +# Build for at least macOS 10.9 when compiling on a 10.9 system or above, +# overriding CPython distuitls behaviour which is to target the version that +# python was built for. 
This may be overridden by setting # MACOSX_DEPLOYMENT_TARGET before calling setup.py if is_platform_mac(): if 'MACOSX_DEPLOYMENT_TARGET' not in os.environ: - current_system = LooseVersion(platform.mac_ver()[0]) - python_target = LooseVersion( - get_config_var('MACOSX_DEPLOYMENT_TARGET')) - if python_target < '10.9' and current_system >= '10.9': + current_system = platform.mac_ver()[0] + python_target = get_config_vars().get('MACOSX_DEPLOYMENT_TARGET', + current_system) + if (LooseVersion(python_target) < '10.9' and + LooseVersion(current_system) >= '10.9'): os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' - # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled From 9088f5ebbaf098bd7113bfba7eaa6dcdbf0b4c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Fri, 21 Jun 2019 20:01:34 +0300 Subject: [PATCH 19/34] BUG: Fix rolling median and quantile with closed='left' and closed='neither' (#26005) (#26910) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 50 +++++++++++++++++---------------- pandas/tests/test_window.py | 19 +++++++++++++ 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 275be4ff58e99..5e5a2aed3ac03 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -747,6 +747,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) +- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 3305fea06f003..df86f395d6097 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1116,21 +1116,15 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, if i == 0: # setup - val = values[i] - if notnan(val): - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - - else: - - # calculate deletes - for j in range(start[i - 1], s): + for j in range(s, e): val = values[j] if notnan(val): - skiplist_remove(sl, val) - nobs -= 1 + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + else: # calculate adds for j in range(end[i - 1], e): @@ -1141,6 +1135,13 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, if err: break + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(sl, val) + nobs -= 1 + if nobs >= minp: midpoint = (nobs / 2) if nobs % 2: @@ -1507,19 +1508,13 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, if i == 0: # setup - val = values[i] - if notnan(val): - nobs += 1 - skiplist_insert(skiplist, val) - - else: - - # calculate deletes - for j in range(start[i - 1], s): + for j in range(s, e): val = values[j] if notnan(val): - skiplist_remove(skiplist, val) - nobs -= 1 + nobs += 1 + skiplist_insert(skiplist, val) + + else: # calculate adds for j in range(end[i - 1], e): @@ -1528,6 +1523,13 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, nobs += 1 skiplist_insert(skiplist, val) + # calculate deletes + for j 
in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 + if nobs >= minp: if nobs == 1: # Single value in skip list diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 31baf4475214f..9524a78dae16c 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -594,6 +594,25 @@ def test_closed_min_max_minp(self, func, closed, expected): expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("closed,expected", [ + ('right', [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ('both', [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ('neither', [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ('left', [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]) + ]) + def test_closed_median_quantile(self, closed, expected): + # GH 26005 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + roll = ser.rolling('3D', closed=closed) + expected = pd.Series(expected, index=ser.index) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.5) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('roller', ['1s', 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be From 171615a35574a1fc9c4e8260edca7d1e08e9c302 Mon Sep 17 00:00:00 2001 From: Jan-Philip Gehrcke Date: Fri, 21 Jun 2019 19:55:28 +0200 Subject: [PATCH 20/34] CLN: move pytables tests to tests/io/pytables dir (#26986) --- pandas/tests/io/pytables/__init__.py | 0 pandas/tests/io/pytables/test_compat.py | 2 +- pandas/tests/io/{ => pytables}/test_pytables.py | 0 pandas/tests/io/{ => pytables}/test_pytables_missing.py | 0 4 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 pandas/tests/io/pytables/__init__.py rename pandas/tests/io/{ => pytables}/test_pytables.py (100%) rename pandas/tests/io/{ => pytables}/test_pytables_missing.py 
(100%) diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index d74e1218ebdb0..34ed066dd3748 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -from pandas.tests.io.test_pytables import ensure_clean_path +from pandas.tests.io.pytables.test_pytables import ensure_clean_path from pandas.util.testing import assert_frame_equal tables = pytest.importorskip('tables') diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py similarity index 100% rename from pandas/tests/io/test_pytables.py rename to pandas/tests/io/pytables/test_pytables.py diff --git a/pandas/tests/io/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py similarity index 100% rename from pandas/tests/io/test_pytables_missing.py rename to pandas/tests/io/pytables/test_pytables_missing.py From 9aef32db29925bec7a0372b92a63cfc4e78398c2 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Fri, 21 Jun 2019 21:19:58 +0000 Subject: [PATCH 21/34] BUG: Handle NA values for ExtensionArrays in Series.count (#26836) --- doc/source/whatsnew/v0.25.0.rst | 7 ++++++- pandas/core/series.py | 2 +- pandas/tests/extension/base/methods.py | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5e5a2aed3ac03..19636f42c6129 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -782,11 +782,16 @@ Build Changes - Fix install error with PyPy on macOS (:issue:`26536`) +ExtensionArray +^^^^^^^^^^^^^^ + +- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). 
+- :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) + Other ^^^^^ - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) -- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) .. _whatsnew_0.250.contributors: diff --git a/pandas/core/series.py b/pandas/core/series.py index c4a449154860f..11e578e74f6e7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1659,7 +1659,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(com.values_from_object(self)).sum() + return notna(self.array).sum() if isinstance(level, str): level = self.index._get_level_number(level) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1852edaa9e748..c8fd4d1b708e5 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -30,6 +30,13 @@ def test_count(self, data_missing): expected = pd.Series([0, 1]) self.assert_series_equal(result, expected) + def test_series_count(self, data_missing): + # GH#26835 + ser = pd.Series(data_missing) + result = ser.count() + expected = 1 + assert result == expected + def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) From a14874f3e85bae66c32ab1ae4c1cd8a72e741ffe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Jun 2019 17:10:44 -0500 Subject: [PATCH 22/34] xfail test_missing_required_dependency test (#26993) --- pandas/tests/test_downstream.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 9fe8b0f9563ef..bb662e99664e2 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -133,13 +133,20 @@ def 
test_pyarrow(df): tm.assert_frame_equal(result, df) +@pytest.mark.xfail(reason="pandas-wheels-50", strict=False) def test_missing_required_dependency(): # GH 23868 - # use the -S flag to disable site-packages - call = ['python', '-S', '-c', 'import pandas'] + # To ensure proper isolation, we pass these flags + # -S : disable site-packages + # -s : disable user site-packages + # -E : disable PYTHON* env vars, especially PYTHONPATH + # And, that's apparently not enough, so we give up. + # https://github.com/MacPython/pandas-wheels/pull/50 + call = ['python', '-sSE', '-c', 'import pandas'] with pytest.raises(subprocess.CalledProcessError) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() - assert all(x in output for x in ['numpy', 'pytz', 'dateutil']) + for name in ['numpy', 'pytz', 'dateutil']: + assert name in output From 2b9b58dadf8a4e02b94747b6c8b22bec4b6eeefd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 22 Jun 2019 10:19:03 -0700 Subject: [PATCH 23/34] BLD: use unsigned instead of signed for lengths, avoid build warnings (#26759) --- pandas/_libs/parsers.pyx | 18 ++++++++-------- pandas/_libs/src/parser/tokenizer.c | 32 +++++++++++++++++------------ pandas/_libs/src/parser/tokenizer.h | 18 ++++++++-------- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 88b918e9cc515..b73b70caf1597 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -119,24 +119,24 @@ cdef extern from "parser/tokenizer.h": # where to write out tokenized data char *stream - int64_t stream_len - int64_t stream_cap + uint64_t stream_len + uint64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words int64_t *word_starts # where we are in the stream - int64_t words_len - int64_t words_cap - int64_t max_words_cap # maximum word cap encountered + uint64_t words_len + uint64_t words_cap + uint64_t max_words_cap # maximum 
word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line - int64_t lines # Number of lines observed - int64_t file_lines # Number of lines observed (with bad/skipped) - int64_t lines_cap # Vector capacity + uint64_t lines # Number of lines observed + uint64_t file_lines # Number of lines observed (with bad/skipped) + uint64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -168,7 +168,7 @@ cdef extern from "parser/tokenizer.h": int header # Boolean: 1: has header, 0: no header int64_t header_start # header row start - int64_t header_end # header row end + uint64_t header_end # header row end void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 723bf56a79512..3146e49455609 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -71,9 +71,9 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity, +static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, int64_t space, int64_t elsize, int *error) { - int64_t cap = *capacity; + uint64_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -248,7 +248,7 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int64_t i, cap, length; + uint64_t i, cap, length; int status; void *orig_ptr, *newptr; @@ -263,7 +263,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("\n\nmake_stream_space: nbytes = %zu. 
grow_buffer(self->stream...)\n", nbytes)) self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - (int64_t*)&self->stream_cap, nbytes * 2, + &self->stream_cap, nbytes * 2, sizeof(char), &status); TRACE( ("make_stream_space: self->stream=%p, self->stream_len = %zu, " @@ -305,7 +305,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words = (char **)grow_buffer((void *)self->words, length, - (int64_t*)&self->words_cap, nbytes, + &self->words_cap, nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " @@ -336,7 +336,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { cap = self->lines_cap; self->line_start = (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - (int64_t*)&self->lines_cap, nbytes, + &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", @@ -471,7 +471,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= (int64_t) self->header_end + 1) && + if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -507,7 +507,7 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= (int64_t) self->header_end + 1) && + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { @@ -651,7 +651,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -666,7 +666,7 @@ static int parser_buffer_bytes(parser_t 
*self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -737,7 +737,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { - int64_t i, slen; + int64_t i; + uint64_t slen; int should_skip; char c; char *stream; @@ -1203,7 +1204,8 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t i, offset, word_deletions, char_count; + int64_t offset, word_deletions; + uint64_t char_count, i; if (nrows > self->lines) { nrows = self->lines; @@ -1229,6 +1231,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->stream_len -= char_count; /* move token metadata */ + // Note: We should always have words_len >= word_deletions, so this + // subtraction will remain appropriately-typed. for (i = 0; i < self->words_len - word_deletions; ++i) { offset = i + word_deletions; @@ -1242,6 +1246,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ + // Note: We should always have self->lines - nrows + 1 >= 0, so this + // subtraction will remain appropriately-typed.
for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; @@ -1265,7 +1271,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - int64_t i; + uint64_t i; /** * Before we free up space and trim, we should diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b6d5d6937f4db..66ef1887d6bc3 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -104,24 +104,24 @@ typedef struct parser_t { // where to write out tokenized data char *stream; - int64_t stream_len; - int64_t stream_cap; + uint64_t stream_len; + uint64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; int64_t *word_starts; // where we are in the stream - int64_t words_len; - int64_t words_cap; - int64_t max_words_cap; // maximum word cap encountered + uint64_t words_len; + uint64_t words_cap; + uint64_t max_words_cap; // maximum word cap encountered char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field int64_t *line_start; // position in words for start of line int64_t *line_fields; // Number of fields in each line - int64_t lines; // Number of (good) lines observed - int64_t file_lines; // Number of lines (including bad or skipped) - int64_t lines_cap; // Vector capacity + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -153,7 +153,7 @@ typedef struct parser_t { int header; // Boolean: 1: has header, 0: no header int64_t header_start; // header row start - int64_t header_end; // header row end + uint64_t header_end; // header row end void *skipset; PyObject *skipfunc; From b4d4ec5a36acda40f13a8c3c3e19262c095d4c41 Mon Sep 17 00:00:00 2001 From: Steven Date: Sat, 22 Jun 2019 15:19:37 
-0400 Subject: [PATCH 24/34] DOC: df.astype example using dictionary (#26994) * DOC: df.astype example using dictionary (#26990) --- pandas/core/generic.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 360576ffdb00a..b08c101356157 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5622,6 +5622,31 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): Examples -------- + Create a DataFrame: + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + >>> ser = pd.Series([1, 2], dtype='int32') >>> ser 0 1 From e27eea8635b73082f93d44f0003f6ec5b92596a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Jun 2019 02:05:26 -0700 Subject: [PATCH 25/34] TST: fix flaky test (#27004) --- pandas/tests/series/indexing/test_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index cba1444846d0c..a8120ec9c5c58 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -425,7 +425,7 @@ def test_datetime_indexing(): """ -@pytest.fixture(scope='module') +@pytest.fixture def dups(): dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3), From cf74b0272af2e13e5b9ce40c8bf42df750ddc560 Mon Sep 17 00:00:00 2001 From: 1_x7 Date: Sun, 23 Jun 2019 04:58:06 -0700 Subject: [PATCH 26/34] DOC: Do not mention private classes in the documentation (#26997) --- ci/code_checks.sh | 4 ++-- pandas/core/generic.py | 10 +++++----- 
pandas/core/groupby/groupby.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a16580679ff54..ac86815569a0c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -256,8 +256,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b08c101356157..0e2a6a0cac414 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,7 +56,7 @@ # able to share _shared_docs = dict() _shared_doc_kwargs = dict( - axes='keywords for axes', klass='NDFrame', + axes='keywords for axes', klass='Series/DataFrame', axes_single_arg='int or labels for object', args_transpose='axes to permute (int or label for object)', optional_by=""" @@ -1940,7 +1940,7 @@ def __array_wrap__(self, result, context=None): def to_dense(self): """ - Return dense representation of NDFrame (as opposed to sparse). + Return dense representation of Series/DataFrame (as opposed to sparse). .. 
deprecated:: 0.25.0 @@ -9036,7 +9036,7 @@ def tshift(self, periods=1, freq=None, axis=0): Returns ------- - shifted : NDFrame + shifted : Series/DataFrame Notes ----- @@ -10272,12 +10272,12 @@ def _find_valid_index(self, how): return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', - 'klass': 'NDFrame'}) + 'klass': 'Series/DataFrame'}) def first_valid_index(self): return self._find_valid_index('first') @Appender(_shared_docs['valid_index'] % {'position': 'last', - 'klass': 'NDFrame'}) + 'klass': 'Series/DataFrame'}) def last_valid_index(self): return self._find_valid_index('last') diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43950f2f503c8..64cacd60da30f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -630,14 +630,14 @@ def curried(x): def get_group(self, name, obj=None): """ - Construct NDFrame from group with provided name. + Construct DataFrame from group with provided name. Parameters ---------- name : object the name of the group to get as a DataFrame - obj : NDFrame, default None - the NDFrame to take the DataFrame out of. If + obj : DataFrame, default None + the DataFrame to take the DataFrame out of. 
If it is None, the object groupby was called on will be used From 83fe8d78b6b086f3ceabe81cd420a3c7affe9aba Mon Sep 17 00:00:00 2001 From: Min ho Kim Date: Tue, 25 Jun 2019 01:52:41 +1000 Subject: [PATCH 27/34] CLN: Fix typos (mainly in docs and comments) (#27007) --- asv_bench/benchmarks/offset.py | 2 +- .../comparison/comparison_with_sas.rst | 2 +- .../comparison/comparison_with_stata.rst | 2 +- doc/source/user_guide/io.rst | 6 +++--- doc/source/user_guide/timeseries.rst | 2 +- doc/source/whatsnew/v0.10.1.rst | 2 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.14.1.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.21.0.rst | 4 ++-- doc/source/whatsnew/v0.23.0.rst | 2 +- doc/source/whatsnew/v0.23.1.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 10 +++++----- doc/source/whatsnew/v0.25.0.rst | 4 ++-- doc/source/whatsnew/v0.8.0.rst | 2 +- pandas/_libs/tslibs/fields.pyx | 2 +- pandas/compat/numpy/__init__.py | 2 +- pandas/conftest.py | 2 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/sparse.py | 4 ++-- pandas/core/base.py | 2 +- pandas/core/computation/pytables.py | 2 +- pandas/core/dtypes/common.py | 4 ++-- pandas/core/frame.py | 12 ++++++------ pandas/core/generic.py | 12 ++++++------ pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/base.py | 6 +++--- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexes/range.py | 2 +- pandas/core/indexing.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/missing.py | 2 +- pandas/core/panel.py | 2 +- pandas/core/resample.py | 4 ++-- pandas/core/reshape/tile.py | 2 +- pandas/core/series.py | 2 +- pandas/core/sparse/scipy_sparse.py | 2 +- pandas/core/strings.py | 8 ++++---- 
pandas/core/tools/datetimes.py | 2 +- pandas/io/formats/excel.py | 2 +- pandas/io/json/json.py | 2 +- pandas/io/json/normalize.py | 2 +- pandas/io/json/table_schema.py | 2 +- pandas/io/pytables.py | 10 +++++----- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/arithmetic/test_numeric.py | 4 ++-- pandas/tests/arithmetic/test_object.py | 2 +- pandas/tests/arithmetic/test_period.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/arrays/test_integer.py | 4 ++-- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/extension/base/ops.py | 2 +- pandas/tests/extension/json/test_json.py | 2 +- pandas/tests/frame/test_combine_concat.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_indexing.py | 4 ++-- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 2 +- .../tests/indexes/datetimes/test_partial_slicing.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 6 +++--- pandas/tests/indexes/interval/test_construction.py | 2 +- pandas/tests/indexes/multi/test_missing.py | 2 +- pandas/tests/indexes/test_category.py | 2 +- pandas/tests/indexing/multiindex/test_xs.py | 2 +- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_floats.py | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/pytables/test_pytables.py | 10 +++++----- pandas/tests/io/test_parquet.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/plotting/test_frame.py | 2 +- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 6 +++--- pandas/tests/reshape/test_concat.py | 4 ++-- pandas/tests/scalar/timedelta/test_timedelta.py | 2 +- 
pandas/tests/scalar/timestamp/test_unary_ops.py | 2 +- pandas/tests/series/test_missing.py | 2 +- pandas/tests/test_algos.py | 2 +- pandas/tests/test_base.py | 2 +- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_window.py | 6 +++--- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tseries/offsets.py | 2 +- pandas/util/testing.py | 6 +++--- scripts/validate_docstrings.py | 2 +- 99 files changed, 148 insertions(+), 148 deletions(-) diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 26e344758596f..9b738e699a5b3 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -9,7 +9,7 @@ pass hcal = pd.tseries.holiday.USFederalHolidayCalendar() -# These offests currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplementedError with .apply_index() non_apply = [pd.offsets.Day(), pd.offsets.BYearEnd(), pd.offsets.BYearBegin(), diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index fc12c8524d3bf..cbedeec737ec0 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -660,7 +660,7 @@ example, to subtract the mean for each observation by smoker group. run; -pandas ``groubpy`` provides a ``transform`` mechanism that allows +pandas ``groupby`` provides a ``transform`` mechanism that allows these type of operations to be succinctly expressed in one operation. diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index bf2b03176ecd8..c354ed7872cb4 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -634,7 +634,7 @@ For example, to subtract the mean for each observation by smoker group.
generate adj_total_bill = total_bill - group_bill -pandas ``groubpy`` provides a ``transform`` mechanism that allows +pandas ``groupby`` provides a ``transform`` mechanism that allows these type of operations to be succinctly expressed in one operation. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 30a42de2ab287..7caaec62c0a8a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -488,7 +488,7 @@ specification: .. versionadded:: 0.21.0 -Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical`` +Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. For more control on the categories and order, create a :class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for @@ -1679,7 +1679,7 @@ S3 URLs are handled as well but require installing the `S3Fs df = pd.read_csv('s3://pandas-test/tips.csv') -If your S3 bucket requires cedentials you will need to set them as environment +If your S3 bucket requires credentials you will need to set them as environment variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs documentation on credentials `_. 
@@ -2078,7 +2078,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: json = dfj2.to_json(date_unit='ns') - # Try to parse timestamps as millseconds -> Won't Work + # Try to parse timestamps as milliseconds -> Won't Work dfju = pd.read_json(json, date_unit='ms') dfju diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f27e9c677d925..7bdec001a688f 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1133,7 +1133,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine pd.Timestamp('2014-08-01 17:00') + bh pd.Timestamp('2014-08-01 23:00') + bh - # Although 2014-08-02 is Satuaday, + # Although 2014-08-02 is Saturday, # it is valid because it starts from 08-01 (Friday). pd.Timestamp('2014-08-02 04:00') + bh diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index b5b2b889732cd..7d51ded1cad19 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -170,7 +170,7 @@ combined result, by using ``where`` on a selector table. 
df_mt, selector='df1_mt') store - # indiviual tables were created + # individual tables were created store.select('df1_mt') store.select('df2_mt') diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index d61b9a40438f8..f049006808c0f 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -816,7 +816,7 @@ Enhancements - Implemented ``Panel.pct_change`` (:issue:`6904`) - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max, :func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`) -- ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) +- ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of quantiles. - :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 98ebbd6a52344..fcfb22d074554 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -247,7 +247,7 @@ Bug Fixes - Bug in ``DatetimeIndex`` comparison doesn't handle ``NaT`` properly (:issue:`7529`) - Bug in passing input with ``tzinfo`` to some offsets ``apply``, ``rollforward`` or ``rollback`` resets ``tzinfo`` or raises ``ValueError`` (:issue:`7465`) - Bug in ``DatetimeIndex.to_period``, ``PeriodIndex.asobject``, ``PeriodIndex.to_timestamp`` doesn't preserve ``name`` (:issue:`7485`) -- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestanp`` handle ``NaT`` incorrectly (:issue:`7228`) +- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestamp`` handle ``NaT`` incorrectly (:issue:`7228`) - Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may return normal ``datetime`` (:issue:`7502`) - Bug in 
``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`) - Bug in ``Timestamp.tz_localize`` resets ``nanosecond`` info (:issue:`7534`) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index de29a1eb93709..fe9fdd7448923 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1513,7 +1513,7 @@ Bug Fixes - Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) - Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) -- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) +- Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN and Inf from formatting (:issue:`11981`) - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 5c6f1d1af6b54..44b50437a6dfe 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -263,7 +263,7 @@ Now, to find prices per store/product, we can simply do: See the :ref:`documentation ` for more. -.. _whatsnew_0210.enhancements.reanme_categories: +.. _whatsnew_0210.enhancements.rename_categories: ``Categorical.rename_categories`` accepts a dict-like ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -418,7 +418,7 @@ New Behavior, without regard to the bottleneck installation: s.sum() -Note that this also changes the sum of an empty ``Series``. 
Previously this always returned 0 regardless of a ``bottlenck`` installation: +Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation: .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 98479fa30eb15..51efa37b55add 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -276,7 +276,7 @@ To show only observed values: df.groupby(['A', 'B', 'C'], observed=True).count() -For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword: +For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword: .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index f6af2990c935b..0218c3b02a413 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -26,7 +26,7 @@ Fixed Regressions **Comparing Series with datetime.date** We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`). -In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comapring. +In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing. This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal. In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning. 
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 05d6a03639a2d..086519ad75192 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1061,7 +1061,7 @@ The affected cases are: .. code-block:: ipython - # Comparison operations and arithmetic opeartions both raise ValueError. + # Comparison operations and arithmetic operations both raise ValueError. In [6]: df == (1, 2, 3) ... ValueError: Unable to coerce to Series, length must be 2: given 3 @@ -1324,7 +1324,7 @@ Deprecations - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) - :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`) - Passing an integer to :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtypes is deprecated, will raise ``TypeError`` in a future version. Use ``obj.fillna(pd.Timedelta(...))`` instead (:issue:`24694`) -- ``Series.cat.categorical``, ``Series.cat.name`` and ``Sersies.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`). +- ``Series.cat.categorical``, ``Series.cat.name`` and ``Series.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`). - Passing a dtype without a precision like ``np.dtype('datetime64')`` or ``timedelta64`` to :class:`Index`, :class:`DatetimeIndex` and :class:`TimedeltaIndex` is now deprecated. Use the nanosecond-precision dtype instead (:issue:`24753`). .. 
_whatsnew_0240.deprecations.datetimelike_int_ops: @@ -1604,7 +1604,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) -- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`) +- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`) - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`) - Bug in :class:`DatetimeIndex` where constructing a :class:`DatetimeIndex` from a :class:`Categorical` or :class:`CategoricalIndex` would incorrectly drop timezone information (:issue:`18664`) - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would incorrectly lose the index's ``freq`` attribute (:issue:`21282`) @@ -1670,7 +1670,7 @@ Timezones Offsets ^^^^^^^ -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) - Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and 
ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) - Bug in adding :class:`DateOffset` with :class:`DataFrame` or :class:`PeriodIndex` incorrectly raising ``TypeError`` (:issue:`23215`) - Bug in comparing :class:`DateOffset` objects with non-DateOffset objects, particularly strings, raising ``ValueError`` instead of returning ``False`` for equality checks and ``True`` for not-equal checks (:issue:`23524`) @@ -1838,7 +1838,7 @@ Groupby/Resample/Rolling ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). - Multiple bugs in :func:`pandas.core.window.Rolling.min` with ``closed='left'`` and a datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). +- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). - Bug in :meth:`pandas.core.resample.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 19636f42c6129..109005364fca6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -119,7 +119,7 @@ Other Enhancements - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) -- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) +- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. 
``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) - Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) @@ -694,7 +694,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) - Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) -- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) +- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst 
index 575ec6b7d19f4..664325ac063c0 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -77,7 +77,7 @@ Time series changes and improvements interface while enabling working with nanosecond-resolution data. Also provides :ref:`easy time zone conversions `. - Enhanced support for :ref:`time zones `. Add - `tz_convert` and ``tz_lcoalize`` methods to TimeSeries and DataFrame. All + `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. User needs to know very little about pytz library now; only diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 4ebf5e587a727..2a41b5ff2339c 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -171,7 +171,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, # YearBegin(), BYearBegin() use month = starting month of year. # QuarterBegin(), BQuarterBegin() use startingMonth = starting - # month of year. Other offests use month, startingMonth as ending + # month of year. Other offsets use month, startingMonth as ending # month of year. 
if (freqstr[0:2] in ['MS', 'QS', 'AS']) or ( diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index c738cc74e46a4..22bfab8b7c6d6 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,4 @@ -""" support numpy compatiblitiy across versions """ +""" support numpy compatibility across versions """ from distutils.version import LooseVersion import re diff --git a/pandas/conftest.py b/pandas/conftest.py index 4bcd0ea8442e6..058361af343b6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -122,7 +122,7 @@ def observed(request): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which - appear in the grouper [True]. [None] is supported for future compatiblity + appear in the grouper [True]. [None] is supported for future compatibility if we decide to change the default (and would need to warn if this parameter is not passed)""" return request.param diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c709cd9e9f0b2..20fd582179dc6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -653,7 +653,7 @@ def factorize( ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ - # Impelmentor note: There are two ways to override the behavior of + # Implementer note: There are two ways to override the behavior of # pandas.factorize # 1. _values_for_factorize and _from_factorize. 
# Specify the values passed to pandas' internal factorization diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d415dbbdaf0a3..6e7217762a3fb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -676,7 +676,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): def _has_same_tz(self, other): zzone = self._timezone - # vzone sholdn't be None if value is non-datetime like + # vzone shouldn't be None if value is non-datetime like if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 589e98f016f69..07d5664f98714 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,7 @@ class _IntegerDtype(ExtensionDtype): An ExtensionDtype to hold a single size & kind of integer dtype. These specific implementations are subclasses of the non-public - _IntegerDtype. For example we have Int8Dtype to represnt signed int 8s. + _IntegerDtype. For example we have Int8Dtype to represent signed int 8s. The attributes name & type are set when these subclasses are created. """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ece05567d3343..3a9322773fc69 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -854,7 +854,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): ------- ordinals : ndarray[int] freq : Tick - The frequencey extracted from the Series or DatetimeIndex if that's + The frequency extracted from the Series or DatetimeIndex if that's used. 
""" diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3dda6868a80da..d692fe6d7cabe 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -562,7 +562,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): * 'block': Stores a `block` and `block_length` for each contiguous *span* of sparse values. This is best when sparse data tends to be clumped together, with large - regsions of ``fill-value`` values between sparse values. + regions of ``fill-value`` values between sparse values. * 'integer': uses an integer to store the location of each sparse value. @@ -1316,7 +1316,7 @@ def _concat_same_type(cls, to_concat): sp_index = IntIndex(length, indices) else: - # when concatentating block indices, we don't claim that you'll + # when concatenating block indices, we don't claim that you'll # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting diff --git a/pandas/core/base.py b/pandas/core/base.py index ab9d8b9d778e5..30e800cb9bd73 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1137,7 +1137,7 @@ def __iter__(self): ------- iterator """ - # We are explicity making element iterators. + # We are explicitly making element iterators. 
if is_datetimelike(self._values): return map(com.maybe_box_datetimelike, self._values) elif is_extension_array_dtype(self._values): diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 2a762b5ee24b6..25cfa8fe17697 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -410,7 +410,7 @@ def visit_Assign(self, node, **kwargs): return self.visit(cmpr) def visit_Subscript(self, node, **kwargs): - # only allow simple suscripts + # only allow simple subscripts value = self.visit(node.value) slobj = self.visit(node.slice) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ce99d150880c6..b2b74e2a70ca9 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1835,7 +1835,7 @@ def is_complex_dtype(arr_or_dtype): Returns ------- boolean - Whether or not the array or dtype is of a compex dtype. + Whether or not the array or dtype is of a complex dtype. Examples -------- @@ -1929,7 +1929,7 @@ def _is_dtype_type(arr_or_dtype, condition): Returns ------- - bool : if the condition is satisifed for the arr_or_dtype + bool : if the condition is satisfied for the arr_or_dtype """ if arr_or_dtype is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6746844f4b1fa..fd2e1e3e41ced 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2145,7 +2145,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy', col_space='The minimum width of each column in CSS length ' 'units. An int is assumed to be px units.\n\n' ' .. 
versionadded:: 0.25.0\n' - ' Abillity to use str') + ' Ability to use str') @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html(self, buf=None, columns=None, col_space=None, header=True, @@ -5312,7 +5312,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): this_mask = isna(series) other_mask = isna(otherSeries) - # don't overwrite columns unecessarily + # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): result[col] = this[col].copy() @@ -5572,7 +5572,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, else: mask = notna(this) - # don't overwrite columns unecessarily + # don't overwrite columns unnecessarily if mask.all(): continue @@ -6508,7 +6508,7 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 2 13 dtype: int64 - Retuning a list-like will result in a Series + Returning a list-like will result in a Series >>> df.apply(lambda x: [1, 2], axis=1) 0 [1, 2] @@ -6993,7 +6993,7 @@ def round(self, decimals=0, *args, **kwargs): 3 0.2 0.2 With a dict, the number of places for specific columns can be - specfified with the column names as key and the number of decimal + specified with the column names as key and the number of decimal places as value >>> df.round({'dogs': 1, 'cats': 0}) @@ -7004,7 +7004,7 @@ def round(self, decimals=0, *args, **kwargs): 3 0.2 0.0 Using a Series, the number of places for specific columns can be - specfified with the column names as index and the number of + specified with the column names as index and the number of decimal places as value >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e2a6a0cac414..992c83e66090e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3193,7 +3193,7 @@ def _slice(self, slobj, axis=0, kind=None): result = result.__finalize__(self) # 
this could be a view - # but only in a single-dtyped view slicable case + # but only in a single-dtyped view sliceable case is_copy = axis != 0 or result._is_view result._set_is_copy(self, copy=is_copy) return result @@ -3243,7 +3243,7 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): force : boolean, default False if True, then force showing an error - validate if we are doing a settitem on a chained copy. + validate if we are doing a setitem on a chained copy. If you call this function, be sure to set the stacklevel such that the user will see the error *at the level of setting* @@ -3644,7 +3644,7 @@ class animal locomotion result.index = new_index # this could be a view - # but only in a single-dtyped view slicable case + # but only in a single-dtyped view sliceable case result._set_is_copy(self, copy=not result._is_view) return result @@ -6488,7 +6488,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for c, src in to_replace.items(): if c in value and c in self: # object conversion is handled in - # series.replace which is called recursivelly + # series.replace which is called recursively res[c] = res[c].replace(to_replace=src, value=value[c], inplace=False, @@ -6724,7 +6724,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, Note how the last entry in column 'a' is interpolated differently, because there is no entry after it to use for interpolation. Note how the first entry in column 'b' remains ``NaN``, because there - is no entry befofe it to use for interpolation. + is no entry before it to use for interpolation. >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), ... (np.nan, 2.0, np.nan, np.nan), @@ -9576,7 +9576,7 @@ def describe(self, percentiles=None, include=None, exclude=None): DataFrame.max: Maximum of the values in the object. DataFrame.min: Minimum of the values in the object. DataFrame.mean: Mean of the values. - DataFrame.std: Standard deviation of the obersvations. 
+ DataFrame.std: Standard deviation of the observations. DataFrame.select_dtypes: Subset of a DataFrame including/excluding columns based on their dtype. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ffa552913ae..91be320a3e674 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1316,7 +1316,7 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calcuate pct_change of each value to previous entry in group""" + """Calculate pct_change of each value to previous entry in group""" # TODO: Remove this conditional when #23918 is fixed if freq: return self.apply(lambda x: x.pct_change(periods=periods, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index febfdc7bdf908..d0f28bed4399b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -494,7 +494,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # not an iterable of keys. In the meantime, we attempt to provide # a warning. We can assume that the user wanted a list of keys when # the key is not in the index. We just have to be careful with - # unhashble elements of `key`. Any unhashable elements implies that + # unhashable elements of `key`. Any unhashable elements implies that # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 is_tuple = isinstance(key, tuple) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73abd708415a1..cb5b4a6c8993c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1131,7 +1131,7 @@ def to_flat_index(self): .. versionadded:: 0.24.0 - This is implemented for compatability with subclass implementations + This is implemented for compatibility with subclass implementations when chaining. 
Returns @@ -1486,7 +1486,7 @@ def _get_level_values(self, level): Return an Index of values for requested level. This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. + MultiIndex, but is provided on Index as well for compatibility. Parameters ---------- @@ -3885,7 +3885,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): from .numeric import Int64Index, UInt64Index if not is_unsigned_integer_dtype(dtype): # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired + # this could return Int64Index when UInt64Index is what's desired try: res = data.astype('i8', copy=False) if (res == data).all(): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 122c30ae7dfd5..3d3774ce48e8b 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -534,7 +534,7 @@ def _can_reindex(self, indexer): @Appender(_index_shared_docs['where']) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with - # 1. copy the underyling Categorical + # 1. copy the underlying Categorical # 2. setitem with `cond` and `other` # 3. Rebuild CategoricalIndex. 
if other is None: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e6d876436c986..5ce670d9fe33e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -376,7 +376,7 @@ def _is_dates_only(self): def __reduce__(self): - # we use a special reudce here because we need + # we use a special reduce here because we need # to simply set the .tz (and not reinterpret it) d = dict(data=self._data) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 577d0221cd8da..49f657332bbbf 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -390,7 +390,7 @@ def itemsize(self): 'a future version') warnings.warn(msg, FutureWarning, stacklevel=2) - # supress the warning from the underlying left/right itemsize + # suppress the warning from the underlying left/right itemsize with warnings.catch_warnings(): warnings.simplefilter('ignore') return self.left.itemsize + self.right.itemsize diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0d6e75f95f863..a06d304fb5a22 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -947,7 +947,7 @@ def f(l): def memory_usage(self, deep=False): # we are overwriting our base class to avoid # computing .values here which could materialize - # a tuple representation uncessarily + # a tuple representation unnecessarily return self._nbytes(deep) @cache_readonly @@ -1074,7 +1074,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, sentinel = '' # GH3547 # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value + # "Truthy" value if sparsify not in [True, 1]: sentinel = sparsify # little bit of a kludge job for #1217 @@ -2729,7 +2729,7 @@ def convert_indexer(start, stop, step, indexer=indexer, return m if isinstance(key, slice): - # handle a slice, returnig a slice if we can + # handle a slice, returning a slice if we can # otherwise a 
boolean indexer try: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ab39969af8db0..47dad1788e021 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -525,7 +525,7 @@ def _union(self, other, sort): sort : False or None, default None Whether to sort resulting index. ``sort=None`` returns a - mononotically increasing ``RangeIndex`` if possible or a sorted + monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not. ``sort=False`` always returns an unsorted ``Int64Index`` diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6a21adb1d16ae..f6aa54f4836d9 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -955,7 +955,7 @@ def _getitem_lowerdim(self, tup): def _getitem_nested_tuple(self, tup): # we have a nested tuple so have at least 1 multi-index level - # we should be able to match up the dimensionaility here + # we should be able to match up the dimensionality here # we have too many indexers for our dim, but have at least 1 # multi-index dimension, try to see if we have something like diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f564ac13dc41d..d766d7f06d34a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -588,7 +588,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = data # everything else in this block must also handle ndarray's, - # becuase we've unwrapped PandasArray into an ndarray. + # because we've unwrapped PandasArray into an ndarray. 
if dtype is not None: subarr = data.astype(dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index cdb3b77567829..4230b212f567a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -619,7 +619,7 @@ def mask_zero_div_zero(x, y, result, copy=False): def dispatch_missing(op, left, right, result): """ - Fill nulls caused by division by zero, casting to a diffferent dtype + Fill nulls caused by division by zero, casting to a different dtype if necessary. Parameters diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c65a73bd0d3f0..9d6b7333ca39f 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1392,7 +1392,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, Parameters ---------- other : Panel, or object coercible to Panel - The object from which the caller will be udpated. + The object from which the caller will be updated. join : {'left', 'right', 'outer', 'inner'}, default 'left' How individual DataFrames are joined. overwrite : bool, default True diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d1d99d28e59b6..632b5a9c5e002 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -48,7 +48,7 @@ class Resampler(_GroupBy): groupby : a TimeGrouper object axis : int, default 0 kind : str or None - 'period', 'timestamp' to override default index treatement + 'period', 'timestamp' to override default index treatment Returns ------- @@ -1602,7 +1602,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): """ - Adjust the `first` Timestamp to the preceeding Timestamp that resides on + Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following Timestamp that resides on the provided offset. 
Input Timestamps that already reside on the offset will be adjusted depending on the type of diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 8c29bdc2a974c..96124331e43ef 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -429,7 +429,7 @@ def _convert_bin_to_numeric_type(bins, dtype): def _convert_bin_to_datelike_type(bins, dtype): """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the orginal dtype is + Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is datelike Parameters diff --git a/pandas/core/series.py b/pandas/core/series.py index 11e578e74f6e7..730a96f5435a1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1481,7 +1481,7 @@ def iteritems(self): Lazily iterate over (index, value) tuples. This method returns an iterable tuple (index, value). This is - convienient if you want to create a lazy iterator. Note that the + convenient if you want to create a lazy iterator. Note that the methods Series.items and Series.iteritems are the same methods. 
Returns diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 0dd8958e93c13..7ff0f46575661 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -42,7 +42,7 @@ def get_indexers(levels): values_ilabels = [x[0] for x in values_ilabels] # # performance issues with groupby ################################### - # TODO: these two lines can rejplace the code below but + # TODO: these two lines can replace the code below but # groupby is too slow (in some cases at least) # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first() # labels_to_i[:] = np.arange(labels_to_i.shape[0]) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6ebfbc8bb0ee0..710b29c6a6536 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2058,7 +2058,7 @@ def _get_series_list(self, others, ignore_index=False): # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index - err_msg = ('others must be Series, Index, DataFrame, np.ndarrary or ' + err_msg = ('others must be Series, Index, DataFrame, np.ndarray or ' 'list-like (either containing only strings or containing ' 'only objects of type Series/Index/list-like/np.ndarray)') @@ -2155,7 +2155,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Parameters ---------- - others : Series, Index, DataFrame, np.ndarrary or list-like + others : Series, Index, DataFrame, np.ndarray or list-like Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and other list-likes of strings must have the same length as the calling Series/Index, with the exception of indexed objects (i.e. 
@@ -2571,7 +2571,7 @@ def rsplit(self, pat=None, n=-1, expand=False): 0 Linda van der Berg 1 George Pitt - Rivers - To return a Series containining tuples instead of a DataFrame: + To return a Series containing tuples instead of a DataFrame: >>> s.str.partition('-', expand=False) 0 (Linda van der Berg, , ) @@ -3292,7 +3292,7 @@ def rindex(self, sub, start=0, end=None): The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters seperated by + assumed to be as any sequence of non-numeric characters separated by whitespace characters. >>> s5.str.istitle() diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8e6331fe44e6b..5893ff0e0dd8f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -363,7 +363,7 @@ def _adjust_to_origin(arg, origin, unit): raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") - # premptively check this for a nice range + # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 4db00e34b39e2..5792f6e2a5a08 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -557,7 +557,7 @@ def _format_hierarchical_rows(self): # MultiIndex columns require an extra row # with index names (blank if None) for - # unambigous round-trip, unless not merging, + # unambiguous round-trip, unless not merging, # in which case the names all go on one row Issue #11328 if isinstance(self.columns, ABCMultiIndex) and self.merge_cells: self.rowcounter += 1 diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7bafa15bb1979..f14b615471ccc 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -180,7 +180,7 @@ def 
__init__(self, obj, orient, date_format, double_precision, self.schema = build_table_schema(obj, index=self.index) - # NotImplementd on a column MultiIndex + # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): raise NotImplementedError( "orient='table' is not supported for MultiIndex") diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index fa4e35b08bf6e..2d8bc20b1195e 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -79,7 +79,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0): else: newkey = prefix + sep + k - # only dicts gets recurse-flattend + # only dicts gets recurse-flattened # only at level>1 do we rename the rest of the keys if not isinstance(v, dict): if level != 0: # so we skip copying for top level, common case diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 7742bc717b184..a54f5cdf723a3 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -142,7 +142,7 @@ def convert_json_field_to_pandas_type(field): 'int64' >>> convert_json_field_to_pandas_type({'name': 'a_categorical', 'type': 'any', - 'contraints': {'enum': [ + 'constraints': {'enum': [ 'a', 'b', 'c']}, 'ordered': True}) 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)' diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 17d580bae5cf1..97d5b1dd2a1e5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -824,7 +824,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, raise ValueError( "all tables must have exactly the same nrows!") - # axis is the concentation axes + # axis is the concatenation axes axis = list({t.non_index_axes[0][0] for t in tbls})[0] def func(_start, _stop, _where): @@ -948,7 +948,7 @@ def append(self, key, value, format=None, append=True, columns=None, of the object are indexed. See `here `__. 
min_itemsize : dict of columns that specify minimum string sizes - nan_rep : string to use as string nan represenation + nan_rep : string to use as string nan representation chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings @@ -1343,7 +1343,7 @@ def error(t): else: - # distiguish between a frame/table + # distinguish between a frame/table tt = 'legacy_panel' try: fields = group.table._v_attrs.fields @@ -3316,7 +3316,7 @@ def validate_version(self, where=None): warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): - """validate the min_itemisze doesn't contain items that are not in the + """validate the min_itemsize doesn't contain items that are not in the axes this needs data_columns to be defined """ if min_itemsize is None: @@ -3500,7 +3500,7 @@ def validate_data_columns(self, data_columns, min_itemsize): def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes - leagcy tables create an indexable column, indexable index, + legacy tables create an indexable column, indexable index, non-indexable fields Parameters diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d8dfd15477974..00b7a29b27b63 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2708,7 +2708,7 @@ class StataWriter117(StataWriter): Each label must be 80 characters or smaller. convert_strl : list List of columns names to convert to Stata StrL format. Columns with - more than 2045 characters are aautomatically written as StrL. + more than 2045 characters are automatically written as StrL. Smaller columns can be converted by including the column name. Using StrLs can reduce output file size when strings are longer than 8 characters, and either frequently repeated or sparse. 
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index e491cfc3309a0..acb5ab7b8e04b 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -133,7 +133,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, Number of rows and columns of the subplot grid. If not specified, calculated from naxes and layout_type - layout_type : {'box', 'horziontal', 'vertical'}, default 'box' + layout_type : {'box', 'horizontal', 'vertical'}, default 'box' Specify how to layout the subplot grid. fig_kw : Other keyword arguments to be passed to the figure() call. diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 256ee930b4cda..f58f8981317df 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. # Specifically for numeric dtypes from collections import abc @@ -587,7 +587,7 @@ def test_operators_frame(self): tm.assert_series_equal(ts / ts, ts / df['A'], check_names=False) - # TODO: this came from tests.series.test_analytics, needs cleannup and + # TODO: this came from tests.series.test_analytics, needs cleanup and # de-duplication with test_modulo above def test_modulo2(self): with np.errstate(all='ignore'): diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 8d67e02d514ff..dd931939ddf51 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. 
# Specifically for object dtype from decimal import Decimal diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index e254312e39724..bc1b78bf944d1 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. # Specifically for Period dtype import operator diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 2dff9a6088de8..047900c3d7586 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. from datetime import datetime, timedelta @@ -48,7 +48,7 @@ def test_compare_timedelta64_zerodim(self): tdi >= np.array(4) def test_compare_timedelta_series(self): - # regresssion test for GH#5963 + # regression test for GH#5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) actual = s > timedelta(days=1) expected = pd.Series([False, True]) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 2f42ec5bae2b0..2337d8363155c 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -450,7 +450,7 @@ def test_concat_same_type_invalid(self, datetime_index): arr._concat_same_type([arr, other]) def test_concat_same_type_different_freq(self): - # we *can* concatentate DTI with different freqs. + # we *can* concatenate DTI with different freqs. 
a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', tz='US/Central')) b = DatetimeArray(pd.date_range('2000', periods=2, freq='H', diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 066eadc9b68bc..65f7628370ad4 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -164,13 +164,13 @@ def _check_op(self, s, op_name, other, exc=None): self._check_op_integer(result, expected, mask, s, op_name, other) def _check_op_float(self, result, expected, mask, s, op_name, other): - # check comparisions that are resulting in float dtypes + # check comparisons that are resulting in float dtypes expected[mask] = np.nan tm.assert_series_equal(result, expected) def _check_op_integer(self, result, expected, mask, s, op_name, other): - # check comparisions that are resulting in integer dtypes + # check comparisons that are resulting in integer dtypes # to compare properly, we convert the expected # to float, mask to nans and convert infs diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c7a62dfe77c37..675abec661b5a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -417,7 +417,7 @@ def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) - # TODO(jreback), this is sligthly suspect + # TODO(jreback), this is slightly suspect assert not com.is_datetime_or_timedelta_dtype( DatetimeTZDtype("ns", "US/Eastern")) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 2ac68c52d53c7..708eb9c7c8c43 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -36,7 +36,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): op(s, other) def _check_divmod_op(self, s, op, other, exc=Exception): - # divmod has multiple return 
values, so check separatly + # divmod has multiple return values, so check separately if exc is None: result_div, result_mod = op(s, other) if op is divmod: diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 97c329e0a5c92..89d30b0a3cc06 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -25,7 +25,7 @@ def data(): # Why the while loop? NumPy is unable to construct an ndarray from # equal-length ndarrays. Many of our operations involve coercing the # EA to an ndarray of objects. To avoid random test failures, we ensure - # that our data is coercable to an ndarray. Several tests deal with only + # that our data is coercible to an ndarray. Several tests deal with only # the first two elements, so that's what we'll check. while len(data[0]) == len(data[1]): diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 9683beb20def5..faa86acb1584f 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -664,7 +664,7 @@ def test_combine_first_mixed_bug(self): expected = Series([True, True, False], name=2) assert_series_equal(result, expected) - # GH 3593, converting datetime64[ns] incorrecly + # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame({"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7dc74961a2adc..c6508072cb8c7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -149,7 +149,7 @@ def _check_mixed_dtypes(df, dtypes=None): if d in df: assert(df.dtypes[d] == d) - # mixed floating and integer coexinst in the same frame + # mixed floating and integer coexist in the same frame df = _make_mixed_dtypes_df('float') _check_mixed_dtypes(df) diff --git a/pandas/tests/frame/test_indexing.py 
b/pandas/tests/frame/test_indexing.py index 40785c6a1d321..3c9558d5cbd10 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2729,7 +2729,7 @@ def _check_set(df, cond, check_dtypes=True): cond = df >= 0 _check_set(df, cond) - # aligining + # aligning cond = (df >= 0)[1:] _check_set(df, cond) @@ -3691,7 +3691,7 @@ def test_assigning_ops(self): df.at["j", "cats"] = "c" # Assigning a Category to parts of a int/... column uses the values of - # the Catgorical + # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index d46ce41fc7f03..e7583adff403b 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -307,7 +307,7 @@ def check(result, expected=None): with pytest.raises(ValueError, match=msg): df[df.A > 6] - # dup aligining operations should work + # dup aligning operations should work # GH 5185 df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3]) df2 = DataFrame([1, 2, 3], index=[1, 2, 3]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5bea749febc76..0fb8673e6274a 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -133,7 +133,7 @@ def f(g): def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 - # This test should ensure that a function is only evaluted + # This test should ensure that a function is only evaluated # once per group. 
Previously the function has been evaluated twice # on the first group to check if the Cython index slider is safe to use # This test ensures that the side effect (append to list) is only triggered diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 87b57b0609b36..3da3ab22b643b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -130,7 +130,7 @@ def func(dataf): assert isinstance(result, DataFrame) # GH5592 - # inconcistent return type + # inconsistent return type df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony'], B=Series( np.arange(7), dtype='int64'), C=date_range( diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ef05e6ada4890..4ca470d316e5c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -92,7 +92,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): def test_timegrouper_with_reg_groups(self): # GH 3794 - # allow combinateion of timegrouper/reg groups + # allow combination of timegrouper/reg groups df_original = DataFrame({ 'Branch': 'A A A A A A A B'.split(), diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 085e62ed9341e..6ec8568ce7242 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -94,7 +94,7 @@ def test_slice_duplicate_monotonic(self): def test_monotone_DTI_indexing_bug(self): # GH 19362 - # Testing accessing the first element in a montononic descending + # Testing accessing the first element in a monotonic descending # partial string indexing. 
df = pd.DataFrame(list(range(5))) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index ea33e563b31be..2a5ae92cb59f5 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -201,7 +201,7 @@ def test_to_datetime_with_non_exact(self, cache): def test_parse_nanoseconds_with_formula(self, cache): # GH8989 - # trunctaing the nanoseconds when a format was provided + # truncating the nanoseconds when a format was provided for v in ["2012-01-01 09:00:00.000000001", "2012-01-01 09:00:00.000001", "2012-01-01 09:00:00.001", @@ -383,7 +383,7 @@ def test_to_datetime_now(self): def test_to_datetime_today(self): # See GH#18666 # Test with one timezone far ahead of UTC and another far behind, so - # one of these will _almost_ alawys be in a different day from UTC. + # one of these will _almost_ always be in a different day from UTC. # Unfortunately this test between 12 and 1 AM Samoa time # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
@@ -606,7 +606,7 @@ def test_to_datetime_tz_psycopg2(self, cache): ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) assert is_datetime64_ns_dtype(i) - # tz coerceion + # tz coercion result = pd.to_datetime(i, errors='coerce', cache=cache) tm.assert_index_equal(result, i) diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 7a54ad5c180a4..eb9b573cce91d 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -272,7 +272,7 @@ def test_constructor_errors(self): IntervalIndex.from_tuples(tuples) def test_na_tuples(self): - # tuple (NA, NA) evaluates the same as NA as an elemenent + # tuple (NA, NA) evaluates the same as NA as an element na_tuple = [(0, 1), (np.nan, np.nan), (2, 3)] idx_na_tuple = IntervalIndex.from_tuples(na_tuple) idx_na_element = IntervalIndex.from_tuples([(0, 1), np.nan, (2, 3)]) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 518c12bb20e13..1928c303a1bcd 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -74,7 +74,7 @@ def test_dropna(): idx.dropna(how='xxx') # GH26408 - # test if missing values are dropped for mutiindex constructed + # test if missing values are dropped for multiindex constructed # from codes and values idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], [np.nan, None, pd.NaT, "128", 2]], diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d38fa20a9335c..d89d282fb785b 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -676,7 +676,7 @@ def test_get_loc(self): with pytest.raises(KeyError): i.get_loc('NOT-EXIST') - # non-unique, slicable + # non-unique, sliceable cidx3 = CategoricalIndex(list('aabbb'), categories=list('abc')) idx3 = Index(list('aabbb')) diff --git 
a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index f9117341e3a78..bbc55c75c5b77 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -52,7 +52,7 @@ def test_xs_loc_equality(multiindex_dataframe_random_data): def test_xs_missing_values_in_index(): # see gh-6574 - # missing values in returned index should be preserrved + # missing values in returned index should be preserved acc = [ ('a', 'abcde', 1), ('b', 'bbcde', 2), diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 8b2b0b349e203..e9c1b85e7d40c 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -665,7 +665,7 @@ def test_where_index_period(self): class TestFillnaSeriesCoercion(CoercionBase): - # not indexing, but place here for consisntency + # not indexing, but place here for consistency method = 'fillna' diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 9a2aae08dbb15..ada613110d9bf 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -35,7 +35,7 @@ def test_scalar_error(self): # float_indexers should raise exceptions # on appropriate Index types & accessors # this duplicates the code below - # but is spefically testing for the error + # but is specifically testing for the error # message for index in [tm.makeStringIndex, tm.makeUnicodeIndex, diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 4fa26dc67ba0c..6b5ad66e268df 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -20,7 +20,7 @@ def test_iloc_exceeds_bounds(self): # iloc should allow indexers that exceed the bounds df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) - # lists of positions should raise IndexErrror! + # lists of positions should raise IndexError! 
msg = 'positional indexers are out-of-bounds' with pytest.raises(IndexError, match=msg): df.iloc[:, [0, 1, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5f5718fe3eac3..11d0fa2602baa 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,7 +16,7 @@ class TestLoc(Base): def test_loc_getitem_dups(self): # GH 5678 - # repeated gettitems on a dup index returning a ndarray + # repeated getitems on a dup index returning a ndarray df = DataFrame( np.random.random_sample((20, 5)), index=['ABCDE' [x % 5] for x in range(20)]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 961d781764b67..ea75e97bace0b 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -730,7 +730,7 @@ def test_to_excel_multiindex_dates( assert recons.index.names == ('time', 'foo') def test_to_excel_multiindex_no_write_index(self, engine, ext): - # Test writing and re-reading a MI witout the index. GH 5616. + # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. 
frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index edb7c2136825d..0eeb0e6eb2f2d 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -533,7 +533,7 @@ def test_to_string_with_formatters_unicode(self): assert result == ' c/\u03c3\n' + '0 1\n1 2\n2 3' def test_east_asian_unicode_false(self): - # not alighned properly because of east asian width + # not aligned properly because of east asian width # mid col df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index ef9dbc63d873d..413c11ba2f9fe 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -225,7 +225,7 @@ def test_long_strings(self): def test_api(self): # GH4584 - # API issue when to_hdf doesn't acdept append AND format args + # API issue when to_hdf doesn't accept append AND format args with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() @@ -2656,7 +2656,7 @@ def test_select(self): expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) - # equivalentsly + # equivalently result = store.select('df', [("columns=['A', 'B']")]) expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) @@ -3284,7 +3284,7 @@ def test_frame_select_complex2(self): expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]') - # sccope with list like + # scope with list like l = selection.index.tolist() # noqa store = HDFStore(hh) result = store.select('df', where='l1=l') @@ -3308,7 +3308,7 @@ def test_frame_select_complex2(self): result = read_hdf(hh, 'df', where='l1=list(selection.index)') assert_frame_equal(result, expected) - # sccope with index + # scope with index store = HDFStore(hh) result = store.select('df', where='l1=index') @@ -5164,7 +5164,7 @@ def 
test_legacy_datetimetz_object(self, datapath): assert_frame_equal(result, expected) def test_dst_transitions(self): - # make sure we are not failing on transaitions + # make sure we are not failing on transitions with ensure_clean_store(self.path) as store: times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index afdd83ba9bb8c..db5c92fb681a2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -305,7 +305,7 @@ def test_write_index(self, engine): check_round_trip(df, engine) def test_write_multiindex(self, pa): - # Not suppoprted in fastparquet as of 0.1.3 or older pyarrow version + # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa df = pd.DataFrame({'A': [1, 2, 3]}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e651892bde0a0..b053afa4dd7d5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1338,7 +1338,7 @@ def check(col): # this is parsed on Travis (linux), but not on macosx for some reason # even with the same versions of psycopg2 & sqlalchemy, possibly a - # Postgrsql server version difference + # Postgresql server version difference col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4ee918fa48dab..06c753d1b8e21 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2952,7 +2952,7 @@ def test_plain_axes(self): fig.add_axes([0.2, 0.2, 0.2, 0.2]) Series(rand(10)).plot(ax=ax) - # suppliad ax itself is a plain Axes, but because the cmap keyword + # supplied ax itself is a plain Axes, but because the cmap keyword # a new ax is created for the colorbar -> also multiples axes (GH11520) df = DataFrame({'a': randn(8), 'b': randn(8)}) fig = self.plt.figure() diff --git 
a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9dabb35196741..9a954b522333d 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -529,7 +529,7 @@ def test_df_series_secondary_legend(self): assert ax.right_ax.get_yaxis().get_visible() tm.close() - # seconcary -> secondary (without passing ax) + # secondary -> secondary (without passing ax) _, ax = self.plt.subplots() ax = df.plot(secondary_y=True, ax=ax) s.plot(legend=True, secondary_y=True, ax=ax) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b487f865b68a4..8eb4141555260 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -603,7 +603,7 @@ def test_other_datetime_unit(self): 'datetime64[ns]']: df2 = s.astype(dtype).to_frame('days') - # coerces to datetime64[ns], thus sholuld not be affected + # coerces to datetime64[ns], thus should not be affected assert df2['days'].dtype == 'datetime64[ns]' result = df1.merge(df2, left_on='entity_id', right_index=True) @@ -1243,9 +1243,9 @@ def test_merge_incompat_infer_boolean_object(self): ([0, 1], pd.Series([False, True], dtype=bool)), ]) def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): - # these are explicity allowed incompat merges, that pass thru + # these are explicitly allowed incompat merges, that pass thru # the result type is dependent on if the values on the rhs are - # inferred, otherwise these will be coereced to object + # inferred, otherwise these will be coerced to object df1 = DataFrame({'A': df1_vals}) df2 = DataFrame({'A': df2_vals}) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 1420d4420e430..4f65251ebd923 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2198,7 +2198,7 @@ def test_categorical_concat(self, sort): def test_categorical_concat_gh7864(self): # GH 7864 - # 
make sure ordering is preserverd + # make sure ordering is preserved df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')}) df["grade"] = Categorical(df["raw_grade"]) df['grade'].cat.set_categories(['e', 'a', 'b']) @@ -2265,7 +2265,7 @@ def test_categorical_index_preserver(self): }).set_index('B') tm.assert_frame_equal(result, expected) - # wrong catgories + # wrong categories df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe')) }).set_index('B') msg = "categories must match existing categories when appending" diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index f10876531e66a..469072970133d 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -213,7 +213,7 @@ def test_conversion(self): assert isinstance(td64, np.timedelta64) - # this is NOT equal and cannot be roundtriped (because of the nanos) + # this is NOT equal and cannot be roundtripped (because of the nanos) td = Timedelta('1 days, 10:11:12.012345678') assert td != td.to_pytimedelta() diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 657008856482f..8b13458050ce8 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -112,7 +112,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): ]) @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) def test_round_minute_freq(self, test_input, freq, expected, rounder): - # Ensure timestamps that shouldnt round dont! + # Ensure timestamps that shouldn't round dont! 
# GH#21262 dt = Timestamp(test_input) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5328a58e3fbff..94050f7526444 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -634,7 +634,7 @@ def test_timedelta64_nan(self): # td np.float64 -> another float-object somewher on + # casting to -> np.float64 -> another float-object somewhere on # the way could lead jepardize this behavior comps = [np.nan] # could be casted to float64 values = [np.nan] diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d24ed9433f4f7..d82b205803b09 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -206,7 +206,7 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): result = getattr(o, op) - # these couuld be series, arrays or scalars + # these could be series, arrays or scalars if isinstance(result, Series) and isinstance(expected, Series): tm.assert_series_equal(result, expected) elif isinstance(result, Index) and isinstance(expected, Index): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e8d6b3bcaa77f..aa9c9bb05f877 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1840,7 +1840,7 @@ def test_sort_index_and_reconstruction(self): # 15622 # lexsortedness should be identical - # across MultiIndex consruction methods + # across MultiIndex construction methods df = DataFrame([[1, 1], [2, 2]], index=list('ab')) expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 9524a78dae16c..4dfdd1c96728b 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1387,7 +1387,7 @@ def quantile_func(x): def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior - # is analogus to Numpy's percentile + # is analogous to Numpy's percentile row = 
10 col = 5 idx = pd.date_range('20100101', periods=row, freq='B') @@ -2003,7 +2003,7 @@ def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True # note that we may construct the 1st level of the MI - # in a non-motononic way, so compare accordingly + # in a non-monotonic way, so compare accordingly results = [] for i, df in enumerate(self.df1s): result = f(df) @@ -2154,7 +2154,7 @@ def is_constant(x): def no_nans(x): return x.notna().all().all() - # data is a tuple(object, is_contant, no_nans) + # data is a tuple(object, is_constant, no_nans) data = create_series() + create_dataframes() return [(x, is_constant(x), no_nans(x)) for x in data] diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index a1ad792e57bde..151cd2a42ecef 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -813,7 +813,7 @@ def test_call(self): assert self.offset4(self.d) == datetime(2014, 6, 30, 14) def test_sub(self): - # we have to override test_sub here becasue self.offset2 is not + # we have to override test_sub here because self.offset2 is not # defined as self._offset(2) off = self.offset2 msg = "Cannot subtract datetime from offset" diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 00837d36d9508..ac20ad1669638 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -116,7 +116,7 @@ class DateOffset(BaseOffset): off specifying n in the keywords you use, but regardless it is there for you. n is needed for DateOffset subclasses. - DateOffets work as follows. Each offset specify a set of dates + DateOffset work as follows. Each offset specify a set of dates that conform to the DateOffset. For example, Bday defines this set to be the set of dates that are weekdays (M-F). 
To test if a date is in the set of a DateOffset dateOffset we can use the diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 107c17c5253fb..f14b202b034d6 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1666,7 +1666,7 @@ def index_subclass_makers_generator(): def all_timeseries_index_generator(k=10): """Generator which can be iterated over to get instances of all the classes - which represent time-seires. + which represent time-series. Parameters ---------- @@ -1793,7 +1793,7 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, # pass None to index constructor for no name names = None - # make singelton case uniform + # make singleton case uniform if isinstance(names, str) and nlevels == 1: names = [names] @@ -1872,7 +1872,7 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, N < idx_nlevels, for just the first N levels. If ndupe doesn't divide nrows/ncol, the last label might have lower multiplicity. dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjuncion with a custom `data_gen_f` + have more control in conjunction with a custom `data_gen_f` r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". If idx_type is not None, `idx_nlevels` must be 1. "i"/"f" creates an integer/float index, diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 64eaf45376b2f..dddd5eb1f1eab 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -150,7 +150,7 @@ def error(code, **kwargs): code : str Error code. message : str - Error message with varaibles replaced. + Error message with variables replaced. 
""" return (code, ERROR_MSGS[code].format(**kwargs)) From 8ea2d087cda0f40a4e41ce108a32859d51b4d69f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Jun 2019 02:47:46 +0200 Subject: [PATCH 28/34] BUG: fix empty Series repr for subclasses (#27001) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/format.py | 3 ++- pandas/tests/series/test_subclass.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 109005364fca6..d10f9567188d1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -793,6 +793,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) +- Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`). .. _whatsnew_0.250.contributors: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b2ef45b15e549..152e9a2e9ab3d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -257,7 +257,8 @@ def to_string(self): footer = self._get_footer() if len(series) == 0: - return 'Series([], ' + footer + ')' + return "{name}([], {footer})".format( + name=self.series.__class__.__name__, footer=footer) fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 563a94f4588cb..b47d339f5a5f2 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -39,6 +39,9 @@ def test_subclass_unstack(self): tm.assert_frame_equal(res, exp) + def test_subclass_empty_repr(self): + assert 'SubclassedSeries' in repr(tm.SubclassedSeries()) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesSubclassing: From 
2da45994b63062396a2b75ead738b5df8ecc8070 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 05:19:44 -0700 Subject: [PATCH 29/34] TST: parametrize pytable test (#27032) --- pandas/core/arrays/base.py | 7 ++- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/tests/io/pytables/test_pytables.py | 74 +++++++++++------------ 4 files changed, 41 insertions(+), 44 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 20fd582179dc6..51ad01dd6b369 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -115,6 +115,7 @@ class ExtensionArray: # ------------------------------------------------------------------------ # Constructors # ------------------------------------------------------------------------ + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): """ @@ -286,6 +287,7 @@ def __iter__(self): # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ + @property def dtype(self) -> ExtensionDtype: """ @@ -319,6 +321,7 @@ def nbytes(self) -> int: # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): """ Cast to a NumPy array with 'dtype'. @@ -479,8 +482,7 @@ def dropna(self): def shift( self, periods: int = 1, - fill_value: object = None, - ) -> ABCExtensionArray: + fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. 
@@ -836,6 +838,7 @@ def copy(self, deep: bool = False) -> ABCExtensionArray: # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ + def __repr__(self): from pandas.io.formats.printing import format_object_summary diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 91be320a3e674..1b4e001620286 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -480,7 +480,7 @@ def first_not_none(values): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here so = self._selected_obj - if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()): + if so.ndim == 2 and so.dtypes.apply(is_datetimelike).any(): result = result.apply( lambda x: to_numeric(x, errors='ignore')) date_cols = self._selected_obj.select_dtypes( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7fe34279c0482..592c385dd87ec 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1027,7 +1027,7 @@ def set(self, item, value): value_is_extension_type = (is_extension_type(value) or is_extension_array_dtype(value)) - # categorical/spares/datetimetz + # categorical/sparse/datetimetz if value_is_extension_type: def value_getitem(placement): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 413c11ba2f9fe..be318ede2df4a 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -1070,47 +1070,41 @@ def test_encoding(self): result = store.select('df', Term('columns=A', encoding='ascii')) tm.assert_frame_equal(result, expected) - def test_latin_encoding(self): - - values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', 
b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] - - def _try_decode(x, encoding='latin-1'): - try: - return x.decode(encoding) - except AttributeError: - return x - # not sure how to remove latin-1 from code in python 2 and 3 - values = [[_try_decode(x) for x in y] for y in values] - - examples = [] - for dtype in ['category', object]: - for val in values: - examples.append(pd.Series(val, dtype=dtype)) - - def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): - with ensure_clean_path(self.path) as store: - s.to_hdf(store, key, format='table', encoding=encoding, - nan_rep=nan_rep) - retr = read_hdf(store, key) - s_nan = s.replace(nan_rep, np.nan) - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - assert_series_equal(s_nan, retr, check_dtype=False, - check_categorical=False) - else: - assert_series_equal(s_nan, retr) - - for s in examples: - roundtrip(s) + @pytest.mark.parametrize('val', [ + [b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c'] + ]) + @pytest.mark.parametrize('dtype', ['category', object]) + def test_latin_encoding(self, dtype, val): + enc = 'latin-1' + nan_rep = '' + key = 'data' + + val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] + ser = pd.Series(val, dtype=dtype) + + with ensure_clean_path(self.path) as store: + ser.to_hdf(store, key, format='table', encoding=enc, + nan_rep=nan_rep) + retr = read_hdf(store, key) + + s_nan = ser.replace(nan_rep, np.nan) + + if is_categorical_dtype(s_nan): + assert is_categorical_dtype(retr) + assert_series_equal(s_nan, retr, check_dtype=False, + 
check_categorical=False) + else: + assert_series_equal(s_nan, retr) + # FIXME: don't leave commented-out # fails: # for x in examples: # roundtrip(s, nan_rep=b'\xf8\xfc') From 606178a91c4003f589ec64b08f853164fd45ada2 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Tue, 25 Jun 2019 20:34:25 +0800 Subject: [PATCH 30/34] Remove pandas.core.index.datetimelike from MyPy Blacklist (#26280) --- mypy.ini | 5 +---- pandas/core/algorithms.py | 3 ++- pandas/core/arrays/base.py | 4 ++-- pandas/core/indexes/datetimelike.py | 24 ++++++++++++++---------- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mypy.ini b/mypy.ini index eea6a3b551677..f8b37ee5b8663 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,7 +3,4 @@ ignore_missing_imports=True follow_imports=silent [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True - -[mypy-pandas.core.indexes.datetimelike] -ignore_errors=True +ignore_errors=True \ No newline at end of file diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 932ac71a23ed0..ff1313c21d96f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,6 +3,7 @@ intended for public consumption """ from textwrap import dedent +from typing import Dict from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -27,7 +28,7 @@ from pandas.core import common as com -_shared_docs = {} +_shared_docs = {} # type: Dict[str, str] # --------------- # diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 51ad01dd6b369..d1dfb6b5e8599 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,7 +6,7 @@ without warning. """ import operator -from typing import Any, Callable, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np @@ -26,7 +26,7 @@ _not_implemented_message = "{} does not implement {}." 
-_extension_array_shared_docs = dict() +_extension_array_shared_docs = dict() # type: Dict[str, str] class ExtensionArray: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index abe2853c75c87..7c90fb11aa1bf 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,6 +2,7 @@ Base and utility classes for tseries type pandas objects. """ import operator +from typing import Set import warnings import numpy as np @@ -62,14 +63,17 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index # subclasses bc they are immutable - inferred_freq = cache_readonly(DatetimeLikeArrayMixin.inferred_freq.fget) - _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) - hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) + inferred_freq = cache_readonly( + DatetimeLikeArrayMixin.inferred_freq.fget) # type: ignore + _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore + hasnans = cache_readonly( + DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget) - resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) + _resolution = cache_readonly( + DatetimeLikeArrayMixin._resolution.fget) # type: ignore + resolution = cache_readonly( + DatetimeLikeArrayMixin.resolution.fget) # type: ignore - _box_values = ea_passthrough(DatetimeLikeArrayMixin._box_values) _maybe_mask_results = ea_passthrough( DatetimeLikeArrayMixin._maybe_mask_results) __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) @@ -131,11 +135,11 @@ def _ndarray_values(self): # Abstract data attributes @property - def values(self) -> np.ndarray: + def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. 
return self._data._data - @property + @property # type: ignore # https://github.com/python/mypy/issues/1362 @Appender(DatetimeLikeArrayMixin.asi8.__doc__) def asi8(self): return self._data.asi8 @@ -762,9 +766,9 @@ class DatetimelikeDelegateMixin(PandasDelegate): boxed in an index, after being returned from the array """ # raw_methods : dispatch methods that shouldn't be boxed in an Index - _raw_methods = set() + _raw_methods = set() # type: Set[str] # raw_properties : dispatch properties that shouldn't be boxed in an Index - _raw_properties = set() + _raw_properties = set() # type: Set[str] name = None _data = None From f0919f272d9614058b5ebb5e0664d1ac6f23540f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 05:48:13 -0700 Subject: [PATCH 31/34] BUG: Fix timedelta64+Timestamp, closes #24775 (#26916) --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/_libs/tslibs/c_timestamp.pyx | 12 +++++++++ .../tests/scalar/timestamp/test_arithmetic.py | 20 +++++++++++++++ .../scalar/timestamp/test_comparisons.py | 12 +++++++++ .../tests/scalar/timestamp/test_timestamp.py | 25 ------------------- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d10f9567188d1..a58cdc8c93ab7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -603,6 +603,8 @@ Datetimelike - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) - Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`). 
- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) +- Bug where adding :class:`Timestamp` to a ``np.timedelta64`` object would raise instead of returning a :class:`Timestamp` (:issue:`24775`) +- Bug where comparing a zero-dimensional numpy array containing a ``np.datetime64`` object to a :class:`Timestamp` would incorrectly raise ``TypeError`` (:issue:`26916`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 6bf6b6dcea8dd..f9d1a906207fe 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -55,6 +55,9 @@ def maybe_integer_op_deprecated(obj): cdef class _Timestamp(datetime): + # higher than np.ndarray and np.matrix + __array_priority__ = 100 + def __hash__(_Timestamp self): if self.nanosecond: return hash(self.value) @@ -85,6 +88,15 @@ cdef class _Timestamp(datetime): if ndim == 0: if is_datetime64_object(other): other = self.__class__(other) + elif is_array(other): + # zero-dim array, occurs if try comparison with + # datetime64 scalar on the left hand side + # Unfortunately, for datetime64 values, other.item() + # incorrectly returns an integer, so we need to use + # the numpy C api to extract it.
+ other = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), + other) + other = self.__class__(other) else: return NotImplemented elif is_array(other): diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 21e1dccaefc4b..8310b140b50e0 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -112,3 +112,23 @@ def test_addition_subtraction_preserve_frequency(self): td64 = np.timedelta64(1, 'D') assert (ts + td64).freq == original_freq assert (ts - td64).freq == original_freq + + @pytest.mark.parametrize('td', [Timedelta(hours=3), + np.timedelta64(3, 'h'), + timedelta(hours=3)]) + def test_radd_tdscalar(self, td): + # GH#24775 timedelta64+Timestamp should not raise + ts = Timestamp.now() + assert td + ts == ts + td + + @pytest.mark.parametrize('other,expected_difference', [ + (np.timedelta64(-123, 'ns'), -123), + (np.timedelta64(1234567898, 'ns'), 1234567898), + (np.timedelta64(-123, 'us'), -123000), + (np.timedelta64(-123, 'ms'), -123000000) + ]) + def test_timestamp_add_timedelta64_unit(self, other, expected_difference): + ts = Timestamp(datetime.utcnow()) + result = ts + other + valdiff = result.value - ts.value + assert valdiff == expected_difference diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 763cfc23ea832..b572b4607108c 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -156,6 +156,18 @@ def test_timestamp_compare_with_early_datetime(self): assert stamp < datetime(2700, 1, 1) assert stamp <= datetime(2700, 1, 1) + def test_compare_zerodim_array(self): + # GH#26916 + ts = Timestamp.now() + dt64 = np.datetime64('2016-01-01', 'ns') + arr = np.array(dt64) + assert arr.ndim == 0 + + result = arr < ts + assert result is True + result = arr > ts + assert result is False + def 
test_rich_comparison_with_unsupported_type(): # Comparisons with unsupported objects should return NotImplemented diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4b6b0dac916c6..b9946796a4e1f 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -796,31 +796,6 @@ def test_tz_conversion_freq(self, tz_naive_fixture): class TestTimestampNsOperations: - def setup_method(self, method): - self.timestamp = Timestamp(datetime.utcnow()) - - def assert_ns_timedelta(self, modified_timestamp, expected_value): - value = self.timestamp.value - modified_value = modified_timestamp.value - - assert modified_value - value == expected_value - - def test_timedelta_ns_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'), - -123) - - def test_timedelta_ns_based_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64( - 1234567898, 'ns'), 1234567898) - - def test_timedelta_us_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'), - -123000) - - def test_timedelta_ms_arithmetic(self): - time = self.timestamp + np.timedelta64(-123, 'ms') - self.assert_ns_timedelta(time, -123000000) - def test_nanosecond_string_parsing(self): ts = Timestamp('2013-05-01 07:15:45.123456789') # GH 7878 From f5587633eec08212737158df98e3afbe3afd06f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 09:11:51 -0700 Subject: [PATCH 32/34] TST/REF: parametrize arithmetic tests, simplify parts of core.ops (#26799) --- pandas/core/ops.py | 53 +++++++++---------- pandas/tests/arithmetic/test_datetime64.py | 32 +++++++---- pandas/tests/arithmetic/test_period.py | 18 +++++-- pandas/tests/arithmetic/test_timedelta64.py | 19 +++++-- .../offsets/test_offsets_properties.py | 8 ++- 5 files changed, 81 insertions(+), 49 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 
86a255321f827..0b9e56fd19556 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1077,7 +1077,7 @@ def fill_binop(left, right, fill_value): return left, right -def mask_cmp_op(x, y, op, allowed_types): +def mask_cmp_op(x, y, op): """ Apply the function `op` to only non-null points in x and y. @@ -1086,16 +1086,14 @@ def mask_cmp_op(x, y, op, allowed_types): x : array-like y : array-like op : binary operation - allowed_types : class or tuple of classes Returns ------- result : ndarray[bool] """ - # TODO: Can we make the allowed_types arg unnecessary? xrav = x.ravel() result = np.empty(x.size, dtype=bool) - if isinstance(y, allowed_types): + if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notna(xrav) & notna(yrav) result[mask] = op(np.array(list(xrav[mask])), @@ -1633,39 +1631,38 @@ def _arith_method_SERIES(cls, op, special): if op in [divmod, rdivmod] else _construct_result) def na_op(x, y): - import pandas.core.computation.expressions as expressions - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result - - def safe_na_op(lvalues, rvalues): """ - return the result of evaluating na_op on the passed in values + Return the result of evaluating op on the passed in values. - try coercion to object type if the native types are not compatible + If native types are not compatible, try coercion to object dtype.
Parameters ---------- - lvalues : array-like - rvalues : array-like + x : array-like + y : array-like or scalar + + Returns + ------- + array-like Raises ------ - TypeError: invalid operation + TypeError : invalid operation """ + import pandas.core.computation.expressions as expressions try: - with np.errstate(all='ignore'): - return na_op(lvalues, rvalues) - except Exception: - if is_object_dtype(lvalues): - return libalgos.arrmap_object(lvalues, - lambda x: op(x, rvalues)) + result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) + except TypeError: + result = masked_arith_op(x, y, op) + except Exception: # TODO: more specific? + if is_object_dtype(x): + return libalgos.arrmap_object(x, + lambda val: op(val, y)) raise + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) + return result + def wrapper(left, right): if isinstance(right, ABCDataFrame): return NotImplemented @@ -1713,7 +1710,8 @@ def wrapper(left, right): if isinstance(rvalues, ABCSeries): rvalues = rvalues.values - result = safe_na_op(lvalues, rvalues) + with np.errstate(all='ignore'): + result = na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None) @@ -2136,7 +2134,6 @@ def na_op(x, y): result = masked_arith_op(x, y, op) result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result if op_name in _op_descriptions: @@ -2183,7 +2180,7 @@ def na_op(x, y): with np.errstate(invalid='ignore'): result = op(x, y) except TypeError: - result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) + result = mask_cmp_op(x, y, op) return result doc = _flex_comp_doc_FRAME.format(op_name=op_name, diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index afd29852fea7e..64b4e162483f1 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -37,6 +37,27 @@ def assert_all(obj): # 
------------------------------------------------------------------ # Comparisons +class TestDatetime64ArrayLikeComparisons: + # Comparison tests for datetime64 vectors fully parametrized over + # DataFrame/Series/DatetimeIndex/DatetimeArray. Ideally all comparison + # tests will eventually end up here. + + def test_compare_zerodim(self, tz_naive_fixture, box_with_array): + # Test comparison with zero-dimensional array is unboxed + tz = tz_naive_fixture + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + dti = date_range('20130101', periods=3, tz=tz) + + other = np.array(dti.to_numpy()[0]) + + # FIXME: ValueError with transpose on tzaware + dtarr = tm.box_expected(dti, box, transpose=False) + result = dtarr <= other + expected = np.array([True, False, False]) + expected = tm.box_expected(expected, xbox, transpose=False) + tm.assert_equal(result, expected) + class TestDatetime64DataFrameComparison: @pytest.mark.parametrize('timestamps', [ @@ -339,17 +360,6 @@ def test_comparison_tzawareness_compat(self, op): class TestDatetimeIndexComparisons: - # TODO: parametrize over box - def test_compare_zerodim(self, tz_naive_fixture): - # Test comparison with zero-dimensional array is unboxed - tz = tz_naive_fixture - dti = date_range('20130101', periods=3, tz=tz) - - other = np.array(dti.to_numpy()[0]) - result = dti <= other - expected = np.array([True, False, False]) - tm.assert_numpy_array_equal(result, expected) - # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate @pytest.mark.parametrize("op", [ operator.eq, operator.ne, operator.gt, operator.lt, diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index bc1b78bf944d1..413d58d9429e7 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -20,17 +20,27 @@ # Comparisons -class TestPeriodIndexComparisons: +class TestPeriodArrayLikeComparisons: + # Comparison tests for
PeriodDtype vectors fully parametrized over + # DataFrame/Series/PeriodIndex/PeriodArray. Ideally all comparison + # tests will eventually end up here. - # TODO: parameterize over boxes - def test_compare_zerodim(self): + def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + pi = pd.period_range('2000', periods=4) other = np.array(pi.to_numpy()[0]) + pi = tm.box_expected(pi, box_with_array) result = pi <= other expected = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(result, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + +class TestPeriodIndexComparisons: + # TODO: parameterize over boxes @pytest.mark.parametrize("other", ["2017", 2017]) def test_eq(self, other): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 047900c3d7586..22b5fd452d661 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -31,22 +31,33 @@ def get_upcast_box(box, vector): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons -class TestTimedelta64ArrayComparisons: - # TODO: All of these need to be parametrized over box +class TestTimedelta64ArrayLikeComparisons: + # Comparison tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all comparison + # tests will eventually end up here. 
- def test_compare_timedelta64_zerodim(self): + def test_compare_timedelta64_zerodim(self, box_with_array): # GH#26689 should unbox when comparing with zerodim array + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + tdi = pd.timedelta_range('2H', periods=4) other = np.array(tdi.to_numpy()[0]) + tdi = tm.box_expected(tdi, box) res = tdi <= other expected = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(res, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) with pytest.raises(TypeError): # zero-dim of wrong dtype should still raise tdi >= np.array(4) + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box + def test_compare_timedelta_series(self): # regression test for GH#5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 50be2deca4d30..271f4ceef5f49 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -71,7 +71,10 @@ def test_on_offset_implementations(dt, offset): assert offset.onOffset(dt) == (compare == dt) -@pytest.mark.xfail +@pytest.mark.xfail(reason="res_v2 below is incorrect, needs to use the " + "commented-out version with tz_localize. 
" + "But with that fix in place, hypothesis then " + "has errors in timezone generation.") @given(gen_yqm_offset, gen_date_range) def test_apply_index_implementations(offset, rng): # offset.apply_index(dti)[i] should match dti[i] + offset @@ -82,6 +85,7 @@ def test_apply_index_implementations(offset, rng): res = rng + offset res_v2 = offset.apply_index(rng) + # res_v2 = offset.apply_index(rng.tz_localize(None)).tz_localize(rng.tz) assert (res == res_v2).all() assert res[0] == rng[0] + offset @@ -93,7 +97,7 @@ def test_apply_index_implementations(offset, rng): # TODO: Check randomly assorted entries, not just first/last -@pytest.mark.xfail +@pytest.mark.xfail # TODO: reason? @given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and From c9182df84736ce060c30d386c9f3a97614ca7778 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 09:13:19 -0700 Subject: [PATCH 33/34] CLN: Remove never-True Block.is_sparse (#27037) --- pandas/core/dtypes/concat.py | 5 +---- pandas/core/internals/blocks.py | 34 +++++++++---------------------- pandas/core/internals/concat.py | 2 -- pandas/core/internals/managers.py | 20 +++++++----------- 4 files changed, 18 insertions(+), 43 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a01ba7fc94f22..242885c7a9679 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -73,10 +73,7 @@ def _get_series_result_type(result, objs=None): return DataFrame # otherwise it is a SingleBlockManager (axis = 0) - if result._block.is_sparse: - return SparseSeries - else: - return objs[0]._constructor + return objs[0]._constructor def _get_frame_result_type(result, objs): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b..92ea936944a3c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -62,9 +62,7 @@ class Block(PandasObject): is_bool = 
False is_object = False is_categorical = False - is_sparse = False is_extension = False - _box_to_block_values = True _can_hold_na = False _can_consolidate = True _verify_integrity = True @@ -182,10 +180,6 @@ def get_values(self, dtype=None): def to_dense(self): return self.values.view() - @property - def _na_value(self): - return np.nan - @property def fill_value(self): return np.nan @@ -1189,8 +1183,6 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): # sparse is treated like an ndarray, but needs .get_values() shaping values = self.values - if self.is_sparse: - values = self.get_values() if fill_tuple is None: fill_value = self.fill_value @@ -1411,6 +1403,9 @@ def quantile(self, qs, interpolation='linear', axis=0): ------- Block """ + # We should always have ndim == 2 because Series dispatches to DataFrame + assert self.ndim == 2 + if self.is_datetimetz: # TODO: cleanup this special case. # We need to operate on i8 values for datetimetz @@ -1420,8 +1415,7 @@ def quantile(self, qs, interpolation='linear', axis=0): # TODO: NonConsolidatableMixin shape # Usual shape inconsistencies for ExtensionBlocks - if self.ndim > 1: - values = values[None, :] + values = values[None, :] else: values = self.get_values() values, _ = self._try_coerce_args(values, values) @@ -1433,14 +1427,11 @@ def quantile(self, qs, interpolation='linear', axis=0): qs = [qs] if is_empty: - if self.ndim == 1: - result = self._na_value - else: - # create the array of na_values - # 2d len(values) * len(qs) - result = np.repeat(np.array([self.fill_value] * len(qs)), - len(values)).reshape(len(values), - len(qs)) + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat(np.array([self.fill_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: # asarray needed for Sparse, see GH#24600 # TODO: Why self.values and not values?
@@ -1451,8 +1442,7 @@ def quantile(self, qs, interpolation='linear', axis=0): interpolation=interpolation) result = np.array(result, copy=False) - if self.ndim > 1: - result = result.T + result = result.T if orig_scalar and not lib.is_scalar(result): # result could be scalar in case with is_empty and self.ndim == 1 @@ -2024,10 +2014,6 @@ class DatetimeLikeBlockMixin: def _holder(self): return DatetimeArray - @property - def _na_value(self): - return tslibs.NaT - @property def fill_value(self): return tslibs.iNaT diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d92c15e1d6f93..8f699ae24230d 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -187,8 +187,6 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_categorical', False): pass - elif getattr(self.block, 'is_sparse', False): - pass elif getattr(self.block, 'is_extension', False): pass else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 592c385dd87ec..26b6920c119dd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -102,16 +102,11 @@ def __init__(self, self.blocks = tuple(blocks) # type: Tuple[Block, ...] 
for block in blocks: - if block.is_sparse: - if len(block.mgr_locs) != 1: - raise AssertionError("Sparse block refers to multiple " - "items") - else: - if self.ndim != block.ndim: - raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) + if self.ndim != block.ndim: + raise AssertionError( + 'Number of Block dimensions ({block}) must equal ' + 'number of axes ({self})'.format(block=block.ndim, + self=self.ndim)) if do_integrity_check: self._verify_integrity() @@ -966,7 +961,7 @@ def iget(self, i, fastpath=True): """ block = self.blocks[self._blknos[i]] values = block.iget(self._blklocs[i]) - if not fastpath or not block._box_to_block_values or values.ndim != 1: + if not fastpath or values.ndim != 1: return values # fastpath shortcut for select a single-dim from a 2-dim BM @@ -1820,8 +1815,7 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block] -) -> Optional[Union[np.dtype, ExtensionDtype]]: + blocks: List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. Parameters From 6b7f5a2ac9647a7ef16a6bc6ff456c7764440051 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 25 Jun 2019 12:18:29 -0500 Subject: [PATCH 34/34] re-revert --- doc/source/whatsnew/v0.25.0.rst | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ccfb30fe5ad7a..01aa9ad97c541 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -74,6 +74,36 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca See :ref:`groupby.aggregate.named` for more. +.. 
_whatsnew_0250.enhancements.multi_index_repr: + +Better repr for MultiIndex +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Printing of :class:`MultiIndex` instances now shows tuples of each row and ensures +that the tuple items are vertically aligned, so it's now easier to understand +the structure of the ``MultiIndex`` (:issue:`13480`). + +The repr now looks like this: + +.. ipython:: python + + pd.MultiIndex.from_product([['a', 'abc'], range(500)]) + +Previously, outputting a :class:`MultiIndex` printed all the ``levels`` and +``codes`` of the ``MultiIndex``, which was visually unappealing and made +the output more difficult to navigate. For example (limiting the range to 5): + +.. code-block:: ipython + + In [1]: pd.MultiIndex.from_product([['a', 'abc'], range(5)]) + Out[1]: MultiIndex(levels=[['a', 'abc'], [0, 1, 2, 3, 4]], + ...: codes=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]]) + +In the new repr, all values will be shown if the number of rows is smaller +than :attr:`options.display.max_seq_items` (default: 100 items). Horizontally, +the output will truncate if it's wider than :attr:`options.display.width` +(default: 80 characters). + .. _whatsnew_0250.enhancements.other: Other Enhancements @@ -413,7 +443,7 @@ If installed, we now require: | pytest (dev) | 4.0.2 | | +-----------------+-----------------+----------+ -For `optional libraries `_ the general recommendation is to use the latest version. +For `optional libraries `_ the general recommendation is to use the latest version. The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. @@ -505,6 +535,8 @@ Other Deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`).
- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) +- :meth:`Series.to_sparse`, :meth:`DataFrame.to_sparse`, :meth:`Series.to_dense` and :meth:`DataFrame.to_dense` are deprecated and will be removed in a future version (:issue:`26557`). + .. _whatsnew_0250.prior_deprecations: