From 6eab0e4ef8c5873ffbbdc956ef4d7ab3ce5388f7 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 14 Jun 2022 03:14:03 +0200 Subject: [PATCH] Avoid some FutureWarnings and DeprecationWarnings --- src/datasets/arrow_dataset.py | 4 ++-- src/datasets/features/features.py | 2 +- src/datasets/formatting/formatting.py | 4 ++-- src/datasets/formatting/jax_formatter.py | 2 +- src/datasets/formatting/tf_formatter.py | 4 +--- src/datasets/formatting/torch_formatter.py | 2 +- src/datasets/utils/stratify.py | 2 +- tests/features/test_array_xd.py | 2 +- tests/test_builder.py | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 22150f4ce66..0d48b87cb1f 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -280,7 +280,7 @@ def _get_output_signature( else: np_arrays.append(np.array(array)) - if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool: + if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool: tf_dtype = tf.int64 np_dtype = np.int64 elif np.issubdtype(np_arrays[0].dtype, np.number): @@ -3663,7 +3663,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature": return _float_feature([values.item()]) elif np.issubdtype(values.dtype, np.integer): return _int64_feature([values.item()]) - elif np.issubdtype(values.dtype, np.str): + elif np.issubdtype(values.dtype, str): return _bytes_feature([values.item().encode()]) else: raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized") diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 3e8301df08b..9ddad51e2e9 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -810,7 +810,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray, def take( self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None ) -> "PandasArrayExtensionArray": - indices: np.ndarray = np.asarray(indices, dtype=np.int) + indices: np.ndarray = np.asarray(indices, dtype=int) if allow_fill: fill_value = ( self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type) diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py index f857a53a992..ecc5b5e6606 100644 --- a/src/datasets/formatting/formatting.py +++ b/src/datasets/formatting/formatting.py @@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray: array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist() if len(array) > 0: if any( - (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape)) + (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)) or (isinstance(x, float) and np.isnan(x)) for x in array ): - return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object}) + return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object}) return np.array(array, copy=False, **self.np_array_kwargs) diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index dffe37bc5f0..0a554203be5 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # jax arrays cannot be instantied from an array of objects + if data_struct.dtype == object: # jax arrays cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 7e835280c54..d07f1f636cc 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -65,9 +65,7 @@ def _tensorize(self, value): def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): - if ( - data_struct.dtype == np.object - ): # tensorflow tensors can sometimes be instantied from an array of objects + if data_struct.dtype == object: # tensorflow tensors can sometimes be instantied from an array of objects try: return self._tensorize(data_struct) except ValueError: diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index 3106a024920..c5a7d3c214f 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -46,7 +46,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # pytorch tensors cannot be instantied from an array of objects + if data_struct.dtype == object: # pytorch tensors cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/utils/stratify.py b/src/datasets/utils/stratify.py index 58ea04f2f85..3a72f6fc3f2 100644 --- a/src/datasets/utils/stratify.py +++ b/src/datasets/utils/stratify.py @@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng): need_to_add -= add_now if need_to_add == 0: break - return floored.astype(np.int) + return floored.astype(int) def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10): diff --git a/tests/features/test_array_xd.py b/tests/features/test_array_xd.py index ae007abbe00..2f6d9d94009 100644 --- a/tests/features/test_array_xd.py +++ b/tests/features/test_array_xd.py @@ -335,7 +335,7 @@ def test_array_xd_with_none(): dummy_array = np.array([[1, 2], [3, 4]], dtype="int32") dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features) arr = NumpyArrowExtractor().extract_column(dataset._data) - assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,) + assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,) np.testing.assert_equal(arr[0], dummy_array) np.testing.assert_equal(arr[2], dummy_array) assert np.isnan(arr[1]) # a single np.nan value - np.all not needed diff --git a/tests/test_builder.py b/tests/test_builder.py index 853859d40ca..c7e08e0f01b 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -837,8 +837,8 @@ def _generate_examples(self): "builder_class, kwargs", [ (DummyBuilderWithVersion, {}), - (DummyBuilderWithBuilderConfigs, {"name": "custom"}), - (DummyBuilderWithCustomBuilderConfigs, {"name": "20220501.en"}), + (DummyBuilderWithBuilderConfigs, {"config_name": "custom"}), + (DummyBuilderWithCustomBuilderConfigs, {"config_name": "20220501.en"}), (DummyBuilderWithCustomBuilderConfigs, {"date": "20220501", "language": "ca"}), ], )