diff --git a/.github/hub/update_hub_repositories.py b/.github/hub/update_hub_repositories.py index 875cbf80bd2..c923583ba7f 100644 --- a/.github/hub/update_hub_repositories.py +++ b/.github/hub/update_hub_repositories.py @@ -1,4 +1,3 @@ -import base64 import distutils.dir_util import logging import os diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml index 8403f4ffadb..c926a708b5d 100644 --- a/.github/workflows/benchmarks.yaml +++ b/.github/workflows/benchmarks.yaml @@ -3,13 +3,16 @@ on: [push] jobs: run: runs-on: [ubuntu-latest] - container: docker://dvcorg/cml-py3:latest + container: docker://dvcorg/cml:latest steps: - uses: actions/checkout@v2 - name: cml_run env: repo_token: ${{ secrets.GITHUB_TOKEN }} run: | + # See https://github.com/actions/checkout/issues/760 + git config --global --add safe.directory /__w/datasets/datasets + # Your ML workflow goes here pip install --upgrade pip diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4f14085ee4..5513c1c6397 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.6" + python-version: "3.7" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -49,13 +49,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Set up Python 3.6 - if: ${{ matrix.os == 'ubuntu-latest' }} - uses: actions/setup-python@v4 - with: - python-version: 3.6 - name: Set up Python 3.7 - if: ${{ matrix.os == 'windows-latest' }} uses: actions/setup-python@v4 with: python-version: 3.7 @@ -63,7 +57,7 @@ jobs: run: python -m pip install --upgrade pip - name: Pin setuptools-scm if: ${{ matrix.os == 'ubuntu-latest' }} - run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2" + run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2" - name: Install dependencies run: | pip install .[tests] diff --git a/Makefile b/Makefile index e3615d44ed0..b7936753dba 100644 --- a/Makefile +++ b/Makefile @@ -3,14 +3,14 @@ # Check that source code meets quality standards quality: - black --check --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics + black --check --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics isort --check-only tests src benchmarks datasets/**/*.py metrics flake8 tests src benchmarks datasets/**/*.py metrics # Format source code automatically style: - black --line-length 119 --target-version py36 tests src benchmarks datasets/**/*.py metrics + black --line-length 119 --target-version py37 tests src benchmarks datasets/**/*.py metrics isort tests src benchmarks datasets/**/*.py metrics # Run tests for the library diff --git a/additional-tests-requirements.txt b/additional-tests-requirements.txt index a827c308c9f..00b5b8d62a3 100644 --- a/additional-tests-requirements.txt +++ b/additional-tests-requirements.txt @@ -1,4 +1,4 @@ -unbabel-comet>=1.0.0;python_version>'3.6' +unbabel-comet>=1.0.0 git+https://github.com/google-research/bleurt.git git+https://github.com/ns-moosavi/coval.git git+https://github.com/hendrycks/math.git diff --git a/docs/source/installation.md b/docs/source/installation.md index c0a1590063e..fdb8359de3c 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,6 +1,6 @@ # Installation -Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.6+**. +Before you start, you'll need to setup your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.7+**. diff --git a/setup.py b/setup.py index b058554eaed..5b6db65f0c7 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,6 @@ Then push the change with a message 'set dev version' """ -import os from setuptools import find_packages, setup @@ -74,8 +73,6 @@ "requests>=2.19.0", # progress bars in download and scripts "tqdm>=4.62.1", - # dataclasses for Python versions that don't have it - "dataclasses;python_version<'3.7'", # for fast hashing "xxhash", # for better multiprocessing @@ -105,7 +102,7 @@ BENCHMARKS_REQUIRE = [ "numpy==1.18.5", "tensorflow==2.3.0", - "torch==1.6.0", + "torch==1.7.1", "transformers==3.0.2", ] @@ -128,7 +125,7 @@ "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1 "tensorflow>=2.3,!=2.6.0,!=2.6.1", "torch", - "torchaudio", + "torchaudio<0.12.0", "soundfile", "transformers", # datasets dependencies @@ -165,8 +162,6 @@ "texttable>=1.6.3", "Werkzeug>=1.0.1", "six~=1.15.0", - # metadata validation - "importlib_resources;python_version<'3.7'", ] TESTS_REQUIRE.extend(VISION_REQURE) @@ -214,6 +209,7 @@ packages=find_packages("src"), package_data={"datasets": ["py.typed", "scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, + python_requires=">=3.7.0", install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, classifiers=[ @@ -224,7 +220,6 @@ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 19cb4e8b845..7cdfeeee4a4 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -19,10 +19,17 @@ __version__ = "2.4.1.dev0" +import platform + import pyarrow from packaging import version +if version.parse(platform.python_version()) < version.parse("3.7"): + raise ImportWarning( + "To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition." + ) + if version.parse(pyarrow.__version__).major < 6: raise ImportWarning( "To use `datasets`, the module `pyarrow>=6.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n" @@ -31,6 +38,7 @@ SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__ +del platform del pyarrow del version diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index a7c1cf53f01..d23896d12f6 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -824,7 +824,7 @@ def __getitem__(self, item: Union[int, slice, np.ndarray]) -> Union[np.ndarray, def take( self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None ) -> "PandasArrayExtensionArray": - indices: np.ndarray = np.asarray(indices, dtype=np.int) + indices: np.ndarray = np.asarray(indices, dtype=int) if allow_fill: fill_value = ( self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type) diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index ae4dbf6d074..39be5526203 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -22,9 +22,7 @@ import functools import itertools import os -import pickle import re -import sys import types from contextlib import contextmanager from dataclasses import fields, is_dataclass @@ -32,7 +30,7 @@ from multiprocessing import Pool, RLock from shutil import disk_usage from types import CodeType, FunctionType -from typing import Callable, ClassVar, Dict, Generic, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import dill @@ -552,19 +550,6 @@ class Pickler(dill.Pickler): dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy()) - def save_global(self, obj, name=None): - if sys.version_info[:2] < (3, 7) and _CloudPickleTypeHintFix._is_parametrized_type_hint( - obj - ): # noqa # pragma: no branch - # Parametrized typing constructs in Python < 3.7 are not compatible - # with type checks and ``isinstance`` semantics. For this reason, - # it is easier to detect them using a duck-typing-based check - # (``_is_parametrized_type_hint``) than to populate the Pickler's - # dispatch with type-specific savers. - _CloudPickleTypeHintFix._save_parametrized_type_hint(self, obj) - else: - dill.Pickler.save_global(self, obj, name=name) - def memoize(self, obj): # don't memoize strings since two identical strings can have different python ids if type(obj) != str: @@ -610,47 +595,6 @@ def proxy(func): return proxy -class _CloudPickleTypeHintFix: - """ - Type hints can't be properly pickled in python < 3.7 - CloudPickle provided a way to make it work in older versions. - This class provide utilities to fix pickling of type hints in older versions. - from https://github.com/cloudpipe/cloudpickle/pull/318/files - """ - - def _is_parametrized_type_hint(obj): - # This is very cheap but might generate false positives. - origin = getattr(obj, "__origin__", None) # typing Constructs - values = getattr(obj, "__values__", None) # typing_extensions.Literal - type_ = getattr(obj, "__type__", None) # typing_extensions.Final - return origin is not None or values is not None or type_ is not None - - def _create_parametrized_type_hint(origin, args): - return origin[args] - - def _save_parametrized_type_hint(pickler, obj): - # The distorted type check sematic for typing construct becomes: - # ``type(obj) is type(TypeHint)``, which means "obj is a - # parametrized TypeHint" - if type(obj) is type(Literal): # pragma: no branch - initargs = (Literal, obj.__values__) - elif type(obj) is type(Final): # pragma: no branch - initargs = (Final, obj.__type__) - elif type(obj) is type(ClassVar): - initargs = (ClassVar, obj.__type__) - elif type(obj) in [type(Union), type(Tuple), type(Generic)]: - initargs = (obj.__origin__, obj.__args__) - elif type(obj) is type(Callable): - args = obj.__args__ - if args[0] is Ellipsis: - initargs = (obj.__origin__, args) - else: - initargs = (obj.__origin__, (list(args[:-1]), args[-1])) - else: # pragma: no cover - raise pickle.PicklingError(f"Datasets pickle Error: Unknown type {type(obj)}") - pickler.save_reduce(_CloudPickleTypeHintFix._create_parametrized_type_hint, initargs, obj=obj) - - @pklregister(CodeType) def _save_code(pickler, obj): """ diff --git a/tests/commands/test_dummy_data.py b/tests/commands/test_dummy_data.py index 81d5ccb4568..7402be4099d 100644 --- a/tests/commands/test_dummy_data.py +++ b/tests/commands/test_dummy_data.py @@ -1,45 +1,24 @@ import os from collections import namedtuple -from dataclasses import dataclass -from packaging import version - -from datasets import config from datasets.commands.dummy_data import DummyDataCommand -if config.PY_VERSION >= version.parse("3.7"): - DummyDataCommandArgs = namedtuple( - "DummyDataCommandArgs", - [ - "path_to_dataset", - "auto_generate", - "n_lines", - "json_field", - "xml_tag", - "match_text_files", - "keep_uncompressed", - "cache_dir", - "encoding", - ], - defaults=[False, 5, None, None, None, False, None, None], - ) -else: - - @dataclass - class DummyDataCommandArgs: - path_to_dataset: str - auto_generate: bool = False - n_lines: int = 5 - json_field: str = None - xml_tag: str = None - match_text_files: str = None - keep_uncompressed: bool = False - cache_dir: str = None - encoding: str = None - - def __iter__(self): - return iter(self.__dict__.values()) +DummyDataCommandArgs = namedtuple( + "DummyDataCommandArgs", + [ + "path_to_dataset", + "auto_generate", + "n_lines", + "json_field", + "xml_tag", + "match_text_files", + "keep_uncompressed", + "cache_dir", + "encoding", + ], + defaults=[False, 5, None, None, None, False, None, None], +) class MockDummyDataCommand(DummyDataCommand): diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py index d169afc43dd..88e665fa54c 100644 --- a/tests/commands/test_test.py +++ b/tests/commands/test_test.py @@ -1,46 +1,26 @@ import json import os from collections import namedtuple -from dataclasses import dataclass - -from packaging import version from datasets import config from datasets.commands.test import TestCommand -if config.PY_VERSION >= version.parse("3.7"): - _TestCommandArgs = namedtuple( - "_TestCommandArgs", - [ - "dataset", - "name", - "cache_dir", - "data_dir", - "all_configs", - "save_infos", - "ignore_verifications", - "force_redownload", - "clear_cache", - ], - defaults=[None, None, None, False, False, False, False, False], - ) -else: - - @dataclass - class _TestCommandArgs: - dataset: str - name: str = None - cache_dir: str = None - data_dir: str = None - all_configs: bool = False - save_infos: bool = False - ignore_verifications: bool = False - force_redownload: bool = False - clear_cache: bool = False - - def __iter__(self): - return iter(self.__dict__.values()) +_TestCommandArgs = namedtuple( + "_TestCommandArgs", + [ + "dataset", + "name", + "cache_dir", + "data_dir", + "all_configs", + "save_infos", + "ignore_verifications", + "force_redownload", + "clear_cache", + ], + defaults=[None, None, None, False, False, False, False, False], +) def test_test_command(dataset_loading_script_dir): diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 6d00a100b00..afa17dff3c3 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3118,7 +3118,7 @@ def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_param @pytest.mark.skipif( - os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 4fb53fc5199..3593030ecbd 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -664,7 +664,7 @@ def test_datasetdict_from_text_split(split, text_path, tmp_path): @pytest.mark.skipif( - os.name == "nt" and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), + os.name in ["nt", "posix"] and (os.getenv("CIRCLECI") == "true" or os.getenv("GITHUB_ACTIONS") == "true"), reason='On Windows CircleCI or GitHub Actions, it raises botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:5555/test"', ) # TODO: find what's wrong with CircleCI / GitHub Actions @require_s3 diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 0ffe453fcc1..23ca0c4e0af 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -226,21 +226,6 @@ def globalvars_mock2_side_effect(func, *args, **kwargs): self.assertEqual(hash1, hash2) -class TypeHintDumpTest(TestCase): - def test_dump_type_hint(self): - from typing import Union - - t1 = Union[str, None] # this type is not picklable in python 3.6 - # let's check that we can pickle it anyway using our pickler, even in 3.6 - hash1 = md5(datasets.utils.py_utils.dumps(t1)).hexdigest() - t2 = Union[str] # this type is picklable in python 3.6 - hash2 = md5(datasets.utils.py_utils.dumps(t2)).hexdigest() - t3 = Union[str, None] - hash3 = md5(datasets.utils.py_utils.dumps(t3)).hexdigest() - self.assertEqual(hash1, hash3) - self.assertNotEqual(hash1, hash2) - - class HashingTest(TestCase): def test_hash_simple(self): hash1 = Hasher.hash("hello")