huggingface · albertvillanova · Oct 13, 2021 · May 5, 2021 · May 5, 2021 · May 5, 2021
diff --git a/docs/source/package_reference/main_classes.rst b/docs/source/package_reference/main_classes.rst
@@ -117,6 +117,9 @@ Dictionary with split names as keys ('train', 'test' for example), and :obj:`dat
 .. autoclass:: datasets.Array5D
     :members:
 
+.. autoclass:: datasets.Audio
+    :members:
+
 ``MetricInfo``
 ~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/setup.py b/setup.py
@@ -107,6 +107,10 @@
     "packaging",
 ]
 
+AUDIO_REQUIRE = [
+    "librosa",
+]
+
 BENCHMARKS_REQUIRE = [
     "numpy==1.18.5",
     "tensorflow==2.3.0",
@@ -118,6 +122,7 @@
     # test dependencies
     "absl-py",
     "pytest",
+    "pytest-datadir",
     "pytest-xdist",
     # optional dependencies
     "apache-beam>=2.26.0",
@@ -179,11 +184,13 @@
         ]
     )
 
+TESTS_REQUIRE += AUDIO_REQUIRE
 
 QUALITY_REQUIRE = ["black==21.4b0", "flake8==3.7.9", "isort", "pyyaml>=5.3.1"]
 
 
 EXTRAS_REQUIRE = {
+    "audio": AUDIO_REQUIRE,
     "apache-beam": ["apache-beam>=2.26.0"],
     "tensorflow": ["tensorflow>=2.2.0"],
     "tensorflow_gpu": ["tensorflow-gpu>=2.2.0"],

diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
@@ -42,6 +42,7 @@
     Array3D,
     Array4D,
     Array5D,
+    Audio,
     ClassLabel,
     Features,
     Sequence,

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -1598,7 +1598,7 @@ def set_format(
 
         # Check that the format_type and format_kwargs are valid and make it possible to have a Formatter
         type = get_format_type_from_alias(type)
-        _ = get_formatter(type, **format_kwargs)
+        _ = get_formatter(type, features=self.features, **format_kwargs)
 
         # Check filter column
         if isinstance(columns, str):
@@ -1765,7 +1765,7 @@ def _getitem(
         Can be used to index columns (by string names) or rows (by integer index, slices, or iter of indices or bools)
         """
         format_kwargs = format_kwargs if format_kwargs is not None else {}
-        formatter = get_formatter(format_type, **format_kwargs)
+        formatter = get_formatter(format_type, features=self.features, **format_kwargs)
         pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
         formatted_output = format_table(
             pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns

diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py
@@ -0,0 +1,11 @@
+# flake8: noqa
+from .audio import Audio
+from .features import *
+from .features import (
+    _ArrayXD,
+    _ArrayXDExtensionType,
+    _arrow_to_datasets_dtype,
+    _cast_to_python_objects,
+    _is_zero_copy_only,
+)
+from .translation import Translation, TranslationVariableLanguages
diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass, field
+from typing import Any, ClassVar, Optional
+
+import pyarrow as pa
+
+
+@dataclass(frozen=True)
+class Audio:
+    """Audio Feature to extract audio data from an audio file.
+
+    Args:
+        sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used.
+        mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across channels.
+    """
+
+    sampling_rate: Optional[int] = None
+    mono: bool = True
+    id: Optional[str] = None
+    # Automatically constructed
+    dtype: ClassVar[str] = "dict"
+    pa_type: ClassVar[Any] = None
+    _type: str = field(default="Audio", init=False, repr=False)
+
+    def __call__(self):  # returns the pyarrow string type
+        return pa.string()
+
+    def decode_example(self, value):
+        """Decode example audio file into audio data.
+
+        Args:
+            value: Audio file path.
+
+        Returns:
+            :obj:`dict` with keys "path", "array" and "sampling_rate", or ``value`` unchanged if `librosa` is not installed.
+        """
+        try:
+            import librosa
+        except ImportError:
+            return value  # best-effort: without the optional dependency, hand the path back undecoded
+
+        with open(value, "rb") as f:
+            array, sample_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
+        return {"path": value, "array": array, "sampling_rate": sample_rate}
diff --git a/src/datasets/features.py → src/datasets/features/features.py b/src/datasets/features.py → src/datasets/features/features.py
@@ -35,8 +35,10 @@
 from pyarrow.lib import TimestampType
 from pyarrow.types import is_boolean, is_primitive
 
-from . import config, utils
-from .utils.logging import get_logger
+from datasets import config, utils
+from datasets.features.audio import Audio
+from datasets.features.translation import Translation, TranslationVariableLanguages
+from datasets.utils.logging import get_logger
 
 
 logger = get_logger(__name__)
@@ -700,119 +702,6 @@ def _load_names_from_file(names_filepath):
             return [name.strip() for name in f.read().split("\n") if name.strip()]  # Filter empty names
 
 
-@dataclass
-class Translation:
-    """`FeatureConnector` for translations with fixed languages per example.
-    Here for compatiblity with tfds.
-
-    Input: The Translate feature accepts a dictionary for each example mapping
-        string language codes to string translations.
-
-    Output: A dictionary mapping string language codes to translations as `Text`
-        features.
-
-    Example::
-
-        # At construction time:
-
-        datasets.features.Translation(languages=['en', 'fr', 'de'])
-
-        # During data generation:
-
-        yield {
-                'en': 'the cat',
-                'fr': 'le chat',
-                'de': 'die katze'
-        }
-    """
-
-    languages: List[str]
-    id: Optional[str] = None
-    # Automatically constructed
-    dtype: ClassVar[str] = "dict"
-    pa_type: ClassVar[Any] = None
-    _type: str = field(default="Translation", init=False, repr=False)
-
-    def __call__(self):
-        return pa.struct({lang: pa.string() for lang in sorted(self.languages)})
-
-
-@dataclass
-class TranslationVariableLanguages:
-    """`FeatureConnector` for translations with variable languages per example.
-    Here for compatiblity with tfds.
-
-    Input: The TranslationVariableLanguages feature accepts a dictionary for each
-        example mapping string language codes to one or more string translations.
-        The languages present may vary from example to example.
-
-    Output:
-        language: variable-length 1D tf.Tensor of tf.string language codes, sorted
-            in ascending order.
-        translation: variable-length 1D tf.Tensor of tf.string plain text
-            translations, sorted to align with language codes.
-
-    Example::
-
-        # At construction time:
-
-        datasets.features.Translation(languages=['en', 'fr', 'de'])
-
-        # During data generation:
-
-        yield {
-                'en': 'the cat',
-                'fr': ['le chat', 'la chatte,']
-                'de': 'die katze'
-        }
-
-        # Tensor returned :
-
-        {
-                'language': ['en', 'de', 'fr', 'fr'],
-                'translation': ['the cat', 'die katze', 'la chatte', 'le chat'],
-        }
-    """
-
-    languages: Optional[List] = None
-    num_languages: Optional[int] = None
-    id: Optional[str] = None
-    # Automatically constructed
-    dtype: ClassVar[str] = "dict"
-    pa_type: ClassVar[Any] = None
-    _type: str = field(default="TranslationVariableLanguages", init=False, repr=False)
-
-    def __post_init__(self):
-        self.languages = list(sorted(list(set(self.languages)))) if self.languages else None
-        self.num_languages = len(self.languages) if self.languages else None
-
-    def __call__(self):
-        return pa.struct({"language": pa.list_(pa.string()), "translation": pa.list_(pa.string())})
-
-    def encode_example(self, translation_dict):
-        lang_set = set(self.languages)
-        if self.languages and set(translation_dict) - lang_set:
-            raise ValueError(
-                "Some languages in example ({0}) are not in valid set ({1}).".format(
-                    ", ".join(sorted(set(translation_dict) - lang_set)), ", ".join(lang_set)
-                )
-            )
-
-        # Convert dictionary into tuples, splitting out cases where there are
-        # multiple translations for a single language.
-        translation_tuples = []
-        for lang, text in translation_dict.items():
-            if isinstance(text, str):
-                translation_tuples.append((lang, text))
-            else:
-                translation_tuples.extend([(lang, el) for el in text])
-
-        # Ensure translations are in ascending order by language code.
-        languages, translations = zip(*sorted(translation_tuples))
-
-        return {"language": languages, "translation": translations}
-
-
 @dataclass
 class Sequence:
     """Construct a list of feature from a single type or a dict of types.
@@ -841,6 +730,7 @@ class Sequence:
     Array3D,
     Array4D,
     Array5D,
+    Audio,
 ]
 
 
@@ -915,6 +805,20 @@ def encode_nested_example(schema, obj):
     return obj
 
 
+def decode_nested_example(feature, example):
+    """Decode ``example`` along ``feature``: dicts are walked, Audio values decoded, the rest kept as is."""
+    if isinstance(feature, Audio):
+        return feature.decode_example(example)
+    if not isinstance(feature, dict):
+        return example
+    # Restrict the schema to the columns actually present in this example before pairing them up.
+    present = {key: value for key, value in feature.items() if key in example}
+    return {
+        col: decode_nested_example(col_feature, col_example)
+        for col, (col_feature, col_example) in utils.zip_dict(present, example)
+    }
+
+
 def generate_from_dict(obj: Any):
     """Regenerate the nested feature object from a deserialized dict.
     We use the '_type' fields to get the dataclass name to load.
@@ -1080,6 +984,31 @@ def encode_batch(self, batch):
             encoded_batch[key] = [encode_nested_example(self[key], obj) for obj in column]
         return encoded_batch
 
+    def decode_example(self, example):
+        """Decode one dataset row by delegating to :func:`decode_nested_example` with these features.
+
+        Args:
+            example (:obj:`dict[str, Any]`): Dataset row data.
+
+        Returns:
+            :obj:`dict[str, Any]`: the value produced by :func:`decode_nested_example`.
+        """
+        return decode_nested_example(self, example)
+
+    def decode_batch(self, batch):
+        """Apply custom feature decoding to every value of every column in a batch.
+
+        Args:
+            batch (:obj:`dict[str, list[Any]]`): Dataset batch data.
+
+        Returns:
+            :obj:`dict[str, list[Any]]`
+        """
+        return {
+            column_name: [decode_nested_example(self[column_name], value) for value in values]
+            for column_name, values in batch.items()
+        }
+
     def copy(self) -> "Features":
         """
         Make a deep copy of Features.