Adding support for generic multi dimensional tensors and auxiliary image data for multimodal datasets (#363)

* add support for multi dim arrays

* format and add tests for array2d

* fix docs and style

* fix docs

* make both write_batch and write_on_file work for array2d

* refactor type inference to support multi dim arrays

* fix test

* remove lxmert dataset for now

* minor

* add tests + fix from_dict and pandas

* naming

* add typed sequence tests

* set disable_nullable to False, remove data_type in writer, add tests for writer

* better error messages for overflows

* quality

* add array3-5d

* add Array[3-5]D and use fixed length lists

* fix overflow error message

* use table with one column instead of loading everything

* allow -> disallow

* style

* style

Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
eltoto1219 and lhoestq committed Aug 24, 2020
1 parent 12a32b9 commit c93a19e
Showing 13 changed files with 1,124 additions and 254 deletions.
291 changes: 168 additions & 123 deletions datasets/guardian_authorship/guardian_authorship.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/source/features.rst
@@ -16,6 +16,5 @@ Here is a brief presentation of the various types of features which can be used
- a :class:`nlp.ClassLabel` feature specifies a field with a predefined set of classes which can have labels associated with them and will be stored as integers in the dataset. This field will be stored and retrieved as an integer value and two conversion methods, :func:`nlp.ClassLabel.str2int` and :func:`nlp.ClassLabel.int2str` can be used to convert from the label names to the associated integer value and vice versa.

- a :class:`nlp.Value` feature specifies a single typed value, e.g. ``int64`` or ``string``. The types supported are all the `non-nested types of Apache Arrow <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__ among which the most commonly used ones are ``int64``, ``float32`` and ``string``.
- :class:`nlp.Tensor` is mostly supported to have a compatibility layer with the TensorFlow Datasets library and can host a 0D or 1D array. A 0D array is equivalent to a :class:`nlp.Value` of the same dtype while a 1D array is equivalent to a :class:`nlp.Sequence` of the same dtype and fixed length.
- finally, two features are specific to Machine Translation: :class:`nlp.Translation` and :class:`nlp.TranslationVariableLanguages`. We refer to the package reference for more details on these features.
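As a point of reference, here is a minimal sketch of how the feature types described above are declared when defining a dataset; the column names are hypothetical and the snippet assumes the `nlp` API at the time of this commit:

import nlp

# Hypothetical feature declaration combining the types described above.
features = nlp.Features(
    {
        "text": nlp.Value("string"),
        "label": nlp.ClassLabel(names=["neg", "pos"]),
    }
)

# ClassLabel stores labels as integers; str2int/int2str convert both ways.
assert features["label"].str2int("pos") == 1
assert features["label"].int2str(0) == "neg"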

14 changes: 11 additions & 3 deletions docs/source/package_reference/main_classes.rst
@@ -53,15 +53,23 @@ It also has dataset transform methods like map or filter, to process all the splits
.. autoclass:: nlp.Value
:members:

.. autoclass:: nlp.Tensor
:members:

.. autoclass:: nlp.Translation
:members:

.. autoclass:: nlp.TranslationVariableLanguages
:members:

.. autoclass:: nlp.Array2D
:members:

.. autoclass:: nlp.Array3D
:members:

.. autoclass:: nlp.Array4D
:members:

.. autoclass:: nlp.Array5D
:members:
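For orientation, a sketch of how the new multi-dimensional features might be used. The shape/dtype constructor arguments follow later `datasets` releases; the exact signature at the time of this commit may differ:

import numpy as np
import nlp

# Hypothetical 2D column of fixed-size float32 matrices
# (constructor arguments are an assumption, see the note above).
features = nlp.Features({"matrix": nlp.Array2D(shape=(2, 3), dtype="float32")})
dset = nlp.Dataset.from_dict(
    {"matrix": [np.zeros((2, 3), dtype=np.float32), np.ones((2, 3), dtype=np.float32)]},
    features=features,
)
dset.set_format("numpy")
print(dset[0]["matrix"].shape)  # (2, 3)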

``MetricInfo``
~~~~~~~~~~~~~~~~~~~~~
13 changes: 12 additions & 1 deletion src/nlp/__init__.py
@@ -28,7 +28,18 @@
from .arrow_reader import ReadInstruction
from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .dataset_dict import DatasetDict
from .features import ClassLabel, Features, Sequence, Tensor, Translation, TranslationVariableLanguages, Value
from .features import (
Array2D,
Array3D,
Array4D,
Array5D,
ClassLabel,
Features,
Sequence,
Translation,
TranslationVariableLanguages,
Value,
)
from .info import DatasetInfo, MetricInfo
from .inspect import inspect_dataset, inspect_metric, list_datasets, list_metrics
from .load import concatenate_datasets, import_main_class, load_dataset, load_metric, prepare_module
53 changes: 33 additions & 20 deletions src/nlp/arrow_dataset.py
@@ -36,8 +36,8 @@

from nlp.utils.py_utils import dumps

from .arrow_writer import ArrowWriter
from .features import Features, cast_to_python_objects
from .arrow_writer import ArrowWriter, TypedSequence
from .features import Features, cast_to_python_objects, pandas_types_mapper
from .info import DatasetInfo
from .search import IndexableMixin
from .splits import NamedSplit
@@ -258,9 +258,11 @@ def from_dict(
mapping = features.encode_batch(mapping)
else:
mapping = cast_to_python_objects(mapping)
pa_table: pa.Table = pa.Table.from_pydict(
mapping=mapping, schema=pa.schema(features.type) if features is not None else None
)
mapping = {
col: TypedSequence(data, type=features.type[col].type if features is not None else None)
for col, data in mapping.items()
}
pa_table: pa.Table = pa.Table.from_pydict(mapping=mapping)
return cls(pa_table, info=info, split=split)

@property
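The change above relies on pyarrow's `__arrow_array__` protocol: `pa.Table.from_pydict` delegates the conversion of each column to the wrapping object, so a per-column target type can be enforced. A simplified stand-in for `TypedSequence`, for illustration only (not the actual implementation):

import pyarrow as pa

class SimpleTypedSequence:
    """Simplified stand-in for nlp's TypedSequence (illustration only)."""

    def __init__(self, data, type=None):
        self.data = data
        self.type = type

    def __arrow_array__(self, type=None):
        # pa.Table.from_pydict calls this hook, letting the wrapper choose the type.
        return pa.array(self.data, type=type if type is not None else self.type)

table = pa.Table.from_pydict({"col": SimpleTypedSequence([1, 2, 3], type=pa.int32())})
print(table.schema)  # col: int32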
@@ -624,12 +626,14 @@ def identity(x):
return x

command = identity
if isinstance(outputs, (list, tuple, np.ndarray)):
if isinstance(outputs, (list, tuple, np.ndarray, pd.Series)):
return command(outputs)
elif isinstance(outputs, pd.DataFrame):
if format_columns is not None and not output_all_columns:
to_remove_columns = [col for col in self.column_names if col not in format_columns]
output_dict = outputs.drop(to_remove_columns, axis=1)
else:
output_dict = outputs
else:
output_dict = {}
for k, v in outputs.items():
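In user-facing terms, the pandas branch above means that when the output is a DataFrame, only the requested format columns are kept unless `output_all_columns` is set. A hedged usage sketch, continuing the `dset` defined in the earlier Array2D sketch:

# With pandas formatting restricted to some columns, the remaining columns
# are dropped from the returned DataFrame (unless output_all_columns=True).
dset.set_format(type="pandas", columns=["matrix"])
df = dset[:2]        # pandas.DataFrame containing only the "matrix" column
dset.set_format()    # reset to plain python objects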
@@ -661,17 +665,20 @@ def _getitem(
"""
# In the following, to convert data from the arrow table to dicts or lists,
# we use .to_pandas().to_dict() or .to_pandas().to_list() as they are
# significantly faster than .to_pydict() thanks to zero-copy
# significantly faster than .to_pydict() thanks to zero-copy and because it doesn't
# call `list()` on every object in sequences of sequences of objects for example
if isinstance(key, int):
if key < 0:
key = self._data.num_rows + key
if key >= self._data.num_rows:
raise IndexError(f"Index ({key}) outside of table length ({self._data.num_rows}).")
if format_type is not None:
if format_type == "pandas":
outputs = self._data.slice(key, 1).to_pandas()
outputs = self._data.slice(key, 1).to_pandas(types_mapper=pandas_types_mapper)
else:
outputs = self._unnest(self._data.slice(key, 1).to_pandas().to_dict("list"))
outputs = self._unnest(
self._data.slice(key, 1).to_pandas(types_mapper=pandas_types_mapper).to_dict("list")
)
else:
outputs = self._unnest(self._data.slice(key, 1).to_pydict())
elif isinstance(key, slice):
@@ -681,12 +688,12 @@
if format_type is not None:
if format_type == "pandas":
outputs = self._data.slice(key_indices[0], key_indices[1] - key_indices[0]).to_pandas(
split_blocks=True
types_mapper=pandas_types_mapper
)
else:
outputs = (
self._data.slice(key_indices[0], key_indices[1] - key_indices[0])
.to_pandas(split_blocks=True)
.to_pandas(types_mapper=pandas_types_mapper)
.to_dict("list")
)
else:
@@ -695,15 +702,21 @@
if key not in self._data.column_names:
raise ValueError(f"Column ({key}) not in table columns ({self._data.column_names}).")
if format_type is not None:
# We should use
# outputs = self._data[key].to_pandas(types_mapper=pandas_types_mapper)
# but there is a bug in pyarrow that makes it ignore the types_mapper in that case
# see https://issues.apache.org/jira/browse/ARROW-9664
# We build a table with one column and call to_pandas on it instead
one_column_table = pa.Table.from_arrays(
[self._data[key]], schema=pa.schema([self._data.schema.field(key)])
)
if format_columns is None or key in format_columns:
if format_type == "pandas":
outputs = self._data[key].to_pandas(split_blocks=True)
elif format_type in ("numpy", "torch", "tensorflow"):
outputs = self._data.to_pandas(split_blocks=True).to_dict("list")[key]
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key]
else:
outputs = self._data[key].to_pylist()
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key].to_list()
else:
outputs = self._data[key].to_pylist()
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key].to_list()
else:
outputs = self._data[key].to_pylist()
elif isinstance(key, Iterable):
@@ -718,9 +731,9 @@
data_subset = pa.concat_tables(self._data.slice(int(i), 1) for i in indices)
if format_type is not None:
if format_type == "pandas":
outputs = data_subset.to_pandas(split_blocks=True)
outputs = data_subset.to_pandas(types_mapper=pandas_types_mapper)
else:
outputs = data_subset.to_pandas(split_blocks=True).to_dict("list")
outputs = data_subset.to_pandas(types_mapper=pandas_types_mapper).to_dict("list")
else:
outputs = data_subset.to_pydict()

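For background on the `types_mapper` argument used throughout `_getitem`: `pyarrow.Table.to_pandas` accepts a function mapping a pyarrow DataType to a pandas extension dtype, which is how `pandas_types_mapper` keeps the new array types intact during conversion. A self-contained sketch with a hypothetical mapper, including the one-column workaround for ARROW-9664 mentioned in the comment above:

import pandas as pd
import pyarrow as pa

# Hypothetical mapper: send pyarrow int64 columns to pandas' nullable Int64.
# nlp's pandas_types_mapper plays the same role for its extension array types.
def my_types_mapper(pa_type):
    if pa.types.is_int64(pa_type):
        return pd.Int64Dtype()
    return None  # fall back to the default conversion

table = pa.Table.from_pydict({"a": [1, None, 3]})
print(table.to_pandas(types_mapper=my_types_mapper).dtypes)  # a    Int64

# One-column workaround (ARROW-9664): converting a single column directly can
# ignore types_mapper, so build a one-column table and convert that instead.
one_column_table = pa.Table.from_arrays([table["a"]], schema=pa.schema([table.schema.field("a")]))
series = one_column_table.to_pandas(types_mapper=my_types_mapper)["a"]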
@@ -805,7 +818,7 @@ def map(
cache_file_name: Optional[str] = None,
writer_batch_size: Optional[int] = 1000,
features: Optional[Features] = None,
disable_nullable: bool = True,
disable_nullable: bool = False,
verbose: bool = True,
fn_kwargs: Optional[dict] = None,
) -> "Dataset":
@@ -836,7 +849,7 @@
Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
`features` (`Optional[nlp.Features]`, default: `None`): Use a specific Features to store the cache file
instead of the automatically generated one.
`disable_nullable` (`bool`, default: `True`): Allow null values in the table.
`disable_nullable` (`bool`, default: `False`): Disallow null values in the table.
`verbose` (`bool`, default: `True`): Set to `False` to deactivate the tqdm progress bar and informational messages.
`fn_kwargs` (`Optional[Dict]`, default: `None`): Keyword arguments to be passed to `function`
"""
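To make the new `disable_nullable` default concrete, a hedged, self-contained sketch (column names and values are assumptions): with `disable_nullable=False`, rows that produce null values can still be written to the cache table:

import nlp

# Illustrative only: the "text"/"length" columns are assumptions for this sketch.
dset = nlp.Dataset.from_dict({"text": ["a", "bb", None]})
processed = dset.map(
    lambda example: {"length": len(example["text"]) if example["text"] is not None else None}
)
print(processed[1])  # {'text': 'bb', 'length': 2}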
