Adding support for generic multi dimensional tensors and auxiliary image data for multimodal datasets (#363)

* add support for multi dim arrays

* format and add tests for array2d

* fix docs and style

* fix docs

* make both write_batch and write_on_file work for array2d

* refactor type inference to support multi dim arrays

* fix test

* remove lxmert dataset for now

* minor

* add tests + fix from_dict and pandas

* naming

* add typed sequence tests

* set disable_nullable to False, remove data_type in writer, add tests for writer

* better error messages for overflows

* quality

* add array3-5d

* add Array[3-5]D and use fixed length lists

* fix overflow error message

* use table with one column instead of loading everything

* allow -> disallow

* style

* style

Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
eltoto1219 and lhoestq committed Aug 24, 2020
1 parent 12a32b9 commit c93a19e
Showing 13 changed files with 1,124 additions and 254 deletions.
291 changes: 168 additions & 123 deletions datasets/guardian_authorship/guardian_authorship.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/source/features.rst
@@ -16,6 +16,5 @@ Here is a brief presentation of the various types of features which can be used
- a :class:`nlp.ClassLabel` feature specifies a field with a predefined set of classes which can have labels associated with them and will be stored as integers in the dataset. This field will be stored and retrieved as an integer value and two conversion methods, :func:`nlp.ClassLabel.str2int` and :func:`nlp.ClassLabel.int2str` can be used to convert from the label names to the associated integer value and vice versa.

- a :class:`nlp.Value` feature specifies a single typed value, e.g. ``int64`` or ``string``. The types supported are all the `non-nested types of Apache Arrow <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__ among which the most commonly used ones are ``int64``, ``float32`` and ``string``.
- :class:`nlp.Tensor` is mostly supported to have a compatibility layer with the TensorFlow Datasets library and can host a 0D or 1D array. A 0D array is equivalent to a :class:`nlp.Value` of the same dtype while a 1D array is equivalent to a :class:`nlp.Sequence` of the same dtype and fixed length.
- finally, two features are specific to Machine Translation: :class:`nlp.Translation` and :class:`nlp.TranslationVariableLanguages`. We refer to the package reference for more details on these features.
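As a point of reference, here is a minimal sketch of how the feature types described above are declared when defining a dataset; the column names are hypothetical and the snippet assumes the `nlp` API at the time of this commit:

import nlp

# Hypothetical feature declaration combining the types described above.
features = nlp.Features(
    {
        "text": nlp.Value("string"),
        "label": nlp.ClassLabel(names=["neg", "pos"]),
    }
)

# ClassLabel stores labels as integers; str2int/int2str convert both ways.
assert features["label"].str2int("pos") == 1
assert features["label"].int2str(0) == "neg"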

14 changes: 11 additions & 3 deletions docs/source/package_reference/main_classes.rst
@@ -53,15 +53,23 @@ It also has dataset transform methods like map or filter, to process all the splits
.. autoclass:: nlp.Value
:members:

.. autoclass:: nlp.Tensor
:members:

.. autoclass:: nlp.Translation
:members:

.. autoclass:: nlp.TranslationVariableLanguages
:members:

.. autoclass:: nlp.Array2D
:members:

.. autoclass:: nlp.Array3D
:members:

.. autoclass:: nlp.Array4D
:members:

.. autoclass:: nlp.Array5D
:members:
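For orientation, a sketch of how the new multi-dimensional features might be used. The shape/dtype constructor arguments follow later `datasets` releases; the exact signature at the time of this commit may differ:

import numpy as np
import nlp

# Hypothetical 2D column of fixed-size float32 matrices
# (constructor arguments are an assumption, see the note above).
features = nlp.Features({"matrix": nlp.Array2D(shape=(2, 3), dtype="float32")})
dset = nlp.Dataset.from_dict(
    {"matrix": [np.zeros((2, 3), dtype=np.float32), np.ones((2, 3), dtype=np.float32)]},
    features=features,
)
dset.set_format("numpy")
print(dset[0]["matrix"].shape)  # (2, 3)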

``MetricInfo``
~~~~~~~~~~~~~~~~~~~~~
13 changes: 12 additions & 1 deletion src/nlp/__init__.py
@@ -28,7 +28,18 @@
from .arrow_reader import ReadInstruction
from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .dataset_dict import DatasetDict
from .features import ClassLabel, Features, Sequence, Tensor, Translation, TranslationVariableLanguages, Value
from .features import (
Array2D,
Array3D,
Array4D,
Array5D,
ClassLabel,
Features,
Sequence,
Translation,
TranslationVariableLanguages,
Value,
)
from .info import DatasetInfo, MetricInfo
from .inspect import inspect_dataset, inspect_metric, list_datasets, list_metrics
from .load import concatenate_datasets, import_main_class, load_dataset, load_metric, prepare_module
53 changes: 33 additions & 20 deletions src/nlp/arrow_dataset.py
@@ -36,8 +36,8 @@

from nlp.utils.py_utils import dumps

from .arrow_writer import ArrowWriter
from .features import Features, cast_to_python_objects
from .arrow_writer import ArrowWriter, TypedSequence
from .features import Features, cast_to_python_objects, pandas_types_mapper
from .info import DatasetInfo
from .search import IndexableMixin
from .splits import NamedSplit
@@ -258,9 +258,11 @@ def from_dict(
mapping = features.encode_batch(mapping)
else:
mapping = cast_to_python_objects(mapping)
pa_table: pa.Table = pa.Table.from_pydict(
mapping=mapping, schema=pa.schema(features.type) if features is not None else None
)
mapping = {
col: TypedSequence(data, type=features.type[col].type if features is not None else None)
for col, data in mapping.items()
}
pa_table: pa.Table = pa.Table.from_pydict(mapping=mapping)
return cls(pa_table, info=info, split=split)

@property
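The change above relies on pyarrow's `__arrow_array__` protocol: `pa.Table.from_pydict` delegates the conversion of each column to the wrapping object, so a per-column target type can be enforced. A simplified stand-in for `TypedSequence`, for illustration only (not the actual implementation):

import pyarrow as pa

class SimpleTypedSequence:
    """Simplified stand-in for nlp's TypedSequence (illustration only)."""

    def __init__(self, data, type=None):
        self.data = data
        self.type = type

    def __arrow_array__(self, type=None):
        # pa.Table.from_pydict calls this hook, letting the wrapper choose the type.
        return pa.array(self.data, type=type if type is not None else self.type)

table = pa.Table.from_pydict({"col": SimpleTypedSequence([1, 2, 3], type=pa.int32())})
print(table.schema)  # col: int32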
@@ -624,12 +626,14 @@ def identity(x):
return x

command = identity
if isinstance(outputs, (list, tuple, np.ndarray)):
if isinstance(outputs, (list, tuple, np.ndarray, pd.Series)):
return command(outputs)
elif isinstance(outputs, pd.DataFrame):
if format_columns is not None and not output_all_columns:
to_remove_columns = [col for col in self.column_names if col not in format_columns]
output_dict = outputs.drop(to_remove_columns, axis=1)
else:
output_dict = outputs
else:
output_dict = {}
for k, v in outputs.items():
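In user-facing terms, the pandas branch above means that when the output is a DataFrame, only the requested format columns are kept unless `output_all_columns` is set. A hedged usage sketch, continuing the `dset` defined in the earlier Array2D sketch:

# With pandas formatting restricted to some columns, the remaining columns
# are dropped from the returned DataFrame (unless output_all_columns=True).
dset.set_format(type="pandas", columns=["matrix"])
df = dset[:2]        # pandas.DataFrame containing only the "matrix" column
dset.set_format()    # reset to plain python objects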
@@ -661,17 +665,20 @@ def _getitem(
"""
# In the following, to convert data from the arrow table to dicts or lists,
# we use .to_pandas().to_dict() or .to_pandas().to_list() as they are
# significantly faster than .to_pydict() thanks to zero-copy
# significantly faster than .to_pydict() thanks to zero-copy and because it doesn't
# call `list()` on every object in sequences of sequences of objects for example
if isinstance(key, int):
if key < 0:
key = self._data.num_rows + key
if key >= self._data.num_rows:
raise IndexError(f"Index ({key}) outside of table length ({self._data.num_rows}).")
if format_type is not None:
if format_type == "pandas":
outputs = self._data.slice(key, 1).to_pandas()
outputs = self._data.slice(key, 1).to_pandas(types_mapper=pandas_types_mapper)
else:
outputs = self._unnest(self._data.slice(key, 1).to_pandas().to_dict("list"))
outputs = self._unnest(
self._data.slice(key, 1).to_pandas(types_mapper=pandas_types_mapper).to_dict("list")
)
else:
outputs = self._unnest(self._data.slice(key, 1).to_pydict())
elif isinstance(key, slice):
@@ -681,12 +688,12 @@
if format_type is not None:
if format_type == "pandas":
outputs = self._data.slice(key_indices[0], key_indices[1] - key_indices[0]).to_pandas(
split_blocks=True
types_mapper=pandas_types_mapper
)
else:
outputs = (
self._data.slice(key_indices[0], key_indices[1] - key_indices[0])
.to_pandas(split_blocks=True)
.to_pandas(types_mapper=pandas_types_mapper)
.to_dict("list")
)
else:
@@ -695,15 +702,21 @@
if key not in self._data.column_names:
raise ValueError(f"Column ({key}) not in table columns ({self._data.column_names}).")
if format_type is not None:
# We should use
# outputs = self._data[key].to_pandas(types_mapper=pandas_types_mapper)
# but there is a bug in pyarrow that makes it ignore the types_mapper in that case
# see https://issues.apache.org/jira/browse/ARROW-9664
# We build a table with one column and call to_pandas on it instead
one_column_table = pa.Table.from_arrays(
[self._data[key]], schema=pa.schema([self._data.schema.field(key)])
)
if format_columns is None or key in format_columns:
if format_type == "pandas":
outputs = self._data[key].to_pandas(split_blocks=True)
elif format_type in ("numpy", "torch", "tensorflow"):
outputs = self._data.to_pandas(split_blocks=True).to_dict("list")[key]
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key]
else:
outputs = self._data[key].to_pylist()
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key].to_list()
else:
outputs = self._data[key].to_pylist()
outputs = one_column_table.to_pandas(types_mapper=pandas_types_mapper)[key].to_list()
else:
outputs = self._data[key].to_pylist()
elif isinstance(key, Iterable):
@@ -718,9 +731,9 @@
data_subset = pa.concat_tables(self._data.slice(int(i), 1) for i in indices)
if format_type is not None:
if format_type == "pandas":
outputs = data_subset.to_pandas(split_blocks=True)
outputs = data_subset.to_pandas(types_mapper=pandas_types_mapper)
else:
outputs = data_subset.to_pandas(split_blocks=True).to_dict("list")
outputs = data_subset.to_pandas(types_mapper=pandas_types_mapper).to_dict("list")
else:
outputs = data_subset.to_pydict()

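For background on the `types_mapper` argument used throughout `_getitem`: `pyarrow.Table.to_pandas` accepts a function mapping a pyarrow DataType to a pandas extension dtype, which is how `pandas_types_mapper` keeps the new array types intact during conversion. A self-contained sketch with a hypothetical mapper, including the one-column workaround for ARROW-9664 mentioned in the comment above:

import pandas as pd
import pyarrow as pa

# Hypothetical mapper: send pyarrow int64 columns to pandas' nullable Int64.
# nlp's pandas_types_mapper plays the same role for its extension array types.
def my_types_mapper(pa_type):
    if pa.types.is_int64(pa_type):
        return pd.Int64Dtype()
    return None  # fall back to the default conversion

table = pa.Table.from_pydict({"a": [1, None, 3]})
print(table.to_pandas(types_mapper=my_types_mapper).dtypes)  # a    Int64

# One-column workaround (ARROW-9664): converting a single column directly can
# ignore types_mapper, so build a one-column table and convert that instead.
one_column_table = pa.Table.from_arrays([table["a"]], schema=pa.schema([table.schema.field("a")]))
series = one_column_table.to_pandas(types_mapper=my_types_mapper)["a"]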
@@ -805,7 +818,7 @@ def map(
cache_file_name: Optional[str] = None,
writer_batch_size: Optional[int] = 1000,
features: Optional[Features] = None,
disable_nullable: bool = True,
disable_nullable: bool = False,
verbose: bool = True,
fn_kwargs: Optional[dict] = None,
) -> "Dataset":
@@ -836,7 +849,7 @@
Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
`features` (`Optional[nlp.Features]`, default: `None`): Use a specific Features to store the cache file
instead of the automatically generated one.
`disable_nullable` (`bool`, default: `True`): Allow null values in the table.
`disable_nullable` (`bool`, default: `False`): Disallow null values in the table.
`verbose` (`bool`, default: `True`): Set to `False` to deactivate the tqdm progress bar and informational messages.
`fn_kwargs` (`Optional[Dict]`, default: `None`): Keyword arguments to be passed to `function`
"""
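To make the new `disable_nullable` default concrete, a hedged, self-contained sketch (column names and values are assumptions): with `disable_nullable=False`, rows that produce null values can still be written to the cache table:

import nlp

# Illustrative only: the "text"/"length" columns are assumptions for this sketch.
dset = nlp.Dataset.from_dict({"text": ["a", "bb", None]})
processed = dset.map(
    lambda example: {"length": len(example["text"]) if example["text"] is not None else None}
)
print(processed[1])  # {'text': 'bb', 'length': 2}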
