More robust first elem check in encode/cast example (#3402)
* More robust first elem check in encode/cast example

* Type hints and better docstring
mariosasko committed Dec 8, 2021
1 parent 49bb250 commit 18e0adf
Showing 2 changed files with 45 additions and 6 deletions.
34 changes: 28 additions & 6 deletions src/datasets/features/features.py
@@ -152,7 +152,7 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, boo
Cast pytorch/tensorflow/pandas objects to python numpy array/lists.
It works recursively.
To avoid iterating over possibly long lists, it first checks if the first element that is not None has to be casted.
To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be casted.
If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.
This trick allows to cast objects that contain tokenizers outputs without iterating over every single token for example.
@@ -221,7 +221,7 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> Tuple[Any, boo
elif isinstance(obj, (list, tuple)):
if len(obj) > 0:
for first_elmt in obj:
if first_elmt is not None:
if _check_non_null_non_empty_recursive(first_elmt):
break
casted_first_elmt, has_changed_first_elmt = _cast_to_python_objects(
first_elmt, only_1d_for_numpy=only_1d_for_numpy
@@ -244,7 +244,7 @@ def cast_to_python_objects(obj: Any, only_1d_for_numpy=False) -> Any:
Cast numpy/pytorch/tensorflow/pandas objects to python lists.
It works recursively.
To avoid iterating over possibly long lists, it first checks if the first element that is not None has to be casted.
To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be casted.
If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.
This trick allows to cast objects that contain tokenizers outputs without iterating over every single token for example.
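The updated check is easiest to see with a nested input whose first inner list is empty. The sketch below is not part of the commit; it assumes the `datasets` code at this revision, uses the public `cast_to_python_objects` whose docstring is shown above, and the outputs in comments are expected results rather than captured ones.

```python
import numpy as np

from datasets.features.features import cast_to_python_objects

# The first inner list is empty. The old check ("first element that is not
# None") stopped at [] and saw nothing to cast, so the numpy array further
# down could be left untouched. The recursive check skips the empty list,
# reaches np.array([1, 2]), and triggers casting of the whole column.
batch = {"x": [[], [np.array([1, 2])]]}
print(cast_to_python_objects(batch))
# expected: {'x': [[], [[1, 2]]]}
```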
@@ -774,6 +774,28 @@ class Sequence:
]


def _check_non_null_non_empty_recursive(obj, schema: Optional[FeatureType] = None) -> bool:
"""
Check if the object is not None.
If the object is a list or a tuple, recursively check the first element of the sequence and stop if at any point the first element is not a sequence or is an empty sequence.
"""
if obj is None:
return False
elif isinstance(obj, (list, tuple)) and (schema is None or isinstance(schema, (list, tuple, Sequence))):
if len(obj) > 0:
if schema is None:
pass
elif isinstance(schema, (list, tuple)):
schema = schema[0]
else:
schema = schema.feature
return _check_non_null_non_empty_recursive(obj[0], schema)
else:
return False
else:
return True


def get_nested_type(schema: FeatureType) -> pa.DataType:
"""
get_nested_type() converts a datasets.FeatureType into a pyarrow.DataType, and acts as the inverse of
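To make the helper's contract concrete, here is a small usage sketch. It is not part of the commit; the return values are what the definition above implies, and importing the private helper is only for illustration.

```python
from datasets import ClassLabel, Sequence
from datasets.features.features import _check_non_null_non_empty_recursive

_check_non_null_non_empty_recursive(None)          # False: None
_check_non_null_non_empty_recursive([])            # False: empty sequence
_check_non_null_non_empty_recursive([[], ["a"]])   # False: first element is an empty sequence
_check_non_null_non_empty_recursive("a")           # True: non-None, non-sequence value
_check_non_null_non_empty_recursive([["a"]])       # True: recursion reaches "a"

# With a schema, the recursion only descends while the schema itself is a
# list, tuple or Sequence; here it unwraps the Sequence once and reaches "a".
_check_non_null_non_empty_recursive(["a"], Sequence(ClassLabel(names=["a", "b"])))  # True
```

In the scan loops above and below, a False result simply moves the scan on to the next element, so a None or empty leading element no longer decides whether the whole list gets cast or encoded.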
@@ -810,7 +832,7 @@ def encode_nested_example(schema, obj):
"""Encode a nested example.
This is used since some features (in particular ClassLabel) have some logic during encoding.
To avoid iterating over possibly long lists, it first checks if the first element that is not None has to be encoded.
To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be encoded.
If the first element needs to be encoded, then all the elements of the list will be encoded, otherwise they'll stay the same.
"""
# Nested structures: we allow dict, list/tuples, sequences
Expand All @@ -825,7 +847,7 @@ def encode_nested_example(schema, obj):
else:
if len(obj) > 0:
for first_elmt in obj:
if first_elmt is not None:
if _check_non_null_non_empty_recursive(first_elmt, sub_schema):
break
if encode_nested_example(sub_schema, first_elmt) != first_elmt:
return [encode_nested_example(sub_schema, o) for o in obj]
@@ -853,7 +875,7 @@ def encode_nested_example(schema, obj):
else:
if len(obj) > 0:
for first_elmt in obj:
if first_elmt is not None:
if _check_non_null_non_empty_recursive(first_elmt, schema.feature):
break
# be careful when comparing tensors here
if not isinstance(first_elmt, list) or encode_nested_example(schema.feature, first_elmt) != first_elmt:
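Taken together, these hunks mean that a leading empty sequence no longer short-circuits the first-element scan in encode_nested_example. A hedged sketch of the effect, calling the private function directly; the expected values mirror the new test below.

```python
from datasets import ClassLabel, Sequence
from datasets.features.features import encode_nested_example

schema = Sequence(Sequence(ClassLabel(names=["a", "b"])))

# The scan skips the empty inner list, inspects ["b"], sees that it needs
# encoding, and therefore encodes every element of the outer list.
print(encode_nested_example(schema, [[], ["b"]]))     # expected: [[], [1]]
print(encode_nested_example(schema, [["a"], ["b"]]))  # expected: [[0], [1]]
```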
17 changes: 17 additions & 0 deletions tests/features/test_features.py
@@ -244,6 +244,23 @@ def test_encode_nested_example_sequence_with_none():
assert result is None


def test_encode_batch_with_example_with_empty_first_elem():
features = Features(
{
"x": Sequence(Sequence(ClassLabel(names=["a", "b"]))),
}
)
encoded_batch = features.encode_batch(
{
"x": [
[["a"], ["b"]],
[[], ["b"]],
]
}
)
assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]}


def iternumpy(key1, value1, value2):
if value1.dtype != value2.dtype: # check only for dtype
raise AssertionError(
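The same behaviour is also reachable through the public Features API; a short sketch, not part of the commit, with the expected output mirroring the second row of the batch in the test above.

```python
from datasets import ClassLabel, Features, Sequence

features = Features({"x": Sequence(Sequence(ClassLabel(names=["a", "b"])))})
print(features.encode_example({"x": [[], ["b"]]}))
# expected: {'x': [[], [1]]}
```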

1 comment on commit 18e0adf

@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.064456 / 0.011353 (0.053103) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003706 / 0.011008 (-0.007302) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.028371 / 0.038508 (-0.010137) |
| read_batch_unformated after write_array2d | 0.032020 / 0.023109 (0.008911) |
| read_batch_unformated after write_flattened_sequence | 0.258883 / 0.275898 (-0.017015) |
| read_batch_unformated after write_nested_sequence | 0.302997 / 0.323480 (-0.020483) |
| read_col_formatted_as_numpy after write_array2d | 0.081466 / 0.007986 (0.073480) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004561 / 0.004328 (0.000233) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008247 / 0.004250 (0.003996) |
| read_col_unformated after write_array2d | 0.035508 / 0.037052 (-0.001544) |
| read_col_unformated after write_flattened_sequence | 0.257490 / 0.258489 (-0.000999) |
| read_col_unformated after write_nested_sequence | 0.298945 / 0.293841 (0.005104) |
| read_formatted_as_numpy after write_array2d | 0.076406 / 0.128546 (-0.052141) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007936 / 0.075646 (-0.067710) |
| read_formatted_as_numpy after write_nested_sequence | 0.228691 / 0.419271 (-0.190581) |
| read_unformated after write_array2d | 0.042359 / 0.043533 (-0.001174) |
| read_unformated after write_flattened_sequence | 0.258026 / 0.255139 (0.002887) |
| read_unformated after write_nested_sequence | 0.281574 / 0.283200 (-0.001626) |
| write_array2d | 0.074387 / 0.141683 (-0.067296) |
| write_flattened_sequence | 1.506933 / 1.452155 (0.054778) |
| write_nested_sequence | 1.563466 / 1.492716 (0.070749) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.226802 / 0.018006 (0.208795) |
| get_batch_of_1024_rows | 0.443700 / 0.000490 (0.443210) |
| get_first_row | 0.002440 / 0.000200 (0.002240) |
| get_last_row | 0.000076 / 0.000054 (0.000021) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.033079 / 0.037411 (-0.004332) |
| shard | 0.020973 / 0.014526 (0.006448) |
| shuffle | 0.028437 / 0.176557 (-0.148119) |
| sort | 0.175394 / 0.737135 (-0.561741) |
| train_test_split | 0.029998 / 0.296338 (-0.266341) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.387787 / 0.215209 (0.172578) |
| read 50000 | 3.871218 / 2.077655 (1.793563) |
| read_batch 50000 10 | 1.701658 / 1.504120 (0.197538) |
| read_batch 50000 100 | 1.540654 / 1.541195 (-0.000540) |
| read_batch 50000 1000 | 1.619225 / 1.468490 (0.150735) |
| read_formatted numpy 5000 | 0.373044 / 4.584777 (-4.211732) |
| read_formatted pandas 5000 | 4.238523 / 3.745712 (0.492811) |
| read_formatted tensorflow 5000 | 3.350717 / 5.269862 (-1.919144) |
| read_formatted torch 5000 | 0.789873 / 4.565676 (-3.775804) |
| read_formatted_batch numpy 5000 10 | 0.045448 / 0.424275 (-0.378827) |
| read_formatted_batch numpy 5000 1000 | 0.010192 / 0.007607 (0.002585) |
| shuffled read 5000 | 0.488416 / 0.226044 (0.262371) |
| shuffled read 50000 | 4.847399 / 2.268929 (2.578470) |
| shuffled read_batch 50000 10 | 2.099292 / 55.444624 (-53.345333) |
| shuffled read_batch 50000 100 | 1.737064 / 6.876477 (-5.139413) |
| shuffled read_batch 50000 1000 | 1.867369 / 2.142072 (-0.274704) |
| shuffled read_formatted numpy 5000 | 0.482872 / 4.805227 (-4.322356) |
| shuffled read_formatted_batch numpy 5000 10 | 0.104030 / 6.500664 (-6.396634) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.052769 / 0.075469 (-0.022700) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.350020 / 1.841788 (-0.491768) |
| map fast-tokenizer batched | 11.507824 / 8.074308 (3.433515) |
| map identity | 23.641922 / 10.191392 (13.450530) |
| map identity batched | 0.624974 / 0.680424 (-0.055449) |
| map no-op batched | 0.450480 / 0.534201 (-0.083721) |
| map no-op batched numpy | 0.330439 / 0.579283 (-0.248845) |
| map no-op batched pandas | 0.462275 / 0.434364 (0.027911) |
| map no-op batched pytorch | 0.226467 / 0.540337 (-0.313870) |
| map no-op batched tensorflow | 0.235522 / 1.386936 (-1.151414) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.062808 / 0.011353 (0.051455) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003815 / 0.011008 (-0.007194) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.026578 / 0.038508 (-0.011930) |
| read_batch_unformated after write_array2d | 0.031201 / 0.023109 (0.008091) |
| read_batch_unformated after write_flattened_sequence | 0.258658 / 0.275898 (-0.017240) |
| read_batch_unformated after write_nested_sequence | 0.297484 / 0.323480 (-0.025996) |
| read_col_formatted_as_numpy after write_array2d | 0.079804 / 0.007986 (0.071819) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004556 / 0.004328 (0.000227) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.006631 / 0.004250 (0.002380) |
| read_col_unformated after write_array2d | 0.042329 / 0.037052 (0.005276) |
| read_col_unformated after write_flattened_sequence | 0.254725 / 0.258489 (-0.003764) |
| read_col_unformated after write_nested_sequence | 0.296405 / 0.293841 (0.002564) |
| read_formatted_as_numpy after write_array2d | 0.075850 / 0.128546 (-0.052696) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007973 / 0.075646 (-0.067673) |
| read_formatted_as_numpy after write_nested_sequence | 0.224582 / 0.419271 (-0.194690) |
| read_unformated after write_array2d | 0.040891 / 0.043533 (-0.002642) |
| read_unformated after write_flattened_sequence | 0.260747 / 0.255139 (0.005608) |
| read_unformated after write_nested_sequence | 0.283252 / 0.283200 (0.000053) |
| write_array2d | 0.075767 / 0.141683 (-0.065916) |
| write_flattened_sequence | 1.556745 / 1.452155 (0.104590) |
| write_nested_sequence | 1.579824 / 1.492716 (0.087107) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.255038 / 0.018006 (0.237032) |
| get_batch_of_1024_rows | 0.447096 / 0.000490 (0.446606) |
| get_first_row | 0.006810 / 0.000200 (0.006610) |
| get_last_row | 0.000280 / 0.000054 (0.000225) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.030377 / 0.037411 (-0.007034) |
| shard | 0.020141 / 0.014526 (0.005616) |
| shuffle | 0.026359 / 0.176557 (-0.150198) |
| sort | 0.172987 / 0.737135 (-0.564148) |
| train_test_split | 0.026966 / 0.296338 (-0.269372) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.384263 / 0.215209 (0.169054) |
| read 50000 | 3.826681 / 2.077655 (1.749027) |
| read_batch 50000 10 | 1.691373 / 1.504120 (0.187253) |
| read_batch 50000 100 | 1.559712 / 1.541195 (0.018517) |
| read_batch 50000 1000 | 1.672379 / 1.468490 (0.203889) |
| read_formatted numpy 5000 | 0.369508 / 4.584777 (-4.215269) |
| read_formatted pandas 5000 | 4.260061 / 3.745712 (0.514349) |
| read_formatted tensorflow 5000 | 1.939068 / 5.269862 (-3.330793) |
| read_formatted torch 5000 | 0.793298 / 4.565676 (-3.772379) |
| read_formatted_batch numpy 5000 10 | 0.044518 / 0.424275 (-0.379757) |
| read_formatted_batch numpy 5000 1000 | 0.010364 / 0.007607 (0.002757) |
| shuffled read 5000 | 0.475362 / 0.226044 (0.249318) |
| shuffled read 50000 | 4.763381 / 2.268929 (2.494453) |
| shuffled read_batch 50000 10 | 2.078293 / 55.444624 (-53.366331) |
| shuffled read_batch 50000 100 | 1.765534 / 6.876477 (-5.110942) |
| shuffled read_batch 50000 1000 | 1.889915 / 2.142072 (-0.252158) |
| shuffled read_formatted numpy 5000 | 0.475719 / 4.805227 (-4.329509) |
| shuffled read_formatted_batch numpy 5000 10 | 0.101620 / 6.500664 (-6.399044) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.052045 / 0.075469 (-0.023424) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.506190 / 1.841788 (-0.335598) |
| map fast-tokenizer batched | 11.740585 / 8.074308 (3.666277) |
| map identity | 26.651356 / 10.191392 (16.459964) |
| map identity batched | 0.707990 / 0.680424 (0.027566) |
| map no-op batched | 0.522295 / 0.534201 (-0.011906) |
| map no-op batched numpy | 0.369513 / 0.579283 (-0.209770) |
| map no-op batched pandas | 0.482216 / 0.434364 (0.047852) |
| map no-op batched pytorch | 0.230486 / 0.540337 (-0.309851) |
| map no-op batched tensorflow | 0.243948 / 1.386936 (-1.142988) |
