Move task casting to builder

This makes sense for two reasons: 1) to handle both Dataset and DatasetDict objects, and 2) to be closer to the per-split post-processing logic.
huggingface · May 7, 2021 · b2a02c5 · b2a02c5 · github-actions · May 7, 2021
1 parent 3cf039d
commit b2a02c5
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 5 deletions.
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
@@ -204,6 +204,7 @@ def __init__(
         name: Optional[str] = None,
         hash: Optional[str] = None,
         features: Optional[Features] = None,
+        task=None,
         **config_kwargs,
     ):
         """Constructs a DatasetBuilder.
@@ -226,6 +227,7 @@ def __init__(
         # DatasetBuilder name
         self.name: str = camelcase_to_snakecase(self.__class__.__name__)
         self.hash: Optional[str] = hash
+        self.task = task
 
         # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
         config_kwargs = {key: value for key, value in config_kwargs.items() if value is not None}
@@ -813,6 +815,14 @@ def _build_single_dataset(
                         )
                     else:
                         ds.info.features = self.info.post_processed.features
+            # Rename feature column names to match task schema
+            tasks = [template.task for template in self.info.task_templates]
+            if self.task not in tasks:
+                raise ValueError(f"Task {self.task} not found! Avaliable tasks: {tasks}")
+            else:
+                for template in self.info.task_templates:
+                    if template.task == self.task:
+                        ds = ds.rename_columns(template.column_mapping)
 
         return ds
 

diff --git a/src/datasets/load.py b/src/datasets/load.py
@@ -735,6 +735,7 @@ def load_dataset(
         data_files=data_files,
         hash=hash,
         features=features,
+        task=task,
         **config_kwargs,
     )
 
@@ -757,11 +758,6 @@ def load_dataset(
         keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
     )
     ds = builder_instance.as_dataset(split=split, ignore_verifications=ignore_verifications, in_memory=keep_in_memory)
-    # Rename feature column names to match task schema
-    for template in builder_instance.info.task_templates:
-        if template.task == task:
-            for k, v in template.column_mapping.items():
-                ds = ds.rename_column(k, v)
     if save_infos:
         builder_instance._save_infos()