Task casting for text classification & question answering #2255

Merged on May 18, 2021 · 66 commits · changes below shown from the first 18 commits

Commits
9a19cf2
WIP: task templates in datasets info
SBrandeis Apr 23, 2021
7ab394a
Add rename_columnS method
SBrandeis Apr 23, 2021
e5cef8d
WIP: Add prepare_for-task method
SBrandeis Apr 23, 2021
4564eb9
WIP: usage example
SBrandeis Apr 23, 2021
29650f8
Code quality
SBrandeis Apr 23, 2021
1516ae6
Merge branch 'master' into sbrandeis/task_casting
SBrandeis Apr 30, 2021
2157bf5
wip: Add text_classification pipeline
SBrandeis Apr 30, 2021
9302893
Decorate TaskTemplate with dataclass
lewtun May 4, 2021
02bcfa2
Add text classification template to emotion dataset_infos.json
lewtun May 6, 2021
531cf94
Fix key name in task_template_from_dict
lewtun May 6, 2021
d3bccca
Refactor TextClassification init and fix from_dict keys
lewtun May 6, 2021
6dfd7f2
Remove unused import
lewtun May 7, 2021
b0899b1
Add task specification to load_dataset
lewtun May 7, 2021
3cf039d
Rename emotion columns for task template testing
lewtun May 7, 2021
b2a02c5
Move task casting to builder
lewtun May 7, 2021
28e6ab1
Refactor task loading with prepare_for_task method
lewtun May 7, 2021
7f0f683
Add docstring and error message to prepare_for_task
lewtun May 7, 2021
3a30b62
Merge branch 'master' into sbrandeis/task_casting
lewtun May 7, 2021
2ec62a2
Add TODO in docstring
lewtun May 7, 2021
5df3b65
Update datas_infos.json for emotion dataset
lewtun May 7, 2021
35e6110
Call task preparation in load_dataset instead of builder
lewtun May 10, 2021
e21d728
Extend prepare_for_task to handle nested columns
lewtun May 10, 2021
8d5427d
Fix dataclass inheritance for QuestionAnswering task template
lewtun May 10, 2021
c344557
Unflatten column renaming
lewtun May 10, 2021
6220e7f
Replace nested question-answering columns with outer answers column
lewtun May 10, 2021
388c5d4
Revert emotion dataset
lewtun May 10, 2021
d759f65
Merge branch 'master' into sbrandeis/task_casting
lewtun May 10, 2021
6b851aa
Fix imports
lewtun May 10, 2021
95d395e
Revert emotion dataset for real!
lewtun May 10, 2021
6d74541
Remove label_mapping from TextClassification template
lewtun May 10, 2021
0f6513a
Rename TextClassification task labels to "labels" for Trainer compati…
lewtun May 10, 2021
d3cdf78
Add unit tests for text classification & question answering tasks
lewtun May 10, 2021
34aa06e
Fix style
lewtun May 10, 2021
fe61d2c
Remove setup from question answering test
lewtun May 10, 2021
46beea9
Clean up dataset memory after test
lewtun May 10, 2021
c83e501
Rename task names to use hyphen instead of underscore
lewtun May 11, 2021
db52ce7
Integrate review comments
lewtun May 12, 2021
3ad17f3
Remove task template from SQuAD
lewtun May 12, 2021
14e6e5b
Add hashing of task templates to enable set intersections
lewtun May 12, 2021
316cdb0
Filter Nones from task templates intersection
lewtun May 12, 2021
8e2299a
Fix style & quality
lewtun May 12, 2021
40de679
Clean up docstring
lewtun May 12, 2021
b67ed1f
Use context manager for in_memory dataset test
lewtun May 12, 2021
944b22f
Handle cases where task templates are none in merge
lewtun May 12, 2021
1cb8c93
Add more context managers!
lewtun May 12, 2021
6858125
Add tests for task template concatenation
lewtun May 14, 2021
6c6b3c9
Mark task name and schemas a class variables
lewtun May 14, 2021
4d40558
Replace custom hash functions with frozen dataclass decorators
lewtun May 14, 2021
7076902
Replace init with post_init in task templates
lewtun May 14, 2021
81674cc
Add supported tasks to docstring
lewtun May 14, 2021
3e39872
Add supported tasks to prepare_for_task docstring
lewtun May 14, 2021
7abd2f0
Update dataset_dict doctstring
lewtun May 14, 2021
441a36e
Add TODO and tweak docstrings
lewtun May 14, 2021
894a509
Merge branch 'master' into sbrandeis/task_casting
lewtun May 14, 2021
56635e7
Allow TaskTemplate objects to be passed to prepare_for_task
lewtun May 17, 2021
1274aa5
Add tests for invalid templates
lewtun May 17, 2021
5e48403
Update docstring for prepare_for_task
lewtun May 17, 2021
5a6021a
Merge branch 'master' into sbrandeis/task_casting
lewtun May 17, 2021
a868d35
Fix docstrings and add to HTML build
lewtun May 17, 2021
99200c7
Remove redundant context manager in tests
lewtun May 17, 2021
f895389
Add TaskTemplate to typing of load_dataset
lewtun May 18, 2021
290b583
Remove default value for label_schema in TextClassificationTemplate
lewtun May 18, 2021
3578dbd
Revert default value for label_schema and add TODO
lewtun May 18, 2021
7a0de24
Add and improve docstrings for task templates
lewtun May 18, 2021
6bf7970
Add list of task templates to docs
lewtun May 18, 2021
2dc8d26
Migrate task templates to dedicated section of docs
lewtun May 18, 2021
2 changes: 1 addition & 1 deletion datasets/emotion/dataset_infos.json
@@ -1 +1 @@
{"emotion":{"description":"Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the\npaper.\n","citation":"@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n","homepage":"https://github.com/dair-ai/emotion_dataset","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"},"label":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"builder_name":"emotion","config_name":"emotion","version":{"version_str":"0.1.0","description":"First Emotion release","major":0,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1754632,"num_examples":16000,"dataset_name":"emotion"},"validation":{"name":"validation","num_bytes":216248,"num_examples":2000,"dataset_name":"emotion"},"test":{"name":"test","num_bytes":218768,"num_examples":2000,"dataset_name":"emotion"}},"download_checksums":{"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1":{"num_bytes":1658616,"checksum":"3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"},"https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1":{"num_bytes":204240,"checksum":"34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"},"https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1":{"num_bytes":206760,"checksum":"60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}},"download_size":2069616,"post_processing_size":null,"dataset_size":2189648,"size_in_bytes":4259264},"default":{"description":"Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. 
For more detailed information please refer to the paper.\n","citation":"@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n","homepage":"https://github.com/dair-ai/emotion_dataset","license":"","features":{"text":{"dtype":"string","id":null,"_type":"Value"},"label":{"num_classes":6,"names":["sadness","joy","love","anger","fear","surprise"],"names_file":null,"id":null,"_type":"ClassLabel"}},"post_processed":null,"supervised_keys":{"input":"text","output":"label"},"builder_name":"emotion","config_name":"default","version":{"version_str":"0.0.0","description":null,"major":0,"minor":0,"patch":0},"splits":{"train":{"name":"train","num_bytes":1741541,"num_examples":16000,"dataset_name":"emotion"},"validation":{"name":"validation","num_bytes":214699,"num_examples":2000,"dataset_name":"emotion"},"test":{"name":"test","num_bytes":217177,"num_examples":2000,"dataset_name":"emotion"}},"download_checksums":{"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1":{"num_bytes":1658616,"checksum":"3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"},"https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1":{"num_bytes":204240,"checksum":"34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"},"https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1":{"num_bytes":206760,"checksum":"60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}},"download_size":2069616,"post_processing_size":null,"dataset_size":2173417,"size_in_bytes":4243033}}
{"emotion": {"description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.\n", "citation": "@inproceedings{saravia-etal-2018-carer,\n title = \"{CARER}: Contextualized Affect Representations for Emotion Recognition\",\n author = \"Saravia, Elvis and\n Liu, Hsien-Chi Toby and\n Huang, Yen-Hao and\n Wu, Junlin and\n Chen, Yi-Shin\",\n booktitle = \"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing\",\n month = oct # \"-\" # nov,\n year = \"2018\",\n address = \"Brussels, Belgium\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/D18-1404\",\n doi = \"10.18653/v1/D18-1404\",\n pages = \"3687--3697\",\n abstract = \"Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.\",\n}\n", "homepage": "https://github.com/dair-ai/emotion_dataset", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 6, "names": ["sadness", "joy", "love", "anger", "fear", "surprise"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "text", "output": "label"}, "builder_name": "emotion", "config_name": "emotion", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1741541, "num_examples": 16000, "dataset_name": "emotion"}, "validation": {"name": "validation", "num_bytes": 214699, "num_examples": 2000, "dataset_name": "emotion"}, "test": {"name": "test", "num_bytes": 217177, "num_examples": 2000, "dataset_name": "emotion"}}, "download_checksums": {"https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1": {"num_bytes": 1658616, "checksum": "3ab03d945a6cb783d818ccd06dafd52d2ed8b4f62f0f85a09d7d11870865b190"}, "https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt?dl=1": {"num_bytes": 204240, "checksum": "34faaa31962fe63cdf5dbf6c132ef8ab166c640254ab991af78f3aea375e79ef"}, "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1": {"num_bytes": 206760, "checksum": "60f531690d20127339e7f054edc299a82c627b5ec0dd5d552d53d544e0cfcc17"}}, "download_size": 2069616, "post_processing_size": null, "dataset_size": 2173417, "size_in_bytes": 4243033, "task_templates": [{"task": "text_classification", "input_schema": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "label_schema": {"label": {"num_classes": 6, "names": ["anger", "fear", "joy", "love", "sadness", "surprise"], "names_file": null, "id": null, "_type": "ClassLabel"}}}]}}
21 changes: 19 additions & 2 deletions datasets/emotion/emotion.py
@@ -1,6 +1,7 @@
import csv

import datasets
from datasets.tasks import TextClassification


_CITATION = """\
@@ -33,17 +34,33 @@
_TEST_DOWNLOAD_URL = "https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt?dl=1"


class EmotionConfig(datasets.BuilderConfig):
def __init__(self, **kwargs):
super(EmotionConfig, self).__init__(**kwargs)


class Emotion(datasets.GeneratorBasedBuilder):

VERSION = datasets.Version("0.1.0")
BUILDER_CONFIGS = [
EmotionConfig(
name="emotion",
version=datasets.Version("1.0.0"),
description="Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
)
]

def _info(self):
class_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{"text": datasets.Value("string"), "label": datasets.ClassLabel(names=class_names)}
{"tweet": datasets.Value("string"), "emotion": datasets.ClassLabel(names=class_names)}
),
supervised_keys=("text", "label"),
homepage=_URL,
citation=_CITATION,
task_templates=[TextClassification(labels=class_names, text_column="tweet", label_column="emotion")],
)

def _split_generators(self, dl_manager):
@@ -63,4 +80,4 @@ def _generate_examples(self, filepath):
csv_reader = csv.reader(csv_file, delimiter=";")
for id_, row in enumerate(csv_reader):
text, label = row
yield id_, {"text": text, "label": label}
yield id_, {"tweet": text, "emotion": label}
4 changes: 4 additions & 0 deletions datasets/squad/squad.py
@@ -20,6 +20,7 @@
import json

import datasets
from datasets.tasks import QuestionAnswering


logger = datasets.logging.get_logger(__name__)
@@ -98,6 +99,9 @@ def _info(self):
supervised_keys=None,
homepage="https://rajpurkar.github.io/SQuAD-explorer/",
citation=_CITATION,
task_templates=[
QuestionAnswering(),
lewtun (Member) commented on May 10, 2021:

should we keep this here in this pr?

more generally, am i correct in understanding that we'll need to add the relevant tasks to each of the ~600 canonical datasets at some point?

SBrandeis (Contributor, PR author) replied:

This was merely a demonstrator for the syntax, to validate the API "feel" from the user perspective when adding a dataset.
I don't think it makes much sense to keep it in the PR unless we test something specific on SQuAD.

SBrandeis (Contributor, PR author) replied:

> more generally, am i correct in understanding that we'll need to add the relevant tasks to each of the ~600 canonical datasets at some point?

Yes... To make our lives easier, we need to find a way to infer supported tasks automatically for simple datasets, without changing the datasets scripts. For example, we could check if the dataset is already formatted as expected (no need for column renaming / casting).

lewtun (Member) replied on May 11, 2021:

good point re task detection! i wonder if we could do this with a simple check on whether the schema features are a subset of the dataset features, e.g.

from datasets import Features, load_dataset
from datasets.tasks import QuestionAnswering

squad = load_dataset("./datasets/squad", split="train")
qa = QuestionAnswering()
schema = Features({**qa.input_schema, **qa.label_schema})
assert all(item in squad.features.items() for item in schema.items())

],
)

def _split_generators(self, dl_manager):
27 changes: 27 additions & 0 deletions src/datasets/arrow_dataset.py
@@ -1372,6 +1372,33 @@ def with_transform(
dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns)
return dataset

def prepare_for_task(self, task: str) -> "Dataset":
"""Prepares a dataset for the given task.

Casts :attr:`datasets.DatasetInfo.features` according to a task-specific schema.

Args:
task (``str``): One of the compatible tasks in ['text_classification', 'question_answering']
"""
tasks = [template.task for template in (self.info.task_templates or [])]
compatible_templates = [template for template in (self.info.task_templates or []) if template.task == task]
if not compatible_templates:
raise ValueError(f"Task {task} is not compatible with this dataset! Available tasks: {tasks}")

if len(compatible_templates) > 1:
raise ValueError(
f"""Expected 1 task template but found {len(compatible_templates)}! Please ensure that \
:attr:`datasets.DatasetInfo.task_templates` contains a unique set of task types."""
)
template = compatible_templates[0]
column_mapping = template.column_mapping
columns_to_drop = [column for column in self.column_names if column not in column_mapping]
dataset = self.remove_columns(columns_to_drop)
# TODO(sbrandeis): Add support for unnesting columns too
lewtun (Member) commented:

hey @SBrandeis could you clarify what you mean by "unnesting columns" here?

do you mean that Datasets.rename_columns should support columns with nested features like answers.text in SQuAD:

features=datasets.Features(
    {
        "id": datasets.Value("string"),
        "title": datasets.Value("string"),
        "context": datasets.Value("string"),
        "question": datasets.Value("string"),
        "answers": datasets.features.Sequence(
            {
                "text": datasets.Value("string"),
                "answer_start": datasets.Value("int32"),
            }
        ),
    }
)

SBrandeis (Contributor, PR author) replied:

Hi @lewtun! Yes that's correct

dataset = dataset.rename_columns(column_mapping)
dataset = dataset.cast(features=template.features)
return dataset

def _getitem(
self,
key: Union[int, slice, str],
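As a self-contained illustration of prepare_for_task, the following sketch attaches a template to an in-memory dataset and prepares it. It assumes the TextClassification signature used in the emotion diff earlier; the toy column names ("review", "sentiment", "extra") are invented for the example:

from datasets import Dataset, DatasetInfo
from datasets.tasks import TextClassification

# Toy dataset with non-canonical column names plus an extra column.
info = DatasetInfo(
    task_templates=[TextClassification(labels=["neg", "pos"], text_column="review", label_column="sentiment")]
)
ds = Dataset.from_dict(
    {"review": ["great film", "terrible film"], "sentiment": [1, 0], "extra": ["a", "b"]},
    info=info,
)
prepared = ds.prepare_for_task("text_classification")
# "extra" is dropped, the remaining columns are renamed to the template's
# canonical names, and the features are cast to the template's schema.
print(prepared.column_names)  # expected: ['text', 'label']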
5 changes: 5 additions & 0 deletions src/datasets/builder.py
@@ -204,6 +204,7 @@
name: Optional[str] = None,
hash: Optional[str] = None,
features: Optional[Features] = None,
task=None,
**config_kwargs,
):
"""Constructs a DatasetBuilder.
@@ -226,6 +227,7 @@
# DatasetBuilder name
self.name: str = camelcase_to_snakecase(self.__class__.__name__)
self.hash: Optional[str] = hash
self.task = task

# Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
config_kwargs = {key: value for key, value in config_kwargs.items() if value is not None}
@@ -813,6 +815,9 @@ def _build_single_dataset(
)
else:
ds.info.features = self.info.post_processed.features
# Rename and cast features to match task schema
if self.task is not None:
ds = ds.prepare_for_task(self.task)

return ds

21 changes: 21 additions & 0 deletions src/datasets/info.py
@@ -39,6 +39,7 @@
from . import config
from .features import Features, Value
from .splits import SplitDict
from .tasks import TaskTemplate, task_template_from_dict
from .utils import Version
from .utils.logging import get_logger

@@ -108,6 +109,7 @@ class DatasetInfo:
post_processing_size (int, optional):
dataset_size (int, optional):
size_in_bytes (int, optional):
task_templates (List[TaskTemplate], optional): TODO(sbrandeis) - document this
"""

# Set in the dataset scripts
Expand All @@ -131,6 +133,8 @@ class DatasetInfo:
dataset_size: Optional[int] = None
size_in_bytes: Optional[int] = None

task_templates: Optional[List[TaskTemplate]] = None

def __post_init__(self):
# Convert back to the correct classes when we reload from dict
if self.features is not None and not isinstance(self.features, Features):
@@ -150,6 +154,19 @@ def __post_init__(self):
else:
self.supervised_keys = SupervisedKeysData(**self.supervised_keys)

if self.task_templates is not None:
if isinstance(self.task_templates, (list, tuple)):
templates = [
template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
for template in self.task_templates
]
self.task_templates = [template for template in templates if template is not None]
elif isinstance(self.task_templates, TaskTemplate):
self.task_templates = [self.task_templates]
else:
template = task_template_from_dict(self.task_templates)
self.task_templates = [template] if template is not None else []

def _license_path(self, dataset_info_dir):
return os.path.join(dataset_info_dir, config.LICENSE_FILENAME)

@@ -189,13 +206,17 @@ def unique(values):
features = None
supervised_keys = None

# Can be prepared for a pipeline if all members can also be prepared for this specific pipeline
task_templates = None # TODO(sbrandeis)
lewtun (Member) commented:

@SBrandeis can you clarify what is meant by this TODO? in particular, what does "members" refer to?

SBrandeis (Contributor, PR author) replied:

Of course 😄

For context, this method is called when concatenating datasets with the concatenate_datasets method. The DatasetInfo of the concatenated Dataset object is constructed using DatasetInfo.from_merge: https://github.com/huggingface/datasets/blob/master/src/datasets/arrow_dataset.py#L3098

What I meant here is that it would be nice to propagate the task_templates from the component Dataset objects to the resulting concatenated Dataset object (I'm happy to reformulate if it's not clear 😅). "Members" refers to the component Dataset objects (the ones being concatenated).

If all the component Datasets can be prepared for the question_answering task / pipeline, then the resulting concatenated Dataset can also be prepared for the question_answering task.

Let me know if this helps 😄

lewtun (Member) replied:

thanks a lot - this is now perfectly clear! i'll have a stab at it in this pr


return cls(
description=description,
citation=citation,
homepage=homepage,
license=license,
features=features,
supervised_keys=supervised_keys,
task_templates=task_templates,
)

@classmethod
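To see the __post_init__ normalization above in action, here is a small sketch: templates serialized as plain dicts are converted back to TaskTemplate instances, and unrecognized entries are filtered out. The dict keys mirror QuestionAnswering.from_dict as shown later in this diff:

from datasets import DatasetInfo
from datasets.tasks import QuestionAnswering

info = DatasetInfo(
    task_templates=[
        {
            "task": "question_answering",
            "question_column": "question",
            "context_column": "context",
            "answer_start_column": "answer_start",
            "answer_end_column": "answer_end",
        },
        {"task": "not-a-known-task"},  # unknown task name -> dropped by the None filter
    ]
)
assert isinstance(info.task_templates[0], QuestionAnswering)
assert len(info.task_templates) == 1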
2 changes: 2 additions & 0 deletions src/datasets/load.py
@@ -635,6 +635,7 @@ def load_dataset(
save_infos: bool = False,
script_version: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
task: Optional[str] = None,
SBrandeis (Contributor, PR author) commented:

Should we add TaskTemplate here too? 👀

lewtun (Member) replied:

good catch!

**config_kwargs,
) -> Union[DatasetDict, Dataset]:
"""Load a dataset.
@@ -730,6 +731,7 @@
data_files=data_files,
hash=hash,
features=features,
task=task,
**config_kwargs,
)

21 changes: 21 additions & 0 deletions src/datasets/tasks/__init__.py
@@ -0,0 +1,21 @@
from typing import Optional

from .base import TaskTemplate
from .question_answering import QuestionAnswering
from .text_classification import TextClassification


__all__ = ["TaskTemplate", "QuestionAnswering", "TextClassification"]


NAME2TEMPLATE = {QuestionAnswering.task: QuestionAnswering, TextClassification.task: TextClassification}


def task_template_from_dict(task_template_dict: dict) -> Optional[TaskTemplate]:
A reviewer (Member) commented:

Is this function supposed to be used by the user? If so, it should have a docstring and documentation.

lewtun (Member) replied:

no this is not supposed to be used by the user (at least for now). still no harm in having a docstring, so i've added:

"""Create one of the supported task templates in :obj:`datasets.tasks` from a dictionary."""

task_name = task_template_dict.get("task")
if task_name is None:
return None
template = NAME2TEMPLATE.get(task_name)
if template is None:
return None
return template.from_dict(task_template_dict)
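A quick sketch of the dispatch behavior above; the dict keys follow the from_dict implementations in this PR:

from datasets.tasks import QuestionAnswering, task_template_from_dict

template = task_template_from_dict(
    {
        "task": "question_answering",
        "question_column": "question",
        "context_column": "context",
        "answer_start_column": "answer_start",
        "answer_end_column": "answer_end",
    }
)
assert isinstance(template, QuestionAnswering)
# A missing or unknown "task" key falls through to None rather than raising.
assert task_template_from_dict({"task": "tagging"}) is None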
26 changes: 26 additions & 0 deletions src/datasets/tasks/base.py
@@ -0,0 +1,26 @@
import abc
from dataclasses import dataclass
from typing import Dict

from ..features import Features


@dataclass
class TaskTemplate(abc.ABC):
task: str
input_schema: Features
label_schema: Features

@property
def features(self) -> Features:
return Features(**self.input_schema, **self.label_schema)

@property
@abc.abstractmethod
def column_mapping(self) -> Dict[str, str]:
return NotImplemented

@classmethod
@abc.abstractmethod
def from_dict(cls, template_dict: dict) -> "TaskTemplate":
return NotImplemented
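For contributors adding new tasks, a hypothetical third template would follow the same pattern as the two concrete templates in this PR; the Summarization name and its columns are invented for illustration:

from typing import Dict

from datasets import Features, Value
from datasets.tasks import TaskTemplate


class Summarization(TaskTemplate):
    # Class-level task name and schemas, mirroring QuestionAnswering below.
    task = "summarization"
    input_schema = Features({"document": Value("string")})
    label_schema = Features({"summary": Value("string")})

    def __init__(self, document_column: str = "document", summary_column: str = "summary"):
        self.document_column = document_column
        self.summary_column = summary_column

    @property
    def column_mapping(self) -> Dict[str, str]:
        return {self.document_column: "document", self.summary_column: "summary"}

    @classmethod
    def from_dict(cls, template_dict: dict) -> "Summarization":
        return cls(
            document_column=template_dict["document_column"],
            summary_column=template_dict["summary_column"],
        )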
40 changes: 40 additions & 0 deletions src/datasets/tasks/question_answering.py
@@ -0,0 +1,40 @@
from typing import Dict

from ..features import Features, Value
from .base import TaskTemplate


class QuestionAnswering(TaskTemplate):
A reviewer (Member) commented:

QuestionAnsweringExtractive ?

lewtun (Member) replied:

thanks for the suggestion! i'll keep this unchanged for now since it will soon be overhauled by #2371

task = "question_answering"
input_schema = Features({"question": Value("string"), "context": Value("string")})
A reviewer (Member) commented:

Maybe you want to check with a few QA datasets that this schema makes sense. NaturalQuestions and TriviaQA, for example, can be good second datasets to compare against, to be sure of the generality of the schema.

A good recent list of QA datasets to compare schemas across is, for instance, in the UnitedQA paper: https://arxiv.org/abs/2101.00178

lewtun (Member) replied:

thanks for the tip! added to #2371

label_schema = Features({"answer_start": Value("int32"), "answer_end": Value("int32")})

def __init__(
self,
question_column: str = "question",
context_column: str = "context",
answer_start_column: str = "answer_start",
answer_end_column: str = "answer_end",
):
self.question_column = question_column
self.context_column = context_column
self.answer_start_column = answer_start_column
self.answer_end_column = answer_end_column

@property
def column_mapping(self) -> Dict[str, str]:
return {
self.question_column: "question",
self.context_column: "context",
self.answer_start_column: "answer_start",
self.answer_end_column: "answer_end",
}

@classmethod
def from_dict(cls, template_dict: dict) -> "QuestionAnswering":
return cls(
question_column=template_dict["question_column"],
context_column=template_dict["answer_column"],
answer_start_column=template_dict["answer_start_column"],
answer_end_column=template_dict["answer_end_column"],
)
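Finally, a short usage sketch of the template above, mapping non-canonical column names (invented here) onto the canonical question-answering schema:

from datasets.tasks import QuestionAnswering

qa = QuestionAnswering(
    question_column="query",
    context_column="passage",
    answer_start_column="span_start",
    answer_end_column="span_end",
)
# column_mapping tells prepare_for_task how to rename the dataset's columns.
print(qa.column_mapping)
# {'query': 'question', 'passage': 'context',
#  'span_start': 'answer_start', 'span_end': 'answer_end'}
# features merges input_schema and label_schema into a single Features object.
print(qa.features)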