Add iter_archive #3066

Merged: 9 commits, Oct 18, 2021
Changes from all commits
11 changes: 10 additions & 1 deletion datasets/food101/README.md
@@ -259,7 +259,16 @@ The data instances have the following fields:

### Licensing Information

[More Information Needed]
LICENSE AGREEMENT
=================
- The Food-101 data set consists of images from Foodspotting [1] which are not
property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond
scientific fair use must be negociated with the respective picture owners
according to the Foodspotting terms of use [2].

[1] http://www.foodspotting.com/
[2] http://www.foodspotting.com/terms/


### Citation Information

2 changes: 1 addition & 1 deletion datasets/food101/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n title = {Food-101 -- Mining Discriminative Components with Random Forests},\n author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n booktitle = {European Conference on Computer Vision},\n year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "", "features": {"image": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": [{"task": "image-classification", "image_file_path_column": "image", "label_column": "label", "labels": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheese_plate", "cheesecake", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", 
"ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"]}], "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13210094, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 4403191, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": "d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}}, "download_size": 4996278331, "post_processing_size": null, "dataset_size": 17613285, "size_in_bytes": 5013891616}}
{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n title = {Food-101 -- Mining Discriminative Components with Random Forests},\n author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n booktitle = {European Conference on Computer Vision},\n year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "LICENSE AGREEMENT\n=================\n - The Food-101 data set consists of images from Foodspotting [1] which are not\n property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond\n scientific fair use must be negociated with the respective picture owners\n according to the Foodspotting terms of use [2].\n\n[1] http://www.foodspotting.com/\n[2] http://www.foodspotting.com/terms/\n", "features": {"image": {"filename": {"dtype": "string", "id": null, "_type": "Value"}, "data": {"dtype": "binary", "id": null, "_type": "Value"}}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": null, "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3843765322, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 1275549954, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": 
"d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt": {"num_bytes": 1468812, "checksum": "2920f7d55473974492b41a01241ccfd71df1b74d29d27b617337f840f58f77ab"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt": {"num_bytes": 489429, "checksum": "440d53374697d019a972fe66e8e44031ae80267a126ecb814ad537ec1fd506db"}}, "download_size": 4998236572, "post_processing_size": null, "dataset_size": 5119315276, "size_in_bytes": 10117551848}}
Binary file modified datasets/food101/dummy/0.0.0/dummy_data.zip
Binary file not shown.
60 changes: 42 additions & 18 deletions datasets/food101/food101.py
@@ -14,15 +14,16 @@
# limitations under the License.
"""Dataset class for Food-101 dataset."""

import json
from pathlib import Path

import datasets
from datasets.tasks import ImageClassification


_BASE_URL = "http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz"

_METADATA_URLS = {
"train": "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt",
"test": "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt",
}

_HOMEPAGE = "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/"

_DESCRIPTION = (
@@ -43,6 +44,18 @@
}
"""

_LICENSE = """\
LICENSE AGREEMENT
=================
- The Food-101 data set consists of images from Foodspotting [1] which are not
property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond
scientific fair use must be negociated with the respective picture owners
according to the Foodspotting terms of use [2].

[1] http://www.foodspotting.com/
[2] http://www.foodspotting.com/terms/
"""

_NAMES = [
"apple_pie",
"baby_back_ribs",
@@ -147,6 +160,8 @@
"waffles",
]

_IMAGES_DIR = "food-101/images/"


class Food101(datasets.GeneratorBasedBuilder):
"""Food-101 Images dataset."""
@@ -156,36 +171,45 @@ def _info(self):
description=_DESCRIPTION,
features=datasets.Features(
{
"image": datasets.Value("string"),
"image": {"filename": datasets.Value("string"), "data": datasets.Value("binary")},
"label": datasets.features.ClassLabel(names=_NAMES),
}
),
supervised_keys=("image", "label"),
homepage=_HOMEPAGE,
task_templates=[ImageClassification(image_file_path_column="image", label_column="label", labels=_NAMES)],
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
dl_path = Path(dl_manager.download_and_extract(_BASE_URL))
meta_path = dl_path / "food-101" / "meta"
image_dir_path = dl_path / "food-101" / "images"
archive_path = dl_manager.download(_BASE_URL)
split_metadata_paths = dl_manager.download(_METADATA_URLS)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"json_file_path": meta_path / "train.json", "image_dir_path": image_dir_path},
gen_kwargs={
"images": dl_manager.iter_archive(archive_path),
"metadata_path": split_metadata_paths["train"],
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"json_file_path": meta_path / "test.json", "image_dir_path": image_dir_path},
gen_kwargs={
"images": dl_manager.iter_archive(archive_path),
"metadata_path": split_metadata_paths["test"],
},
),
]

def _generate_examples(self, json_file_path, image_dir_path):
def _generate_examples(self, images, metadata_path):
"""Generate images and labels for splits."""
data = json.loads(json_file_path.read_text())
for label, images in data.items():
for image_name in images:
image = image_dir_path / f"{image_name}.jpg"
features = {"image": str(image), "label": label}
yield image_name, features
with open(metadata_path, encoding="utf-8") as f:
files_to_keep = set(f.read().split("\n"))
for file_path, file_obj in images:
if file_path.startswith(_IMAGES_DIR):
if file_path[len(_IMAGES_DIR) : -len(".jpg")] in files_to_keep:
label = file_path.split("/")[2]
yield file_path, {
"image": {"filename": file_path.split("/")[-1], "data": file_obj.read()},
"label": label,
}
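
Note: the rewritten loader above is the template this PR establishes. For reference, here is a minimal, self-contained sketch (not part of the diff; the builder class, features, and archive URL are hypothetical) of how a loading script consumes `dl_manager.iter_archive`: the manager yields `(path_inside_archive, file_obj)` tuples in archive order, so the generator filters by path instead of walking an extracted directory tree.

```python
import datasets

# A minimal sketch, assuming a hypothetical dataset and placeholder URL.
class ToyArchiveDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "image": {"filename": datasets.Value("string"), "data": datasets.Value("binary")},
                    "label": datasets.Value("string"),
                }
            )
        )

    def _split_generators(self, dl_manager):
        # download() returns the archive path without extracting it
        archive_path = dl_manager.download("https://example.com/data.tar.gz")  # placeholder URL
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": dl_manager.iter_archive(archive_path)},
            )
        ]

    def _generate_examples(self, files):
        # files yields (path_inside_archive, file_obj) in archive order
        for path, file_obj in files:
            if path.endswith(".jpg"):
                # Consume file_obj before advancing: in streaming mode the
                # underlying TAR is read strictly sequentially.
                yield path, {
                    "image": {"filename": path.split("/")[-1], "data": file_obj.read()},
                    "label": path.split("/")[-2],
                }
```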
2 changes: 1 addition & 1 deletion src/datasets/utils/download_manager.py
@@ -239,7 +239,7 @@ def iter_archive(self, path):
relative_file_path = os.path.join(relative_dir_path, name)
absolute_file_path = os.path.join(root, name)
with open(absolute_file_path, "rb") as file_obj:
yield (relative_file_path, file_obj)
yield (relative_file_path.replace(os.sep, "/"), file_obj)

def extract(self, path_or_paths, num_proc=None):
"""Extract given path(s).
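
The one-line change above normalizes extracted paths to POSIX form, so that prefixes such as `food-101/images/` match on Windows as well as Linux. A minimal illustration (the path components are illustrative):

```python
import os

# On Windows, os.path.join yields backslash-separated paths; converting to
# POSIX form lets loading scripts match archive-relative prefixes on any OS.
relative_file_path = os.path.join("food-101", "images", "pizza", "100.jpg")
posix_path = relative_file_path.replace(os.sep, "/")
assert posix_path.startswith("food-101/images/")  # matches _IMAGES_DIR everywhere
```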
6 changes: 6 additions & 0 deletions src/datasets/utils/mock_download_manager.py
@@ -207,3 +207,9 @@ def delete_extracted_files(self):

    def manage_extracted_files(self):
        pass

    def iter_archive(self, path):
        path = Path(path)
        for file_path in path.rglob("*"):
            if file_path.is_file() and not file_path.name.startswith(".") and not file_path.name.startswith("__"):
                yield file_path.relative_to(path).as_posix(), file_path.open("rb")
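
The mock manager walks the extracted dummy-data directory and yields the same `(posix_path, file_obj)` pairs as the real `iter_archive`, so dummy-data tests exercise the identical generator code. A runnable sketch of the layout it traverses, with an illustrative directory name:

```python
from pathlib import Path

# Build a tiny stand-in for an extracted dummy_data archive.
base = Path("dummy_data") / "food-101" / "images" / "pizza"
base.mkdir(parents=True, exist_ok=True)
(base / "100.jpg").write_bytes(b"\xff\xd8 fake jpeg bytes")

root = Path("dummy_data")
for file_path in sorted(root.rglob("*")):
    if file_path.is_file() and not file_path.name.startswith((".", "__")):
        with file_path.open("rb") as file_obj:
            # Prints the archive-relative POSIX path and the payload size.
            print(file_path.relative_to(root).as_posix(), len(file_obj.read()))
```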
49 changes: 46 additions & 3 deletions src/datasets/utils/streaming_download_manager.py
@@ -1,6 +1,7 @@
import glob
import os
import re
import tarfile
import time
from pathlib import Path, PurePosixPath
from typing import Optional, Tuple, Union
@@ -19,7 +20,21 @@

logger = get_logger(__name__)

BASE_KNOWN_EXTENSIONS = ["txt", "csv", "json", "jsonl", "tsv", "conll", "conllu", "parquet", "pkl", "pickle", "xml"]
BASE_KNOWN_EXTENSIONS = [
"txt",
"csv",
"json",
"jsonl",
"tsv",
"conll",
"conllu",
"orig",
"parquet",
"pkl",
"pickle",
"rel",
"xml",
]
COMPRESSION_EXTENSION_TO_PROTOCOL = {
# single file compression
**{fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS},
@@ -149,7 +164,9 @@ def _get_extraction_protocol(urlpath: str) -> Optional[str]:
if extension in BASE_KNOWN_EXTENSIONS:
return None
elif path.endswith(".tar.gz") or path.endswith(".tgz"):
pass
raise NotImplementedError(
f"Extraction protocol for TAR archives like '{urlpath}' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead."
)
elif extension in COMPRESSION_EXTENSION_TO_PROTOCOL:
return COMPRESSION_EXTENSION_TO_PROTOCOL[extension]
raise NotImplementedError(f"Extraction protocol '{extension}' for file at '{urlpath}' is not implemented yet")
@@ -339,11 +356,37 @@ def _extract(self, urlpath: str) -> str:
inner_file = inner_file[: inner_file.rindex(".")]
# check for tar.gz, tar.bz2 etc.
if inner_file.endswith(".tar"):
return f"tar://::{urlpath}"
return f"tar://::{protocol}://{inner_file}::{urlpath}"
else:
return f"{protocol}://{inner_file}::{urlpath}"
else:
return f"{protocol}://::{urlpath}"

    def download_and_extract(self, url_or_urls):
        return self.extract(self.download(url_or_urls))

    def iter_archive(self, urlpath: str):
        """Returns an iterator over the files within a TAR archive, streamed on the fly.

        Args:
            urlpath: path or URL to the archive.

        Yields:
            Tuple of (path_within_archive, file_obj).
            File objects are opened in binary mode (io.BufferedReader).
        """
        with xopen(urlpath, "rb", use_auth_token=self._download_config.use_auth_token) as f:
            stream = tarfile.open(fileobj=f, mode="r|*")
            for tarinfo in stream:
                file_path = tarinfo.name
                if not tarinfo.isreg():
                    continue
                if file_path is None:
                    continue
                if os.path.basename(file_path).startswith(".") or os.path.basename(file_path).startswith("__"):
                    # skip hidden files and metadata entries such as "__MACOSX"
                    continue
                file_obj = stream.extractfile(tarinfo)
                yield (file_path, file_obj)
                stream.members = []
            del stream
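
The streaming `iter_archive` relies on `tarfile`'s pipe mode `"r|*"`, which reads the archive strictly forward (no seeking, compression auto-detected) and therefore works over an HTTP stream opened with `xopen`; resetting `stream.members` keeps `tarfile` from accumulating a `TarInfo` list in memory. A self-contained sketch of the pattern, using an in-memory archive purely so the example runs anywhere:

```python
import io
import tarfile

# Build a tiny gzip-compressed TAR in memory to stand in for a remote archive.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tar:
    data = b"fake jpeg bytes"
    info = tarfile.TarInfo(name="food-101/images/pizza/100.jpg")
    info.size = len(data)
    tar.addfile(info, io.BytesIO(data))
buf.seek(0)

# Mode "r|*" treats the file object as a forward-only pipe: no seeking is
# required, which is what makes this usable over a remote HTTP stream.
stream = tarfile.open(fileobj=buf, mode="r|*")
for tarinfo in stream:
    if tarinfo.isreg():
        file_obj = stream.extractfile(tarinfo)
        print(tarinfo.name, file_obj.read())
    # Dropping the accumulated member list keeps memory flat on large archives.
    stream.members = []
```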