update dummy data

huggingface · Oct 12, 2021 · 2c1b6fd · 2c1b6fd · github-actions · Oct 12, 2021
1 parent 500d1ce
commit 2c1b6fd
Show file tree

Hide file tree

Showing 5 changed files with 8 additions and 9 deletions.
diff --git a/datasets/food101/dataset_infos.json b/datasets/food101/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n  title = {Food-101 -- Mining Discriminative Components with Random Forests},\n  author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n  booktitle = {European Conference on Computer Vision},\n  year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "LICENSE AGREEMENT\n=================\n - The Food-101 data set consists of images from Foodspotting [1] which are not\n   property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond\n   scientific fair use must be negociated with the respective picture owners\n   according to the Foodspotting terms of use [2].\n\n[1] http://www.foodspotting.com/\n[2] http://www.foodspotting.com/terms/\n", "features": {"image": {"dtype": "binary", "id": null, "_type": "Value"}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", 
"fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": [{"task": "image-classification", "image_file_path_column": "image", "label_column": "label", "labels": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheese_plate", "cheesecake", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", 
"lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"]}], "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3842654228, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 1275179763, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": "d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt": {"num_bytes": 1468812, "checksum": "2920f7d55473974492b41a01241ccfd71df1b74d29d27b617337f840f58f77ab"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt": {"num_bytes": 489429, "checksum": "440d53374697d019a972fe66e8e44031ae80267a126ecb814ad537ec1fd506db"}}, "download_size": 4998236572, "post_processing_size": null, "dataset_size": 5117833991, "size_in_bytes": 10116070563}}
+{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n  title = {Food-101 -- Mining Discriminative Components with Random Forests},\n  author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n  booktitle = {European Conference on Computer Vision},\n  year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "LICENSE AGREEMENT\n=================\n - The Food-101 data set consists of images from Foodspotting [1] which are not\n   property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond\n   scientific fair use must be negociated with the respective picture owners\n   according to the Foodspotting terms of use [2].\n\n[1] http://www.foodspotting.com/\n[2] http://www.foodspotting.com/terms/\n", "features": {"image": {"dtype": "binary", "id": null, "_type": "Value"}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", 
"fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": null, "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3842654228, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 1275179763, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": "d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt": {"num_bytes": 1468812, "checksum": "2920f7d55473974492b41a01241ccfd71df1b74d29d27b617337f840f58f77ab"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt": {"num_bytes": 489429, "checksum": "440d53374697d019a972fe66e8e44031ae80267a126ecb814ad537ec1fd506db"}}, "download_size": 4998236572, "post_processing_size": null, "dataset_size": 5117833991, "size_in_bytes": 
10116070563}}
diff --git a/datasets/food101/dummy/0.0.0/dummy_data.zip b/datasets/food101/dummy/0.0.0/dummy_data.zip
diff --git a/datasets/food101/food101.py b/datasets/food101/food101.py
@@ -14,9 +14,6 @@
 # limitations under the License.
 """Dataset class for Food-101 dataset."""
 
-import json
-from pathlib import Path
-
 import datasets
 from datasets.tasks import ImageClassification
 
@@ -181,7 +178,6 @@ def _info(self):
             ),
             supervised_keys=("image", "label"),
             homepage=_HOMEPAGE,
-            task_templates=[ImageClassification(image_file_path_column="image", label_column="label", labels=_NAMES)],
             citation=_CITATION,
             license=_LICENSE,
         )

diff --git a/src/datasets/utils/mock_download_manager.py b/src/datasets/utils/mock_download_manager.py
@@ -207,3 +207,9 @@ def delete_extracted_files(self):
 
     def manage_extracted_files(self):
         pass
+
+    def iter_archive(self, path):
+        path = Path(path)
+        for file_path in path.rglob("*"):
+            if file_path.is_file() and not file_path.name.startswith(".") and not file_path.name.startswith("__"):
+                yield file_path.relative_to(path).as_posix(), file_path.open("rb")
diff --git a/src/datasets/utils/streaming_download_manager.py b/src/datasets/utils/streaming_download_manager.py
@@ -365,10 +365,7 @@ def iter_archive(self, urlpath: str):
                     continue
                 if file_path is None:
                     continue
-                if "/" not in file_path and file_path.startswith("__") and file_path.endswith("__"):
-                    # skipping metadata
-                    continue
-                if os.path.basename(file_path).startswith("."):
+                if os.path.basename(file_path).startswith(".") or os.path.basename(file_path).startswith("__"):
                     # skipping hidden files and entries whose name starts with "__"
                     continue
                 file_obj = stream.extractfile(tarinfo)