Add iter_archive #3066

Merged · 9 commits · Oct 18, 2021
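This PR adds an `iter_archive` method to the download managers so that dataset scripts can read files directly out of a TAR archive, in both regular and streaming mode, instead of extracting the whole archive first. The diff below converts the `food101` script to the new API and implements `iter_archive` for the regular, mock, and streaming download managers. As a minimal sketch of the resulting pattern (the builder name, URL, and features are illustrative, not part of this PR):

```python
# Minimal sketch of the pattern enabled by `dl_manager.iter_archive`.
# The builder name, URL, and features are illustrative, not part of this PR.
import datasets

_ARCHIVE_URL = "https://example.com/images.tar.gz"  # hypothetical archive URL


class MyImageDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features(
                {"filename": datasets.Value("string"), "data": datasets.Value("binary")}
            )
        )

    def _split_generators(self, dl_manager):
        # download() only fetches the archive; it is never extracted to disk
        archive_path = dl_manager.download(_ARCHIVE_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": dl_manager.iter_archive(archive_path)},
            )
        ]

    def _generate_examples(self, files):
        # iter_archive yields (path_inside_archive, file_obj) pairs in archive order
        for path, file_obj in files:
            yield path, {"filename": path, "data": file_obj.read()}
```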
11 changes: 10 additions & 1 deletion datasets/food101/README.md
@@ -259,7 +259,16 @@ The data instances have the following fields:

### Licensing Information

[More Information Needed]
LICENSE AGREEMENT
=================
- The Food-101 data set consists of images from Foodspotting [1] which are not
property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond
scientific fair use must be negociated with the respective picture owners
according to the Foodspotting terms of use [2].

[1] http://www.foodspotting.com/
[2] http://www.foodspotting.com/terms/


### Citation Information

2 changes: 1 addition & 1 deletion datasets/food101/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n title = {Food-101 -- Mining Discriminative Components with Random Forests},\n author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n booktitle = {European Conference on Computer Vision},\n year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "", "features": {"image": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": [{"task": "image-classification", "image_file_path_column": "image", "label_column": "label", "labels": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheese_plate", "cheesecake", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", 
"ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"]}], "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13210094, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 4403191, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": "d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}}, "download_size": 4996278331, "post_processing_size": null, "dataset_size": 17613285, "size_in_bytes": 5013891616}}
{"default": {"description": "This dataset consists of 101 food categories, with 101'000 images. For each class, 250 manually reviewed test images are provided as well as 750 training images. On purpose, the training images were not cleaned, and thus still contain some amount of noise. This comes mostly in the form of intense colors and sometimes wrong labels. All images were rescaled to have a maximum side length of 512 pixels.", "citation": " @inproceedings{bossard14,\n title = {Food-101 -- Mining Discriminative Components with Random Forests},\n author = {Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc},\n booktitle = {European Conference on Computer Vision},\n year = {2014}\n}\n", "homepage": "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/", "license": "LICENSE AGREEMENT\n=================\n - The Food-101 data set consists of images from Foodspotting [1] which are not\n property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond\n scientific fair use must be negociated with the respective picture owners\n according to the Foodspotting terms of use [2].\n\n[1] http://www.foodspotting.com/\n[2] http://www.foodspotting.com/terms/\n", "features": {"image": {"filename": {"dtype": "string", "id": null, "_type": "Value"}, "data": {"dtype": "binary", "id": null, "_type": "Value"}}, "label": {"num_classes": 101, "names": ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad", "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta", "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla", "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros", "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "deviled_eggs", "donuts", "dumplings", "edamame", "eggs_benedict", "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries", "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt", "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon", "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus", "ice_cream", "lasagna", "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup", "mussels", "nachos", "omelette", "onion_rings", "oysters", "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi", "tacos", "takoyaki", "tiramisu", "tuna_tartare", "waffles"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image", "output": "label"}, "task_templates": null, "builder_name": "food101", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3843765322, "num_examples": 75750, "dataset_name": "food101"}, "validation": {"name": "validation", "num_bytes": 1275549954, "num_examples": 25250, "dataset_name": "food101"}}, "download_checksums": {"http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz": {"num_bytes": 4996278331, "checksum": 
"d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt": {"num_bytes": 1468812, "checksum": "2920f7d55473974492b41a01241ccfd71df1b74d29d27b617337f840f58f77ab"}, "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt": {"num_bytes": 489429, "checksum": "440d53374697d019a972fe66e8e44031ae80267a126ecb814ad537ec1fd506db"}}, "download_size": 4998236572, "post_processing_size": null, "dataset_size": 5119315276, "size_in_bytes": 10117551848}}
Binary file modified datasets/food101/dummy/0.0.0/dummy_data.zip
Binary file not shown.
60 changes: 42 additions & 18 deletions datasets/food101/food101.py
@@ -14,15 +14,16 @@
# limitations under the License.
"""Dataset class for Food-101 dataset."""

import json
from pathlib import Path

import datasets
from datasets.tasks import ImageClassification


_BASE_URL = "http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz"

_METADATA_URLS = {
"train": "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/train.txt",
"test": "https://s3.amazonaws.com/datasets.huggingface.co/food101/meta/test.txt",
}

_HOMEPAGE = "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/"

_DESCRIPTION = (
@@ -43,6 +44,18 @@
}
"""

_LICENSE = """\
LICENSE AGREEMENT
=================
- The Food-101 data set consists of images from Foodspotting [1] which are not
property of the Federal Institute of Technology Zurich (ETHZ). Any use beyond
scientific fair use must be negociated with the respective picture owners
according to the Foodspotting terms of use [2].

[1] http://www.foodspotting.com/
[2] http://www.foodspotting.com/terms/
"""

_NAMES = [
"apple_pie",
"baby_back_ribs",
@@ -147,6 +160,8 @@
"waffles",
]

_IMAGES_DIR = "food-101/images/"


class Food101(datasets.GeneratorBasedBuilder):
"""Food-101 Images dataset."""
@@ -156,36 +171,45 @@ def _info(self):
description=_DESCRIPTION,
features=datasets.Features(
{
"image": datasets.Value("string"),
"image": {"filename": datasets.Value("string"), "data": datasets.Value("binary")},
"label": datasets.features.ClassLabel(names=_NAMES),
}
),
supervised_keys=("image", "label"),
homepage=_HOMEPAGE,
task_templates=[ImageClassification(image_file_path_column="image", label_column="label", labels=_NAMES)],
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
dl_path = Path(dl_manager.download_and_extract(_BASE_URL))
meta_path = dl_path / "food-101" / "meta"
image_dir_path = dl_path / "food-101" / "images"
archive_path = dl_manager.download(_BASE_URL)
split_metadata_paths = dl_manager.download(_METADATA_URLS)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"json_file_path": meta_path / "train.json", "image_dir_path": image_dir_path},
gen_kwargs={
"images": dl_manager.iter_archive(archive_path),
"metadata_path": split_metadata_paths["train"],
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"json_file_path": meta_path / "test.json", "image_dir_path": image_dir_path},
gen_kwargs={
"images": dl_manager.iter_archive(archive_path),
"metadata_path": split_metadata_paths["test"],
},
),
]

def _generate_examples(self, json_file_path, image_dir_path):
def _generate_examples(self, images, metadata_path):
"""Generate images and labels for splits."""
data = json.loads(json_file_path.read_text())
for label, images in data.items():
for image_name in images:
image = image_dir_path / f"{image_name}.jpg"
features = {"image": str(image), "label": label}
yield image_name, features
with open(metadata_path, encoding="utf-8") as f:
files_to_keep = set(f.read().split("\n"))
for file_path, file_obj in images:
if file_path.startswith(_IMAGES_DIR):
if file_path[len(_IMAGES_DIR) : -len(".jpg")] in files_to_keep:
label = file_path.split("/")[2]
yield file_path, {
"image": {"filename": file_path.split("/")[-1], "data": file_obj.read()},
"label": label,
}
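Because the rewritten script reads the images directly from the TAR archive via `iter_archive` and stores the raw bytes, the dataset can also be consumed in streaming mode. A hedged usage sketch, assuming this revision of the `food101` script is the one being loaded:

```python
from datasets import load_dataset

# Streaming mode reads the TAR archive on the fly; nothing is extracted to disk.
ds = load_dataset("food101", split="train", streaming=True)
example = next(iter(ds))
print(example["image"]["filename"])   # image file name inside the archive
print(len(example["image"]["data"]))  # size of the raw JPEG bytes
print(example["label"])               # class label
```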
2 changes: 1 addition & 1 deletion src/datasets/utils/download_manager.py
@@ -239,7 +239,7 @@ def iter_archive(self, path):
relative_file_path = os.path.join(relative_dir_path, name)
absolute_file_path = os.path.join(root, name)
with open(absolute_file_path, "rb") as file_obj:
yield (relative_file_path, file_obj)
yield (relative_file_path.replace(os.sep, "/"), file_obj)

def extract(self, path_or_paths, num_proc=None):
"""Extract given path(s).
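The one-line change to `download_manager.py` above normalizes the yielded archive paths to forward slashes, so checks like `file_path.startswith(_IMAGES_DIR)` in the food101 script behave the same on Windows as on Linux. A small illustration, using `ntpath` to simulate Windows-style joining on any platform (the file name is illustrative):

```python
import ntpath  # Windows path semantics, available on every platform

relative_file_path = ntpath.join("food-101", "images", "apple_pie", "1005649.jpg")
print(relative_file_path)  # food-101\images\apple_pie\1005649.jpg
print(relative_file_path.startswith("food-101/images/"))  # False without normalization
print(relative_file_path.replace("\\", "/").startswith("food-101/images/"))  # True
```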
6 changes: 6 additions & 0 deletions src/datasets/utils/mock_download_manager.py
@@ -207,3 +207,9 @@ def delete_extracted_files(self):

def manage_extracted_files(self):
pass

def iter_archive(self, path):
path = Path(path)
for file_path in path.rglob("*"):
if file_path.is_file() and not file_path.name.startswith(".") and not file_path.name.startswith("__"):
yield file_path.relative_to(path).as_posix(), file_path.open("rb")
49 changes: 46 additions & 3 deletions src/datasets/utils/streaming_download_manager.py
@@ -1,6 +1,7 @@
import glob
import os
import re
import tarfile
import time
from pathlib import Path, PurePosixPath
from typing import Optional, Tuple, Union
@@ -19,7 +20,21 @@

logger = get_logger(__name__)

BASE_KNOWN_EXTENSIONS = ["txt", "csv", "json", "jsonl", "tsv", "conll", "conllu", "parquet", "pkl", "pickle", "xml"]
BASE_KNOWN_EXTENSIONS = [
"txt",
"csv",
"json",
"jsonl",
"tsv",
"conll",
"conllu",
"orig",
"parquet",
"pkl",
"pickle",
"rel",
"xml",
]
COMPRESSION_EXTENSION_TO_PROTOCOL = {
# single file compression
**{fs_class.extension.lstrip("."): fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS},
@@ -149,7 +164,9 @@ def _get_extraction_protocol(urlpath: str) -> Optional[str]:
if extension in BASE_KNOWN_EXTENSIONS:
return None
elif path.endswith(".tar.gz") or path.endswith(".tgz"):
pass
raise NotImplementedError(
f"Extraction protocol for TAR archives like '{urlpath}' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead."
)
elif extension in COMPRESSION_EXTENSION_TO_PROTOCOL:
return COMPRESSION_EXTENSION_TO_PROTOCOL[extension]
raise NotImplementedError(f"Extraction protocol '{extension}' for file at '{urlpath}' is not implemented yet")
@@ -339,11 +356,37 @@ def _extract(self, urlpath: str) -> str:
inner_file = inner_file[: inner_file.rindex(".")]
# check for tar.gz, tar.bz2 etc.
if inner_file.endswith(".tar"):
return f"tar://::{urlpath}"
return f"tar://::{protocol}://{inner_file}::{urlpath}"
else:
return f"{protocol}://{inner_file}::{urlpath}"
else:
return f"{protocol}://::{urlpath}"

def download_and_extract(self, url_or_urls):
return self.extract(self.download(url_or_urls))

def iter_archive(self, urlpath: str):
"""Returns iterator over files within archive.

Args:
path: path to archive.

Returns:
Generator yielding tuple (path_within_archive, file_obj).
File-Obj are opened in byte mode (io.BufferedReader)
"""
with xopen(urlpath, "rb", use_auth_token=self._download_config.use_auth_token) as f:
stream = tarfile.open(fileobj=f, mode="r|*")
for tarinfo in stream:
file_path = tarinfo.name
if not tarinfo.isreg():
continue
if file_path is None:
continue
if os.path.basename(file_path).startswith(".") or os.path.basename(file_path).startswith("__"):
# skipping hidden files
continue
file_obj = stream.extractfile(tarinfo)
yield (file_path, file_obj)
stream.members = []
del stream
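The streaming implementation above wraps the remote file in `tarfile`'s stream mode (`"r|*"`), which reads members strictly sequentially and therefore never needs to seek in the HTTP stream; resetting `stream.members` after each file keeps the cached `TarInfo` list from growing with the archive. The same technique applies to a local archive, as in this self-contained sketch (the archive path in the usage comment is illustrative):

```python
import os
import tarfile


def iter_local_archive(archive_path):
    """Yield (path_inside_archive, bytes) for regular, non-hidden members of a TAR archive."""
    with open(archive_path, "rb") as f:
        stream = tarfile.open(fileobj=f, mode="r|*")  # sequential read, no random access
        for tarinfo in stream:
            if not tarinfo.isreg():
                continue
            name = os.path.basename(tarinfo.name)
            if name.startswith(".") or name.startswith("__"):
                continue  # skip hidden/system files, mirroring the implementation above
            file_obj = stream.extractfile(tarinfo)
            yield tarinfo.name, file_obj.read()
            stream.members = []  # drop cached TarInfo objects to keep memory flat
        del stream


# Usage (illustrative path):
# for path, data in iter_local_archive("food-101.tar.gz"):
#     print(path, len(data))
```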