In [1]:
import torchvision.datasets as datasets
import random
import shutil
from pathlib import Path


In [2]:
# Root directory handed to torchvision as `root=`; the archive is downloaded and extracted here.
data_path = Path("/data")

In [3]:
# Arguments common to both splits: where to store the data and whether to
# fetch the archive when it is not already on disk.
food101_kwargs = dict(root=data_path, download=True)

# Training split
train_data = datasets.Food101(split="train", **food101_kwargs)

# Testing split
test_data = datasets.Food101(split="test", **food101_kwargs)


Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to /data/food-101.tar.gz


100%|██████████| 4996278331/4996278331 [05:02<00:00, 16496657.88it/s]


Extracting /data/food-101.tar.gz to /data


In [5]:
# All class labels exposed by the training dataset object.
class_names = train_data.classes

# Classes whose images will be pulled out into the subset.
target_classes = [
    "cup_cakes",
    "red_velvet_cake",
    "chocolate_cake",
]

In [6]:

# Folder holding the extracted images, one sub-folder per class.
data_dir = data_path.joinpath("food-101", "images")

# Fraction of the matching images to keep (passed to get_subset as `amount`).
amount_to_get = 1

def get_subset(image_path=data_dir,
               data_splits=["train", "test"],
               target_classes=target_classes,
               amount=1,
               seed=42):
  """Sample image paths of the target classes from each requested split.

  Args:
    image_path: Directory that every sampled image ID is joined onto.
    data_splits: Split names; for each one the IDs are read from
      ``data_path / "food-101" / "meta" / "<split>.txt"`` (``data_path`` is
      the module-level variable, not derived from ``image_path``).
    target_classes: Class names to keep. A line is kept when the text before
      its first "/" is one of these names.
    amount: Fraction of the matching IDs to sample in each split.
    seed: Seed given to ``random.seed`` so the sample is reproducible.

  Returns:
    A dict mapping each split name to a list of ``Path`` objects, each being
    ``image_path / <image ID>`` with ".jpg" appended.

  Raises:
    ValueError: Raised by ``random.sample`` when ``amount`` yields a sample
      size that is negative or larger than the number of matching IDs.
  """
  # Honour the caller's seed (previously 42 was hard-coded and `seed` ignored).
  # This reseeds the global `random` state, as before.
  random.seed(seed)
  label_splits = {}

  for data_split in data_splits:
    # Read the IDs ("<class>/<image number>") that belong to a target class.
    label_path = data_path / "food-101" / "meta" / f"{data_split}.txt"
    with open(label_path, "r") as f:
      labels = [line.strip("\n") for line in f if line.split("/")[0] in target_classes]

    # Draw a random subset of the matching image IDs.
    number_to_sample = round(amount * len(labels))
    sampled_images = random.sample(labels, k=number_to_sample)

    # Turn each ID into a full path with the .jpg extension.
    image_paths = [Path(str(image_path / sample_image) + ".jpg") for sample_image in sampled_images]
    label_splits[data_split] = image_paths
  return label_splits

In [8]:
# Pick the image paths to keep for every split.
label_splits = get_subset(amount=amount_to_get)

# Destination folder for the copied train and test images.
target_dir_name = f"/data/cakes_{str(int(amount_to_get*100))}_percent"
target_dir = Path(target_dir_name)
target_dir.mkdir(parents=True, exist_ok=True)

# Copy each image to <target_dir>/<split>/<class folder>/<file name>.
for split_name, split_images in label_splits.items():
  for src_image in split_images:
    dest_file = target_dir / split_name / src_image.parent.stem / src_image.name
    # exist_ok makes this a no-op once the class folder has been created.
    dest_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src_image, dest_file)

In [9]:

# Archive the copied cake images. The base name has no extension;
# make_archive adds ".zip" and returns the path of the archive it wrote.
zip_file_name = data_dir / f"cakes_{str(int(amount_to_get*100))}_percent"
shutil.make_archive(zip_file_name, format="zip", root_dir=target_dir)

'/data/food-101/images/cakes_100_percent.zip'