In [4]:
from fiftyone import ViewField as F
from pathlib import Path

from dataset_utils.download_utils import prepare_dataset
from dataset_utils.fiftyone_load_utils import load_dataset, split_and_export_manifest
from dataset_utils.fiftyone_export_utils import merge_datasets, export_manifest, create_absolute_paths_in_manifest

#### Download the datasets to your local folder

In [5]:
# Datasets to download. Every name must match a dataset folder listed at
# https://github.com/iterative/google-kaggle-competition-data-pipeline/tree/main/datasets
dataset_names = [
    "food_101_small",
    "freiburg_groceries",
]

# Local root folder; each dataset is stored in its own sub-folder below it.
dataset_path = Path("..") / "datasets"

In [7]:
# Prepare every configured dataset in its own sub-folder of dataset_path.
for dataset_name in dataset_names:
    prepare_dataset(output_path=dataset_path / dataset_name, dataset_name=dataset_name)

# To download only certain partitions, pass them as a list, e.g.:
# prepare_dataset(dataset_name=dataset_name, output_path=dataset_path / dataset_name, partitions=[0, 1, 3])

In [8]:
# Load both datasets from their folders under dataset_path.
dataset_food_101 = load_dataset(
    dataset_name=dataset_names[0],
    dataset_path=dataset_path / dataset_names[0],
    rewrite=True,
    print_test=False,
)
dataset_freiburg = load_dataset(
    dataset_name=dataset_names[1],
    dataset_path=dataset_path / dataset_names[1],
    rewrite=True,
    print_test=False,
)

 100% |█████████████| 10100/10100 [5.8s elapsed, 0s remaining, 1.7K samples/s]        
 100% |███████████████| 4947/4947 [2.9s elapsed, 0s remaining, 1.7K samples/s]      


#### Now apply any filtering and changes you need to the individual datasets.
You may get inspired by some of these functions.

In [9]:
# Map both spaghetti classes onto a single "spaghetti" label.
view_food_101 = dataset_food_101.map_labels(
    "ground_truth",
    {
        "spaghetti_carbonara": "spaghetti",
        "spaghetti_bolognese": "spaghetti",
    },
)

# Example of restricting ground_truth to labels contained in a given list.
view_only_pizza = view_food_101.filter_labels(
    "ground_truth",
    F("label").is_in(["pizza"]),
)

# Overwrite ground_truth.label with one fixed value, so that every image in
# this view carries the same single label.
view_freiburg = dataset_freiburg.set_field("ground_truth.label", "packaged_goods")

#### Merge and output datasets

In [10]:
# Combine the prepared views into one dataset.
dataset = merge_datasets(
    [view_food_101, view_freiburg]
)

In [11]:
# Sanity check of the merge: show how many samples each label has.
print(dataset.count_values("ground_truth.label"))


{'caesar_salad': 100, 'hot_and_sour_soup': 100, 'dumplings': 100, 'caprese_salad': 100, 'beignets': 100, 'fish_and_chips': 100, 'beef_carpaccio': 100, 'scallops': 100, 'grilled_salmon': 100, 'fried_rice': 100, 'sushi': 100, 'frozen_yogurt': 100, 'sashimi': 100, 'crab_cakes': 100, 'breakfast_burrito': 100, 'lobster_roll_sandwich': 100, 'hummus': 100, 'creme_brulee': 100, 'steak': 100, 'croque_madame': 100, 'cannoli': 100, 'peking_duck': 100, 'churros': 100, 'ceviche': 100, 'hot_dog': 100, 'packaged_goods': 4947, 'cheese_plate': 100, 'huevos_rancheros': 100, 'seaweed_salad': 100, 'chicken_wings': 100, 'cup_cakes': 100, 'baklava': 100, 'mussels': 100, 'macaroni_and_cheese': 100, 'french_onion_soup': 100, 'chicken_quesadilla': 100, 'ravioli': 100, 'chicken_curry': 100, 'foie_gras': 100, 'filet_mignon': 100, 'risotto': 100, 'pork_chop': 100, 'ice_cream': 100, 'poutine': 100, 'club_sandwich': 100, 'tiramisu': 100, 'clam_chowder': 100, 'grilled_cheese_sandwich': 100, 'baby_back_ribs': 100, 'r

#### Export dataset either with absolute or relative paths

Use relative=False (default) for absolute paths - in case you want to train with this dataset on your local computer

Use relative=True for relative paths - in case you want to share this dataset with someone

In [12]:
# Export the merged dataset's manifest with relative paths.
export_manifest(
    dataset,
    export_dir=Path("../data/merged_dataset/"),
    relative=True,
)

 100% |█████████████| 15047/15047 [6.4s elapsed, 0s remaining, 2.6K samples/s]        


#### Alternatively, you may also split the dataset and then export it

In [13]:
# Split the merged dataset and export the result with absolute paths.
split_and_export_manifest(
    dataset,
    export_dir=Path("../data/merged_dataset/"),
    relative=False,
)

 100% |███████████████| 9028/9028 [3.6s elapsed, 0s remaining, 3.2K samples/s]        
 100% |███████████████| 3010/3010 [1.7s elapsed, 0s remaining, 2.4K samples/s]       
 100% |███████████████| 3009/3009 [1.5s elapsed, 0s remaining, 1.7K samples/s]         


#### Changing relative paths to absolute paths
If you would like to change the relative paths back to absolute paths, you need to specify the absolute paths to the dataset folders.

In [None]:
# Map each dataset name to the absolute path of its local folder.
# The paths are derived from dataset_names / dataset_path (set in the
# configuration cell near the top of the notebook) instead of being
# hard-coded, so this cell does not depend on one machine's checkout location.
dataset_path_dic = {
    dataset_name: str((dataset_path / dataset_name).resolve())
    for dataset_name in dataset_names
}

# Rewrite the paths in the exported manifest using the mapping above.
create_absolute_paths_in_manifest(
    dataset_path_dic=dataset_path_dic,
    input_manifest_path=Path("../data/merged_dataset/manifest.json"),
)