In [1]:
import cv2
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.utils.random as four
from fiftyone import ViewField as F
import json
import numpy as np
from pathlib import Path



### Download datasets to a local folder
Download the datasets you want to test from our data registry to a local folder and unzip them. Pay attention to which revision of the data registry you download from.

You do not need any credentials to download the files.

Data registry link: https://github.com/iterative/google-kaggle-competition-data-pipeline/tree/pipeline_v2

In [8]:
#!dvc get https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/food_101_small/ -o ../datasets/ --rev c3d1044
#!dvc get https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/freiburg_groceries -o ../datasets --rev c8cada7

  0% Downloading |                                  |0/13 [00:00<?,    ?files/s]
![A
  0%|          |datasets/food_101_small/labels.json0.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/food_101_small/labels.j0.00/657k [00:00<?,        ?B/s][A
                                                                                [A
![A
  0%|          |datasets/food_101_small/prepare_dat0.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/food_101_small/prepare0.00/24.6k [00:00<?,        ?B/s][A

  0%|          |datasets/food_101_small/readme.md  0.00/? [00:00<?,        ?B/s][A[A

  0%|          |datasets/food_101_small/readme.md0.00/116 [00:00<?,        ?B/s][A[A
                                                                                [A

                                                                                [A[A
![A
  0%|          |datasets/food_101_small/partition_60.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/food_101_small/partiti0.00/

In [53]:
#!unzip -o '../datasets/food_101_small/partition_*.zip' -d ../datasets/food_101_small/
#!unzip -o ../datasets/freiburg_groceries/freiburg_groceries.zip -d ../datasets/freiburg_groceries/

### Check consistency of labels.json
If you download only a portion of the dataset, labels.json needs to be regenerated so that it only lists the images that are actually present locally.

In [63]:

# Download 'clean version of labels.json file'
#!dvc update https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/food_101_small/labels.json --rev c3d1044
!rm ../datasets/food_101_small/labels.json
!dvc get https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/food_101_small/labels.json -o ../datasets/food_101_small --rev c3d1044

dataset_folder = Path('../datasets/food_101_small')

# Load the freshly downloaded labels file; its 'labels' entry maps image name -> label.
with open(dataset_folder/'labels.json', 'r') as file:
    labels = json.load(file)

# Names (without extension) of the images that are actually present locally.
# `Path.stem` strips only the final suffix, so image names that contain dots are kept
# intact (splitting on the first '.' would truncate them and drop their labels).
pathlist = (dataset_folder/'data').rglob('*.*')
img_list = [path.stem for path in pathlist]
# A set gives constant-time membership checks while filtering the labels below.
img_names = set(img_list)

# Keep only the labels whose image exists on disk, preserving the original order.
labels_consistent = {
    img_name: label
    for img_name, label in labels['labels'].items()
    if img_name in img_names
}

labels['labels'] = labels_consistent

# Overwrite labels.json in place with the filtered labels.
with open(dataset_folder/"labels.json", "w") as outfile:
    json.dump(labels, outfile)

  0% Downloading labels.json|                        |0/1 [00:00<?,    ?files/s]
![A
  0%|          |datasets/food_101_small/labels.json0.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/food_101_small/labels.j0.00/657k [00:00<?,        ?B/s][A
[0m                                                                            [A

### Load datasets
Load datasets that you want to work with into Voxel51.

In [64]:
# In case you need to delete the dataset because it was not created properly.
# The existence check lets this cell run on a fresh environment too, where
# loading a dataset that does not exist would raise an error.
if fo.dataset_exists('food_101_small'):
    dataset = fo.load_dataset('food_101_small')
    dataset.delete()

In [65]:
def load_dataset(dataset_name):
    """Return the FiftyOne dataset called ``dataset_name``.

    If a dataset with that name is already registered in FiftyOne it is loaded
    as-is; otherwise it is created by importing ``../datasets/<dataset_name>/``
    as a ``FiftyOneImageClassificationDataset``.
    """
    # Reuse the already registered dataset instead of importing it again.
    if fo.dataset_exists(dataset_name):
        return fo.load_dataset(dataset_name)

    source_dir = Path(f"../datasets/{dataset_name}/")
    return fo.Dataset.from_dir(
        dataset_dir=source_dir,
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        name=dataset_name,
    )

# Load both source datasets (each one is imported from disk the first time only).
dataset_food_101_small = load_dataset(dataset_name='food_101_small')
dataset_freiburg_groceries = load_dataset(dataset_name='freiburg_groceries')

 100% |█████████████| 10100/10100 [6.9s elapsed, 0s remaining, 1.4K samples/s]      


#### Quick check that everything was loaded properly

In [66]:
# For each loaded dataset: number of samples, then number of samples per label.
for loaded_dataset in (dataset_food_101_small, dataset_freiburg_groceries):
    print(loaded_dataset.count())
    print(loaded_dataset.count_values('ground_truth.label'))

    # Blank line between the two datasets, matching the original output layout.
    if loaded_dataset is dataset_food_101_small:
        pass

10100
{'prime_rib': 100, 'hot_and_sour_soup': 100, 'grilled_cheese_sandwich': 100, 'crab_cakes': 100, 'gyoza': 100, 'french_toast': 100, 'fish_and_chips': 100, 'chicken_quesadilla': 100, 'lasagna': 100, 'chicken_curry': 100, 'foie_gras': 100, 'apple_pie': 100, 'carrot_cake': 100, 'macarons': 100, 'french_fries': 100, 'pizza': 100, 'frozen_yogurt': 100, 'donuts': 100, 'dumplings': 100, 'bibimbap': 100, 'lobster_bisque': 100, 'beef_tartare': 100, 'cheese_plate': 100, 'seaweed_salad': 100, 'cup_cakes': 100, 'eggs_benedict': 100, 'steak': 100, 'creme_brulee': 100, 'guacamole': 100, 'grilled_salmon': 100, 'baby_back_ribs': 100, 'tuna_tartare': 100, 'gnocchi': 100, 'onion_rings': 100, 'pulled_pork_sandwich': 100, 'mussels': 100, 'huevos_rancheros': 100, 'chicken_wings': 100, 'bread_pudding': 100, 'oysters': 100, 'baklava': 100, 'pad_thai': 100, 'paella': 100, 'deviled_eggs': 100, 'clam_chowder': 100, 'greek_salad': 100, 'hot_dog': 100, 'tacos': 100, 'chocolate_mousse': 100, 'tiramisu': 100, 

### Filtering and changing of labels

In [68]:
# Collapse both spaghetti variants into one "spaghetti" label.
spaghetti_label_map = {
    "spaghetti_carbonara": "spaghetti",
    "spaghetti_bolognese": "spaghetti",
}
view_food_101 = dataset_food_101_small.map_labels("ground_truth", spaghetti_label_map)

# View restricted to ground-truth labels that are in the list ['pizza'].
is_pizza = F("label").is_in(['pizza'])
view_only_pizza = view_food_101.filter_labels("ground_truth", is_pizza)

# set_field replaces the original value of ground_truth.label with the new label,
# so every image in this view carries the same single label.
view_freiburg = dataset_freiburg_groceries.set_field("ground_truth.label", "packaged_goods")

### Merging of datasets

In [69]:
# Materialise the relabelled food view as a new dataset, then merge the
# relabelled grocery samples into it. Samples are matched on their file path,
# which is the default merge key.
dataset = view_food_101.clone()
dataset.merge_samples(samples=view_freiburg, key_field="filepath")

In [70]:
# Per-label sample counts of the merged dataset.
label_counts = dataset.count_values('ground_truth.label')
print(label_counts)

{'chocolate_mousse': 100, 'tiramisu': 100, 'cheesecake': 100, 'fried_rice': 100, 'sushi': 100, 'poutine': 100, 'spring_rolls': 100, 'ceviche': 100, 'ravioli': 100, 'packaged_goods': 4947, 'club_sandwich': 100, 'scallops': 100, 'ramen': 100, 'sashimi': 100, 'beef_carpaccio': 100, 'samosa': 100, 'hummus': 100, 'breakfast_burrito': 100, 'spaghetti': 200, 'takoyaki': 100, 'cannoli': 100, 'garlic_bread': 100, 'edamame': 100, 'chocolate_cake': 100, 'caprese_salad': 100, 'lobster_roll_sandwich': 100, 'pancakes': 100, 'croque_madame': 100, 'caesar_salad': 100, 'hamburger': 100, 'churros': 100, 'peking_duck': 100, 'escargots': 100, 'miso_soup': 100, 'waffles': 100, 'omelette': 100, 'french_onion_soup': 100, 'filet_mignon': 100, 'bruschetta': 100, 'prime_rib': 100, 'pho': 100, 'beet_salad': 100, 'strawberry_shortcake': 100, 'red_velvet_cake': 100, 'beignets': 100, 'foie_gras': 100, 'pizza': 100, 'carrot_cake': 100, 'macarons': 100, 'french_fries': 100, 'hot_and_sour_soup': 100, 'grilled_cheese_s

### In case you would like to manually inspect the dataset

In [None]:
# Open the FiftyOne App on the merged dataset for manual inspection.
session = fo.launch_app(dataset)

In [None]:
# To save the changes you made in the UI, call save() on the dataset.
# NOTE(review): `dataset` was created with clone() earlier in this notebook; whether it
# outlives the current session may additionally depend on its persistence setting - confirm.
dataset.save()

### Split dataset
Note: We do not want to split the dataset here. The split needs to move into the data pipeline to make sure that the ML and search-index pipelines work with different sets of images.

In [71]:

# Note: This produces splits that are not overlapping
# Note: the resulting splits do not overlap.
split_fractions = [0.6, 0.2, 0.2]  # train / val / test
split_seed = 51  # fixed seed so the split is reproducible
view_train, view_val, view_test = four.random_split(dataset, split_fractions, seed=split_seed)

### Export dataset
Note: Beware that there are absolute paths in manifest.json. Unfortunately, relative paths are buggy and do not work well in fiftyone.

In [81]:
output_dir = Path("../data/voxel51/")

# Export every split into its own sub-folder of `output_dir`, in the order train, val, test.
# export_media='manifest' writes a manifest.json that refers to the images
# rather than copying them.
splits_to_export = (
    ("train", view_train),
    ("val", view_val),
    ("test", view_test),
)

for split_name, split_view in splits_to_export:
    split_view.export(
        export_dir=str(output_dir/split_name),
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        label_field='ground_truth',
        data_path='manifest.json',
        labels_path='labels.json',
        export_media='manifest',
        overwrite=True,
    )

 100% |███████████████| 9028/9028 [3.6s elapsed, 0s remaining, 3.0K samples/s]        
 100% |███████████████| 3010/3010 [1.2s elapsed, 0s remaining, 2.6K samples/s]         
 100% |███████████████| 3009/3009 [1.1s elapsed, 0s remaining, 2.7K samples/s]         


### Code that changes absolute paths in manifest.json to relative paths
This is useful in case you would like to share this dataset with someone else

In [82]:
# Root folder holding the exported train/val/test splits.
output_dir = Path("..") / "data" / "voxel51"

In [103]:
import json
from os.path import relpath
from pathlib import Path

def create_relative_paths(manifest_path: Path):
    """Write ``manifest_relative.json`` next to the given manifest file.

    The manifest at ``manifest_path`` maps image names to image paths. Each path
    is reduced to its last three components, joined with "/", i.e.
    ``<dataset_name>/data/<img_name_with_extension>``. The input manifest itself
    is left untouched.

    Args:
        manifest_path: Path to the ``manifest.json`` file to convert.
    """
    target_dir = manifest_path.absolute().parent

    with open(manifest_path) as source_file:
        absolute_entries = json.load(source_file)

    relative_entries = {}
    for img_name, abs_path in absolute_entries.items():
        # Only the trailing <dataset_name>/data/<file> part of the path is kept.
        trailing_parts = Path(abs_path).parts[-3:]
        relative_entries[img_name] = "/".join(trailing_parts)

    with open(target_dir/'manifest_relative.json', "w") as target_file:
        json.dump(relative_entries, target_file)


# Produce a manifest_relative.json for every exported split.
for split_name in ("train", "val", "test"):
    create_relative_paths(manifest_path=output_dir/split_name/"manifest.json")

### Code that changes relative paths in manifest_relative.json to absolute paths
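This is useful in case you would like to share this dataset with someone else: the recipient can run this step to rebuild manifest.json with the absolute paths of their local dataset folders.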

In [132]:
# The input should be a dictionary with the format: {"<dataset_name>": "<absolute_path_to_the_folder>"}
# For example: {"food_101_small": "/workspaces/google-kaggle-competition/datasets/food_101_small"}

def create_absolute_paths(manifest_path: Path, dataset_path_dic):
    """Rebuild ``manifest.json`` with absolute paths from ``manifest_relative.json``.

    ``manifest_relative.json`` is read from the folder that contains
    ``manifest_path`` and ``manifest.json`` is written to that same folder, so
    either file of a split folder may be passed. Each relative entry has the form
    ``<dataset_name>/data/<img_name_with_extension>``; the leading dataset name
    is replaced by the folder registered for it in ``dataset_path_dic``.

    Args:
        manifest_path: Path to ``manifest.json`` (or ``manifest_relative.json``)
            inside the split folder to process.
        dataset_path_dic: Mapping ``{dataset_name: absolute_path_to_the_folder}``.

    Raises:
        KeyError: If a relative path refers to a dataset name that is missing
            from ``dataset_path_dic``.
        ValueError: If a relative path is empty.
    """
    # Work in the folder of the manifest that was passed in, so that every split
    # is processed in its own folder rather than a fixed one.
    manifest_path_parent = Path(manifest_path).absolute().parent

    with open(manifest_path_parent/'manifest_relative.json') as json_file:
        manifest_relative = json.load(json_file)

    manifest_absolute = {}
    for img_name, rel_path in manifest_relative.items():
        parts = Path(rel_path).parts
        if not parts:
            raise ValueError(f"Empty relative path for image {img_name!r}")

        dataset_name, remainder = parts[0], parts[1:]
        if dataset_name not in dataset_path_dic:
            raise KeyError(
                f"No folder registered for dataset {dataset_name!r} (image {img_name!r})"
            )

        # Everything after the dataset name is appended to the dataset's folder,
        # so the result does not depend on the exact number of path components.
        abs_path = Path(dataset_path_dic[dataset_name]).joinpath(*remainder)
        manifest_absolute[img_name] = str(abs_path)

    with open(manifest_path_parent/'manifest.json', "w") as outfile:
        json.dump(manifest_absolute, outfile)



# Absolute location of every dataset folder on this machine.
# NOTE: these locations are machine-specific; adjust them to where the datasets live locally.
dataset_path_dic = {
    "food_101_small": "/workspaces/google-kaggle-competition/datasets/food_101_small",
    "freiburg_groceries": "/workspaces/google-kaggle-competition/datasets/freiburg_groceries",
}

# Request the absolute-path manifest for each split, in the order train, val, test.
for split_name in ("train", "val", "test"):
    create_absolute_paths(
        manifest_path=output_dir/split_name/"manifest.json",
        dataset_path_dic=dataset_path_dic,
    )

### How to import it to PyTorch
Once you have generated the labels.json and manifest.json files, you can load them into PyTorch with a custom data loader as follows:

In [None]:
# TODO: add readme that describes how to download data for ML training

In [None]:
# NOTE(review): FiftyOneTorchDataset and ToTensor are not defined or imported in the
# visible part of this notebook; they have to be made available before running this cell.
# NOTE(review): this absolute root differs from the relative "../data/voxel51/" export
# folder used earlier - confirm it matches the environment in which training runs.
torch_data_root = Path("/data/voxel51")
train_dataset = FiftyOneTorchDataset(torch_data_root/"train", transform=ToTensor())
val_dataset = FiftyOneTorchDataset(torch_data_root/"val", transform=ToTensor())