In [27]:
import cv2
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.utils.random as four
from fiftyone import ViewField as F
import numpy as np
from pathlib import Path



### Download dataset to local
Download the datasets you want to test from our Data registry to a local folder and unzip them. Pay attention to which revision of the Data registry you download from: the commands below pin one with `--rev`.

You do not need any credentials to download the files.

Data registry link: https://github.com/iterative/google-kaggle-competition-data-pipeline/tree/pipeline_v2

In [13]:
# Fetch both datasets from the data registry into ../datasets/, pinned to revision a44271f.
!dvc get https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/food_101_small -o ../datasets/food_101_small --rev a44271f
!dvc get https://github.com/iterative/google-kaggle-competition-data-pipeline datasets/freiburg_groceries -o ../datasets/freiburg_groceries --rev a44271f

  0% Downloading freiburg_groceries|                 |0/3 [00:00<?,    ?files/s]
![A
  0%|          |datasets/freiburg_groceries/labels.0.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/freiburg_groceries/labe0.00/104k [00:00<?,        ?B/s][A

![A[A

  0%|          |datasets/freiburg_groceries/readme.0.00/? [00:00<?,        ?B/s][A[A
                                                                                [A

  0%|          |datasets/freiburg_groceries/read0.00/70.0 [00:00<?,        ?B/s][A[A

                                                                                [A[A
![A
  0%|          |datasets/freiburg_groceries/freibur0.00/? [00:00<?,        ?B/s][A
  0%|          |datasets/freiburg_groceries/frei0.00/510M [00:00<?,        ?B/s][A
  0%|          |datasets/freiburg_groceries67.6k/510M [00:00<24:23,     365kB/s][A
  0%|          |datasets/freiburg_groceries/272k/510M [00:00<10:58,     811kB/s][A
  0%|          |datasets/freiburg_groceries1.08M

In [14]:
# Extract each downloaded archive into its own dataset folder; -o overwrites existing files without prompting.
!unzip -o ../datasets/food_101_small/food_101_small.zip -d ../datasets/food_101_small/
!unzip -o ../datasets/freiburg_groceries/freiburg_groceries.zip -d ../datasets/freiburg_groceries/

Archive:  ../datasets/freiburg_groceries/freiburg_groceries.zip
   creating: ../datasets/freiburg_groceries/data/
  inflating: ../datasets/freiburg_groceries/data/WATER0176.png  
  inflating: ../datasets/freiburg_groceries/data/SPICES0119.png  
  inflating: ../datasets/freiburg_groceries/data/JUICE0150.png  
  inflating: ../datasets/freiburg_groceries/data/HONEY0087.png  
  inflating: ../datasets/freiburg_groceries/data/CHOCOLATE0156.png  
  inflating: ../datasets/freiburg_groceries/data/SPICES0198.png  
  inflating: ../datasets/freiburg_groceries/data/JAM0145.png  
  inflating: ../datasets/freiburg_groceries/data/SODA0068.png  
  inflating: ../datasets/freiburg_groceries/data/TOMATO_SAUCE0148.png  
  inflating: ../datasets/freiburg_groceries/data/CEREAL0034.png  
  inflating: ../datasets/freiburg_groceries/data/HONEY0139.png  
  inflating: ../datasets/freiburg_groceries/data/FLOUR0056.png  
  inflating: ../datasets/freiburg_groceries/data/HONEY0134.png  
  inflating: ../datasets/freib

### Load datasets
Load datasets that you want to work with into Voxel51.

In [32]:
# In case you need to delete the dataset because it was not created properly.
# Guarded with dataset_exists() so the cell is a no-op (instead of raising) when the
# dataset is not present, e.g. on a fresh database or when the cell is run twice.
if fo.dataset_exists('freiburg_groceries'):
    dataset = fo.load_dataset('freiburg_groceries')
    dataset.delete()

In [33]:
def load_dataset(dataset_name):
    """Return the FiftyOne dataset called ``dataset_name``, importing it if needed.

    When FiftyOne already knows a dataset with that name it is loaded as-is.
    Otherwise the folder ``../datasets/<dataset_name>/`` is imported as a
    ``FiftyOneImageClassificationDataset`` and registered under the same name.
    """
    if fo.dataset_exists(dataset_name):
        return fo.load_dataset(dataset_name)

    # First use: build the dataset from the files on disk.
    return fo.Dataset.from_dir(
        dataset_dir=Path(f"../datasets/{dataset_name}/"),
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        name=dataset_name,
    )

# Load (or import on first run) both source datasets; each name is also its folder under ../datasets/.
dataset_food_101_small = load_dataset('food_101_small')
dataset_freiburg_groceries = load_dataset('freiburg_groceries')

 100% |███████████████| 4947/4947 [3.2s elapsed, 0s remaining, 1.8K samples/s]      


#### Quick check that everything was loaded properly

In [34]:
# Sanity check: for each loaded dataset print the sample count, then the count per label.
for loaded in (dataset_food_101_small, dataset_freiburg_groceries):
    print(loaded.count())
    print(loaded.count_values('ground_truth.label'))

10100
{'bibimbap': 100, 'cheesecake': 100, 'beet_salad': 100, 'spaghetti_carbonara': 100, 'fried_calamari': 100, 'falafel': 100, 'macaroni_and_cheese': 100, 'clam_chowder': 100, 'nachos': 100, 'hot_dog': 100, 'ramen': 100, 'scallops': 100, 'beignets': 100, 'filet_mignon': 100, 'beef_carpaccio': 100, 'takoyaki': 100, 'risotto': 100, 'ice_cream': 100, 'deviled_eggs': 100, 'french_onion_soup': 100, 'greek_salad': 100, 'club_sandwich': 100, 'garlic_bread': 100, 'ravioli': 100, 'bread_pudding': 100, 'croque_madame': 100, 'baklava': 100, 'sushi': 100, 'chicken_wings': 100, 'grilled_salmon': 100, 'paella': 100, 'fried_rice': 100, 'dumplings': 100, 'huevos_rancheros': 100, 'caprese_salad': 100, 'hummus': 100, 'creme_brulee': 100, 'breakfast_burrito': 100, 'fish_and_chips': 100, 'spring_rolls': 100, 'chicken_quesadilla': 100, 'ceviche': 100, 'shrimp_and_grits': 100, 'pork_chop': 100, 'crab_cakes': 100, 'frozen_yogurt': 100, 'cannoli': 100, 'lobster_roll_sandwich': 100, 'chicken_curry': 100, 'sa

### Filtering and changing of labels

In [23]:
# Collapse both spaghetti variants into a single "spaghetti" class.
view_food_101 = dataset_food_101_small.map_labels(
    "ground_truth",
    {"spaghetti_carbonara": "spaghetti", "spaghetti_bolognese": "spaghetti"},
)

# Restrict the remapped food view to labels equal to "pizza".
view_only_pizza = view_food_101.filter_labels(
    "ground_truth",
    F("label").is_in(['pizza']),
)

# Overwrite every grocery label with the single class "packaged_goods".
view_freiburg = dataset_freiburg_groceries.set_field(
    "ground_truth.label",
    "packaged_goods",
)

### Merging of datasets

In [25]:
# Start from a clone of the remapped food view, then merge the relabelled grocery samples into it.
# NOTE(review): presumably clone() yields an independent dataset so dataset_food_101_small is not modified - confirm.
dataset = view_food_101.clone()
dataset.merge_samples(view_freiburg)

### In case you would like to manually inspect the dataset

In [None]:
# Open the FiftyOne App on the merged dataset for manual inspection; keep the returned session handle.
session = fo.launch_app(dataset=dataset)

### Split dataset
Note: We do not want to split the dataset here. The split needs to move into the data pipeline to make sure that the ML and search-index pipelines work with different sets of images.

In [9]:

# Note: This produces splits that are not overlapping.
# A fixed seed makes the 60/20/20 partition reproducible across kernel restarts, so
# re-running the notebook does not move samples between train/val/test.
view_train, view_val, view_test = four.random_split(dataset, [0.6, 0.2, 0.2], seed=51)

### Export dataset
Note: Beware that there are absolute paths in manifest.json. Unfortunately, relative paths are buggy and do not work well in fiftyone.

In [10]:
output_dir = Path("../data/voxel51/")

# Export every split with identical settings; a single call inside a loop keeps the
# options from drifting apart between train/val/test.
# Each split lands in its own sub-folder with a labels.json and a manifest.json
# (export_media='manifest' records media locations in the manifest file).
for split_name, split_view in (("train", view_train), ("val", view_val), ("test", view_test)):
    split_view.export(
        export_dir=output_dir/split_name,
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        label_field='ground_truth',
        data_path='manifest.json',
        labels_path='labels.json',
        export_media='manifest',
    )

Directory '../data/voxel51/train' already exists; export will be merged with existing files
 100% |█████████████| 92770/92770 [31.7s elapsed, 0s remaining, 3.1K samples/s]      
Directory '../data/voxel51/val' already exists; export will be merged with existing files
 100% |█████████████| 26505/26505 [9.3s elapsed, 0s remaining, 3.0K samples/s]        
Directory '../data/voxel51/test' already exists; export will be merged with existing files
 100% |█████████████| 13253/13253 [4.6s elapsed, 0s remaining, 3.0K samples/s]        


### Code that changes absolute paths in manifest.json to relative paths
This is useful in case you would like to share this dataset with someone else

In [3]:
# Root folder of the exported splits (same value as in the export step); presumably repeated so this section can run on its own.
output_dir = Path("../data/voxel51/")

In [5]:
import json
from os.path import relpath
from pathlib import Path

def create_relative_paths(manifest_path: Path):
    """Write a copy of a manifest whose media paths are relative to the manifest's folder.

    Reads the JSON mapping stored at ``manifest_path`` (image name -> path),
    rewrites every path relative to the directory that contains the manifest,
    and saves the result as ``manifest_relative.json`` in that same directory.
    The original manifest file is not modified.

    Args:
        manifest_path: Location of the ``manifest.json`` file to convert.
    """
    manifest_dir = manifest_path.absolute().parent

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(manifest_path, encoding="utf-8") as json_file:
        manifest = json.load(json_file)

    manifest_relative = {
        img_name: relpath(Path(abs_path), manifest_dir)
        for img_name, abs_path in manifest.items()
    }

    with open(manifest_dir/'manifest_relative.json', "w", encoding="utf-8") as outfile:
        json.dump(manifest_relative, outfile)


# Produce manifest_relative.json next to the manifest of every exported split.
for split in ("train", "val", "test"):
    create_relative_paths(manifest_path=output_dir/split/"manifest.json")

### How to import it to PyTorch
Once you generate the labels.json and manifest.json files, you can load them into PyTorch with a custom data loader as follows:

In [None]:
# Build train/val datasets from the exported folders; every image is resized to 128x128 and converted to a tensor.
# NOTE(review): FiftyOneTorchDataset, Compose, Resize and ToTensor are not imported or defined in the visible
# part of this notebook - presumably a project-local loader plus torchvision transforms; confirm and import them.
# NOTE(review): these paths are absolute ("/data/voxel51/...") while the export step writes to "../data/voxel51/" - confirm.
train_dataset = FiftyOneTorchDataset(Path("/data/voxel51/train"), transform=Compose([Resize(size=(128, 128)), ToTensor()]))
val_dataset = FiftyOneTorchDataset(Path("/data/voxel51/val"), transform=Compose([Resize(size=(128, 128)), ToTensor()]))