In [1]:
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.utils.random as four
from fiftyone import ViewField as F

from dataset_utils.download_utils import prepare_dataset
from dataset_utils.fiftyone_load_utils import load_dataset, split_and_export_manifest
from dataset_utils.fiftyone_export_utils import merge_datasets, export_manifest, create_absolute_paths_in_manifest

import cv2
import json
import numpy as np
import os
from pathlib import Path

In [2]:
# --- Configuration -----------------------------------------------------------
# FiftyOne dataset names (also used as the on-disk folder names).
DATASET_NAME_FOOD = "food_101_small"
DATASET_NAME_130K = "kaggle_130k"

# Image folders of the two source datasets, relative to the notebook.
INPUT_IMAGE_PATH_FOOD = Path("../food_101_small/")
INPUT_IMAGE_PATH_130K = Path("../kaggle_130k/")

# Folders holding one triplet JSON file per label.
TRIPLET_FOLDER_FOOD = Path("triplets_food_101_small")
TRIPLET_FOLDER_130K = Path("triplets_kaggle_130k")

### Load original dataset

In [3]:
def load_dataset(dataset_name, dataset_dir=None):
    """Return the FiftyOne dataset called ``dataset_name``, importing it if needed.

    NOTE: this definition shadows the ``load_dataset`` that the import cell
    pulls in from ``dataset_utils.fiftyone_load_utils``.

    Args:
        dataset_name: name of the FiftyOne dataset; it is also the name of the
            sub-directory of ``dataset_dir`` that gets imported.
        dataset_dir: directory that contains the ``dataset_name`` folder. It is
            only used when FiftyOne has no dataset with that name yet.

    Returns:
        The dataset obtained from ``fo.load_dataset`` when the name is already
        known to FiftyOne, otherwise the one created by ``fo.Dataset.from_dir``.

    Raises:
        TypeError: if the dataset must be imported but ``dataset_dir`` is
            ``None``.
    """
    if fo.dataset_exists(dataset_name):
        # FiftyOne already knows a dataset with this name: just open it.
        return fo.load_dataset(dataset_name)

    if dataset_dir is None:
        # Without this guard ``Path(None, dataset_name)`` fails with an opaque
        # "expected str, bytes or os.PathLike object" error.
        raise TypeError(
            f"dataset_dir is required because no FiftyOne dataset named "
            f"{dataset_name!r} exists yet"
        )

    return fo.Dataset.from_dir(
        dataset_dir=Path(dataset_dir, dataset_name),
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        name=dataset_name,
    )

# ``load_dataset`` joins ``dataset_dir`` with the dataset name itself, so the
# parent of each image folder is what has to be passed in.
dataset_food_101 = load_dataset(
    DATASET_NAME_FOOD,
    dataset_dir=INPUT_IMAGE_PATH_FOOD.parent,
)
dataset_kaggle_130k = load_dataset(
    DATASET_NAME_130K,
    dataset_dir=INPUT_IMAGE_PATH_130K.parent,
)

### Load triplets

In [6]:
def load_triplets(triplet_folder):
    """Build a lookup from image path to the triplet roles that image plays.

    Every regular file directly inside ``triplet_folder`` is parsed as a JSON
    document that must provide the keys ``label``, ``close_positive``,
    ``hard_positive`` and ``hard_negative`` (the last three being iterables of
    file paths). When several documents share a ``label`` only the last one
    read is kept.

    Files are read in sorted path order so the order of the entries in the
    returned lists does not depend on the filesystem's directory order.

    Args:
        triplet_folder: directory with the triplet JSON files, as a
            ``pathlib.Path`` or a string.

    Returns:
        dict mapping ``str(Path(file))`` to a list of
        ``{"label": <label>, "tag": <tag>}`` dicts, where ``<tag>`` is one of
        ``"close_positive"``, ``"hard_positive"`` or ``"hard_negative"``.

    Raises:
        KeyError: if a document lacks one of the required keys.
    """
    triplet_folder = Path(triplet_folder)
    triplet_files = sorted(p for p in triplet_folder.iterdir() if p.is_file())

    triplets = {}
    for triplet_file in triplet_files:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(triplet_file, encoding="utf-8") as file:
            triplet = json.load(file)
        triplets[triplet['label']] = triplet

    files_to_triplet = {}
    for triplet in triplets.values():
        # Same tag order as before: close positives, hard positives, hard negatives.
        for tag in ("close_positive", "hard_positive", "hard_negative"):
            for file in triplet[tag]:
                # Normalise through Path so keys use the platform's separators.
                files_to_triplet.setdefault(str(Path(file)), []).append(
                    {"label": triplet['label'], "tag": tag}
                )

    return files_to_triplet


# One "image path -> similarity tags" lookup per source dataset.
files_to_triplet_food = load_triplets(triplet_folder=TRIPLET_FOLDER_FOOD)
files_to_triplet_130k = load_triplets(triplet_folder=TRIPLET_FOLDER_130K)

### Add similarity tag information to the datasets

In [48]:
def add_similarity_tag_info(dataset, files_to_triplet):
    """Attach similarity tags to every sample that appears in ``files_to_triplet``.

    Samples are visited through ``dataset.iter_samples(autosave=True)``. A
    sample is matched by the last two components of its ``filepath``; on a
    match two entries are written on its ``ground_truth`` value:

    * ``similarity_tags`` - the value stored in ``files_to_triplet`` for it
    * ``original_source`` - the last three components of ``filepath``

    Samples without a match are left untouched.

    Args:
        dataset: object exposing ``iter_samples(autosave=True)``.
        files_to_triplet: mapping from the two-component relative path to the
            value stored as ``similarity_tags``.
    """
    for sample in dataset.iter_samples(autosave=True):
        path_parts = Path(sample['filepath']).parts
        lookup_key = str(Path(*path_parts[-2:]))
        if lookup_key not in files_to_triplet:
            continue

        # assumes ground_truth is set and supports item assignment - TODO confirm
        sample['ground_truth']['similarity_tags'] = files_to_triplet[lookup_key]
        sample['ground_truth']['original_source'] = str(Path(*path_parts[-3:]))

# Tag each dataset with the lookup built from its own triplet folder.
add_similarity_tag_info(
    dataset=dataset_food_101,
    files_to_triplet=files_to_triplet_food,
)
add_similarity_tag_info(
    dataset=dataset_kaggle_130k,
    files_to_triplet=files_to_triplet_130k,
)

### Keep only images that have at least one similarity tag

In [49]:
# Restrict each dataset to the samples whose ``ground_truth.similarity_tags``
# list is non-empty.
dataset_food_101_triplets = dataset_food_101.match(
    F("ground_truth.similarity_tags").length() > 0
)
dataset_kaggle_130k_triplets = dataset_kaggle_130k.match(
    F("ground_truth.similarity_tags").length() > 0
)

### Merge datasets

In [51]:
# ``merge_datasets`` comes from ``dataset_utils.fiftyone_export_utils``; it
# receives the two filtered collections as a single list.
dataset_merge = merge_datasets(
    [
        dataset_food_101_triplets,
        dataset_kaggle_130k_triplets,
    ]
)

### Export merged dataset with images

In [52]:
# ``str(Path(""))`` is ".", so the export is written into the current working
# directory; with ``overwrite=False`` it is merged with the files already there.
dataset_merge.export(
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    export_dir=str(Path("")),
    export_media=True,
    overwrite=False,
    # Carry the attributes added by ``add_similarity_tag_info`` into the export.
    include_attributes=['similarity_tags', 'original_source'],
)

Directory '.' already exists; export will be merged with existing files
 100% |███████████████| 3062/3062 [40.2s elapsed, 0s remaining, 100.9 samples/s]      


Bad pipe message: %s [b'([\x846\xc4\x8c\xcb\xcd\x9f1\xc0\x96%*\xc2m<\xf8 \x90\xb5C\xae\xb9)\xe3\xb5%{\x04\x80\x0ebJ\xbf\x17\xca!\xbfm\xc4\xa8 \xc2\xa6\xea\x7f\xbb\x93\t\xc8\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00']
Bad pipe message: %s [b' \x86\x0c al\xde\xb9\xa8\x83d\xc5\xbc\xb7*\xafS\xd8I\xea\x074)m\xf1"\xf02']
Bad pipe message: %s [b'D\xe6\x8c\xe0:,\x05e_\xd0^=t@\xa7\x8b\xfe\x10 ~\xa9\x02c\x82X\x11Cer\x86\x8bU?\x16y\x17\xb31\xc1\xb6\x03\xbe\xf69t{Iy:\xf1_\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x