In [4]:
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.utils.random as four
from fiftyone import ViewField as F

from dataset_utils.download_utils import prepare_dataset
from dataset_utils.fiftyone_load_utils import load_dataset, split_and_export_manifest
from dataset_utils.fiftyone_export_utils import merge_datasets, export_manifest, create_absolute_paths_in_manifest

import cv2
import json
import numpy as np
import os
from pathlib import Path

In [5]:
# Food-101 (small) dataset: FiftyOne dataset name, image folder, and folder with triplet JSON files.
# The dataset name is also used as the on-disk folder name when the dataset is first imported
# (see load_dataset, which joins the parent of INPUT_IMAGE_PATH_* with the dataset name).
DATASET_NAME_FOOD = "food_101_small"
INPUT_IMAGE_PATH_FOOD = Path("../food_101_small/")
TRIPLET_FOLDER_FOOD = Path("triplets_food_101_small")

# Kaggle 130k dataset: same three settings as above.
DATASET_NAME_130K = "kaggle_130k"
INPUT_IMAGE_PATH_130K = Path("../kaggle_130k/")
TRIPLET_FOLDER_130K = Path("triplets_kaggle_130k")

### Load original datasets

In [30]:
def load_dataset(dataset_name, dataset_dir=None):
    """Return the FiftyOne dataset called ``dataset_name``, importing it if needed.

    NOTE: this definition shadows the ``load_dataset`` imported from
    ``dataset_utils.fiftyone_load_utils`` in the notebook's import cell.

    Args:
        dataset_name: Name of the FiftyOne dataset. It is also the name of the
            sub-directory of ``dataset_dir`` that is read on first import.
        dataset_dir: Parent directory that contains the ``dataset_name``
            folder. Only required when no dataset with that name exists yet.

    Returns:
        The already-registered dataset when ``fo.dataset_exists(dataset_name)``
        is true; otherwise a new dataset built with ``fo.Dataset.from_dir``
        from ``dataset_dir/dataset_name`` as a
        ``FiftyOneImageClassificationDataset``.

    Raises:
        TypeError: If the dataset does not exist yet and ``dataset_dir`` was
            not supplied.
    """
    if fo.dataset_exists(dataset_name):
        # Reuse the FiftyOne dataset that is already registered under this name.
        return fo.load_dataset(dataset_name)

    if dataset_dir is None:
        # Fail with an actionable message instead of the opaque error that
        # Path(None, ...) would produce.
        raise TypeError(
            f"dataset_dir is required because dataset '{dataset_name}' "
            "does not exist yet and must be imported from disk"
        )

    return fo.Dataset.from_dir(
        dataset_dir=Path(dataset_dir, dataset_name),
        dataset_type=fo.types.FiftyOneImageClassificationDataset,
        name=dataset_name,
    )

# Load both source datasets (imported from disk on first use). The directory passed in is the
# parent of each INPUT_IMAGE_PATH_*, and load_dataset appends the dataset name to it.
dataset_food_101 = load_dataset(DATASET_NAME_FOOD, dataset_dir=INPUT_IMAGE_PATH_FOOD.parent)
dataset_kaggle_130k = load_dataset(DATASET_NAME_130K, dataset_dir=INPUT_IMAGE_PATH_130K.parent)

 100% |███████████| 132528/132528 [1.5m elapsed, 0s remaining, 1.4K samples/s]      


### Load triplets

In [69]:
def load_triplets(triplet_folder):
    """Build a lookup from image path to the triplets that reference it.

    Every regular file directly inside ``triplet_folder`` is parsed as a JSON
    object with the keys ``label``, ``close_positive``, ``hard_positive`` and
    ``hard_negative`` (the last three being lists of image paths). The first
    ``close_positive`` entry of a triplet is used as its anchor. When several
    files carry the same ``label``, the one that sorts last by path wins.

    Args:
        triplet_folder: Directory holding the triplet JSON files, given as a
            ``str`` or ``pathlib.Path``.

    Returns:
        A dict mapping ``str(Path(image_path))`` to a list of dicts with the
        keys ``"label"``, ``"anchor"`` and ``"tag"``, one entry per triplet
        list the image appears in. ``"tag"`` is ``"close_positive"``,
        ``"hard_positive"`` or ``"hard_negative"``.

    Raises:
        KeyError: If a triplet file lacks one of the expected keys.
        IndexError: If a triplet's ``close_positive`` list is empty.
    """
    tag_names = ("close_positive", "hard_positive", "hard_negative")

    # iterdir() yields entries in arbitrary order; sorting makes both the
    # "last duplicate label wins" rule and the per-image entry order stable.
    triplet_files = sorted(p for p in Path(triplet_folder).iterdir() if p.is_file())

    triplets = {}
    for triplet_file in triplet_files:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(triplet_file, encoding="utf-8") as file:
            triplet = json.load(file)
        triplets[triplet["label"]] = triplet

    files_to_triplet = {}
    for triplet in triplets.values():
        anchor = str(Path(triplet["close_positive"][0]))
        for tag in tag_names:
            for image_path in triplet[tag]:
                # Keys are normalised through Path so they compare equal to
                # paths built the same way elsewhere.
                files_to_triplet.setdefault(str(Path(image_path)), []).append(
                    {"label": triplet["label"], "anchor": anchor, "tag": tag}
                )

    return files_to_triplet


# Per-dataset lookup: image path -> list of {"label", "anchor", "tag"} entries from the triplet files.
files_to_triplet_food = load_triplets(TRIPLET_FOLDER_FOOD)
files_to_triplet_130k = load_triplets(TRIPLET_FOLDER_130K)

In [70]:
# NOTE(review): scratch/debug cell. The statements below repeat the body of
# load_triplets() inline for TRIPLET_FOLDER_130K and leave their intermediate
# variables (triplet_files, triplets, files_to_triplet, triplet, test) in the
# notebook's global namespace. Consider removing once no longer needed.
triplet_folder=TRIPLET_FOLDER_130K

# Regular files directly inside the triplet folder.
triplet_files = [p for p in triplet_folder.iterdir() if p.is_file()]

# label -> parsed triplet JSON; a later file with the same label overwrites an earlier one.
triplets = {}
for triplet_file in triplet_files:
    with open(triplet_file) as file:
        triplet = json.load(file)
        triplets[triplet['label']]=triplet

# image path -> list of {"label", "anchor", "tag"} entries; the anchor is the
# first close_positive image of the triplet.
files_to_triplet = {}
for label, triplet in triplets.items():
    anchor = str(Path(triplet['close_positive'][0]))
    for file in triplet['close_positive']:
        files_to_triplet.setdefault(str(Path(file)), []).append({"label": triplet['label'], "anchor":anchor, "tag":"close_positive"})
    for file in triplet['hard_positive']:
        files_to_triplet.setdefault(str(Path(file)), []).append({"label": triplet['label'], "anchor":anchor, "tag":"hard_positive"})
    for file in triplet['hard_negative']:
        files_to_triplet.setdefault(str(Path(file)), []).append({"label": triplet['label'], "anchor":anchor, "tag":"hard_negative"})
        #files_to_triplet[file]=[{"label": triplet['label'], "anchor":"xx", "tag":"close_positive"}]


# `triplet` here is the variable left over from the loop above, i.e. the last triplet iterated.
test=Path(triplet['close_positive'][0])

#from pathlib import PureWindowsPath, PurePosixPath
#PureWindowsPath(test).as_posix()
# Displays the mapping returned by load_triplets(TRIPLET_FOLDER_130K) earlier,
# not the `files_to_triplet` dict rebuilt in this cell.
files_to_triplet_130k

{'data/image11102-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image13140-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image13358-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image14340-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image1472-7.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image18099-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image20647-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image24243-2.jpg': [{'label': 'street sign',
   'anchor': 'data/image11102-2.jpg',
   'tag': 'close_positive'}],
 'data/image26445-2.jpg': [{'label': 'str

### Add similarity-tag information to the datasets

In [71]:
def add_similarity_tag_info(dataset, files_to_triplet):
    """Attach triplet entries to the ``ground_truth`` of matching samples.

    Each sample is identified by the last two components of its ``filepath``
    (parent folder and file name). When that relative path is a key of
    ``files_to_triplet``, the mapped value is stored under
    ``sample['ground_truth']['similarity_tags']``. Other samples are left
    untouched. Samples are iterated with ``autosave=True``.

    Args:
        dataset: Object exposing ``iter_samples(autosave=...)``.
        files_to_triplet: Mapping from relative image path to the value to
            store as ``similarity_tags``.
    """
    for sample in dataset.iter_samples(autosave=True):
        path_parts = Path(sample['filepath']).parts
        # Lookup key is "<parent folder>/<file name>" in the OS path format.
        lookup_key = str(Path(path_parts[-2], path_parts[-1]))
        if lookup_key not in files_to_triplet:
            continue
        sample['ground_truth']['similarity_tags'] = files_to_triplet[lookup_key]

# Write the triplet entries onto the samples of each dataset (modifies the datasets in place).
add_similarity_tag_info(dataset_food_101, files_to_triplet_food)
add_similarity_tag_info(dataset_kaggle_130k, files_to_triplet_130k)

### Keep only images that have at least one similarity tag

In [72]:
# Keep only the samples whose ground_truth.similarity_tags has length > 0,
# i.e. the images referenced by at least one triplet.
dataset_food_101_triplets = dataset_food_101.match(F("ground_truth.similarity_tags").length() > 0)
dataset_kaggle_130k_triplets = dataset_kaggle_130k.match(F("ground_truth.similarity_tags").length() > 0)

### Merge datasets

In [74]:
# merge_datasets is imported from dataset_utils.fiftyone_export_utils; presumably it combines
# the samples of both filtered collections into a single dataset — TODO confirm against the helper.
dataset_merge = merge_datasets([dataset_food_101_triplets, dataset_kaggle_130k_triplets])

### Export merged dataset with images

In [76]:
# Export the merged dataset, including its media files, in FiftyOneImageClassificationDataset format.
# NOTE(review): str(Path("")) evaluates to ".", so the export is written into the current
# working directory and merged with whatever is already there (see the message in the cell output).
dataset_merge.export(
    export_dir=str(Path("")),
    #labels_path=OUTPUT_IMAGE_PATH/'labels.json',
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    include_attributes=['similarity_tags'],  # presumably writes the similarity_tags label attribute into the exported labels — TODO confirm
    export_media=True,
    overwrite=False
)

Directory '.' already exists; export will be merged with existing files
 100% |███████████████| 3062/3062 [42.7s elapsed, 0s remaining, 79.3 samples/s]      
