In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import sys
import json
import glob
import torch

import numpy as np
import pandas as pd

from PIL import Image
from matplotlib import pyplot

from tqdm.auto import tqdm

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from script.samples import representative_sample_selection, get_min_distance_to_representative_samples

In [3]:
# Root of the image-pipeline dataset output. Ranking JSON files are globbed
# from `<ROOT>/ranking/<dataset_name>/*.json`.
ROOT = '../kcg-ml-image-pipeline/output/dataset/'

# Dataset names, processed in this order. Each one is used both as a
# sub-folder of `./data` and of `<ROOT>/ranking`.
DATASETs = [
    'environmental',
    'character',
    'icons',
    'mech',
    'waifu',
    'propaganda-poster',
]

In [4]:
def save_representative_samples(dataset_name):
    """Pick representative samples among a dataset's ranked images and save them.

    Reads ``./data/<dataset_name>/clip_vision_emb.npz`` (arrays ``file_paths``
    and ``image_embeds``) and every ranking JSON under
    ``<ROOT>/ranking/<dataset_name>/``. The embeddings of all images that
    appear in at least one usable ranking pair are handed to
    ``representative_sample_selection`` with ``threshold=0.25``. The outcome is
    written to ``./data/<dataset_name>/representative.json`` as
    ``{'representative': [...], 'unused': [...]}``, both lists holding
    normalised file-path keys.

    Args:
        dataset_name: Name of the dataset sub-folder (e.g. an entry of
            ``DATASETs``).

    Returns:
        None. The result is only written to disk.
    """

    # load emb

    emb_path = os.path.join('./data', dataset_name, 'clip_vision_emb.npz')

    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code -- only load archives produced by this project.
    # The context manager closes the archive's underlying file handle.
    with np.load(emb_path, allow_pickle=True) as npz:
        raw_file_paths = npz['file_paths']
        image_embeds = npz['image_embeds']

    # Normalised key: the text before the first '_' with any extension removed.
    # NOTE(review): a path containing '_' in a directory name would be
    # truncated by this rule -- confirm against the embedding file naming.
    file_paths = [os.path.splitext(file_path.split('_')[0])[0] for file_path in raw_file_paths]
    path_to_index = {file_path: i for i, file_path in enumerate(file_paths)}

    # load rank data

    ranking_files = sorted(glob.glob(os.path.join(ROOT, 'ranking', dataset_name, '*.json')))

    selected_indices = []    # embedding index of the chosen image of each pair
    unselected_indices = []  # embedding index of the other image of each pair
    for path in tqdm(ranking_files, leave=False):
        with open(path) as ranking_file:
            js = json.load(ranking_file)

        file_path_1 = os.path.splitext(js['image_1_metadata']['file_path'])[0].replace('datasets/', '')
        file_path_2 = os.path.splitext(js['image_2_metadata']['file_path'])[0].replace('datasets/', '')

        # Skip pairs for which either image has no embedding.
        if (file_path_1 not in path_to_index) or (file_path_2 not in path_to_index):
            continue

        # selected_image_index == 0 means image 1 was chosen; any other value
        # is treated as image 2.
        if js['selected_image_index'] == 0:
            chosen, other = file_path_1, file_path_2
        else:
            chosen, other = file_path_2, file_path_1

        selected_indices.append(path_to_index[chosen])
        unselected_indices.append(path_to_index[other])

    # build dataset

    # Unique embedding indices of every image seen in a usable pair.
    # NOTE(review): when no pair is usable this is an empty float array and the
    # indexing below raises IndexError -- confirm whether an explicit error or
    # an empty result file is preferred.
    indices = np.array(list(set(selected_indices) | set(unselected_indices)))

    embs = image_embeds[indices]

    # `selected` is used below as positions into `embs` / `indices`.
    selected = representative_sample_selection(samples=embs, threshold=0.25)

    # save

    representative_indices = indices[selected]
    unused_indices = indices[list(set(range(len(indices))).difference(selected))]

    representative_names = [file_paths[i] for i in representative_indices]
    unused_names = [file_paths[i] for i in unused_indices]

    result = {
        'representative': representative_names,
        'unused': unused_names
    }
    # Close the output file deterministically so the JSON is fully flushed.
    with open(f'./data/{dataset_name}/representative.json', 'wt') as out_file:
        json.dump(result, out_file)

In [5]:
# Write ./data/<dataset_name>/representative.json for every configured dataset.
for dataset_name in DATASETs:
    save_representative_samples(dataset_name=dataset_name)

  0%|          | 0/662 [00:00<?, ?it/s]

  0%|          | 0/1090 [00:00<?, ?it/s]

  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/1969 [00:00<?, ?it/s]

  0%|          | 0/2879 [00:00<?, ?it/s]

  0%|          | 0/2968 [00:00<?, ?it/s]

  0%|          | 0/529 [00:00<?, ?it/s]

  0%|          | 0/919 [00:00<?, ?it/s]

  0%|          | 0/798 [00:00<?, ?it/s]

  0%|          | 0/1058 [00:00<?, ?it/s]

# Continue: selection seeded with already-selected samples (environmental)

In [14]:
# Read the selected sample names for the 'environmental' dataset.
# NOTE(review): no visible cell writes selected.json (save_representative_samples
# writes representative.json) -- confirm where this file comes from.
with open('./data/environmental/selected.json') as selected_file:
    selected_names = json.load(selected_file)

In [15]:
# Map each selected name to its embedding index; names missing from the
# mapping yield None.
# NOTE(review): in the visible cells `path_to_index` is only assigned as a
# local of save_representative_samples, so this relies on leftover kernel
# state and fails on a fresh Restart & Run All.
selected_indices = [path_to_index.get(name) for name in selected_names]

In [21]:
# Call the helper with `embs` and the embeddings of the selected samples;
# presumably returns, per row of `embs`, the distance to the closest selected
# sample -- TODO confirm against script.samples.
# NOTE(review): `embs` and `npz` are only assigned inside
# save_representative_samples in the visible cells, and this cell's execution
# count (21) is out of order -- it depends on hidden kernel state.
distances = get_min_distance_to_representative_samples(embs, npz['image_embeds'][selected_indices])

In [16]:
# Re-run the selection on `embs`, passing the embeddings of all but the last
# 10 selected samples as `existed_samples`.
# NOTE(review): the reason for dropping the last 10 entries is not visible
# here -- presumably an experiment; confirm or replace with a named constant.
# NOTE(review): `embs` and `npz` are not defined at notebook level in the
# visible cells; this cell relies on leftover kernel state.
selected = representative_sample_selection(samples=embs, threshold=0.25, existed_samples=npz['image_embeds'][selected_indices[:-10]])

  0%|          | 0/39750 [00:00<?, ?it/s]