# Criteria

## 1. Selected pairs of images with similar scores. 

    This gives us more detailed ranking information. Because the scores are assigned by our scoring model, the model may already be accurate at predicting the annotator's selection between images with a large score delta, but it performs badly on images with a small score delta.
## 2. Selected pairs within similar images.

    Currently, the annotator's selection is strongly affected by the image topic or style. For example, an annotator may always choose Nintendo Mario-style images, so the scoring model will assign high scores to them.
    This policy forces the annotator to focus on image quality, which may help us improve generation.

# Implement

## 1. Function: get_candidate_pairs_within_category

I will first provide a general function to get candidate pairs within category

Input:
- categories: np.ndarray[int], shape is (N,)
- max_pairs: int, maximum number of pairs to select.
- max_pairs should satisfy 0 < max_pairs < (N / n_categories) ** 2.
    We will attempt to select (max_pairs / n_categories) pairs within each category.
    
Output:

pairs: list[(index, index)], selected pairs; each index is a position in the input categories.


## 2. Function: get_candidate_pairs_by_score

I use 2 ways of binning scores into categories:
By fixed-step bins
By quantiles
	
	I will provide a function to get candidate pairs with similar scores

Input:
- scores: np.ndarray[float], shape is (N,)
- max_pairs: int, max selecting pairs. 
- n_bins: int, number of categories to be divided
- use_quantiles: bool, whether to bin by quantiles (True) or by fixed-step bins (False)

Output:

pairs: list[(index, index)], selected pairs; each index is a position in the input scores.
	

## 3. Function: get_candidate_pairs_by_embedding
	
I use k-means to cluster the images; each cluster is treated as a category.

Input:
- embeddings: np.ndarray, shape is (N, 768)
- max_pairs: int, max selecting pairs. 
- n_clusters: int, number of categories to be divided

Output:

pairs: list[(index, index)], selected pairs; each index is a position in the input embeddings.

These 2 criteria can be combined with the existing filters: we can first filter images by score / variance / date, then pass the remaining uuids and their corresponding scores or embeddings to these functions to get candidate pairs.


In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pandas as pd
import numpy as np

import os
import json

from tqdm.auto import tqdm

import torch

from script.pairs import get_candidate_pairs_by_score, get_candidate_pairs_by_embedding, embedding_to_category, get_candidate_pairs_within_category
from script.samples import get_min_distance_to_representative_samples
from utils import get_score_from_embs

In [2]:
# Local directory holding the dataset files that are read and downloaded below.
ROOT = "../kcg-ml-image-pipeline/output/dataset/"

# Names of the datasets processed by the driver loops.
DATASETs = ["environmental", "character", "icons", "mech", "waifu", "propaganda-poster"]

# Directory that receives the generated CSV files (one sub-directory per dataset).
SAVE_DIR = "./result/1130/"

# save image info

In [3]:
def save_image_info(dataset_name):
    """Build the per-image info table for one dataset and write it to CSV.

    Reads ``clip_vision_emb.npz``, ``prompt.json`` and ``representative.json``
    from ``./data/<dataset_name>/`` and the linear scoring weights from
    ``./weight/004/<dataset_name>/clip_vision.pt``, then writes
    ``<SAVE_DIR>/<dataset_name>/image_info.csv`` with one row per embedding:

    - ``file_path``: embedding file path cut at the first ``_``, extension removed.
    - ``file_hash`` / ``job_uuid``: looked up in ``prompt.json``; ``None`` when
      the image has no prompt entry.
    - ``sigma_score``: model score standardised with the dataset mean and std.
    - ``min_distance_to_representative_samples``: result of
      ``get_min_distance_to_representative_samples`` with cosine distance.
    - ``category_<k>`` for k in (10, 100): cluster label, only computed while
      the dataset has at least ``100 * k`` images.

    The scoring model is moved to the GPU with ``.cuda()``, so a CUDA device
    is required.

    Args:
        dataset_name: name of the dataset sub-directory.

    Returns:
        None. The result is written to disk.
    """
    data_dir = os.path.join('./data', dataset_name)

    # --- embeddings -------------------------------------------------------
    emb_path = os.path.join(data_dir, 'clip_vision_emb.npz')

    # NOTE: allow_pickle=True unpickles object arrays; only load trusted files.
    # The context manager closes the underlying archive once both arrays are read.
    with np.load(emb_path, allow_pickle=True) as npz:
        samples = npz['image_embeds'].astype('float32')
        file_paths = npz['file_paths']

    # Normalise to the key shared with prompt.json: text before the first '_',
    # without extension.
    file_paths = [os.path.splitext(file_path.split('_')[0])[0] for file_path in file_paths]
    path_to_index = {file_path: i for i, file_path in enumerate(file_paths)}

    # --- prompt metadata --------------------------------------------------
    pmt_path = os.path.join(data_dir, 'prompt.json')

    with open(pmt_path) as f:
        prompts = json.load(f)

    # prompt.json is keyed by file hash; invert it to look up by file path.
    path_to_hash = {j['file_path'].split('_')[0]: i for i, j in prompts.items()}
    path_to_uuid = {j['file_path'].split('_')[0]: j['job_uuid'] for i, j in prompts.items()}

    df = pd.DataFrame(
        zip(file_paths, map(path_to_hash.get, file_paths), map(path_to_uuid.get, file_paths)),
        columns=['file_path', 'file_hash', 'job_uuid']
    )

    # --- score ------------------------------------------------------------
    vision_weight_path = os.path.join('./weight/004', dataset_name, 'clip_vision.pt')

    vision_model = torch.nn.Linear(samples.shape[-1], 1, bias=True).cuda().eval()
    vision_model.load_state_dict(torch.load(vision_weight_path))

    score = get_score_from_embs(samples, vision_model, batch_size=1024)
    df['sigma_score'] = (score - score.mean()) / score.std()

    # --- distance to representative samples -------------------------------
    with open(os.path.join(data_dir, 'representative.json')) as f:
        representative_names = json.load(f)['representative']
    representative_indices = list(map(path_to_index.get, representative_names))
    representative_samples = samples[representative_indices]

    df['min_distance_to_representative_samples'] = get_min_distance_to_representative_samples(
        samples, representative_samples, distance_type='cosine'
    )

    # --- embedding clusters -----------------------------------------------
    for n_clusters in [10, 100]:
        # Sizes are ascending, so once one is too large the rest are too.
        if n_clusters > samples.shape[0] / 100:
            break
        df[f'category_{n_clusters}'] = embedding_to_category(embeddings=samples, n_clusters=n_clusters)

    # --- save -------------------------------------------------------------
    os.makedirs(os.path.join(SAVE_DIR, dataset_name), exist_ok=True)

    df.to_csv(os.path.join(SAVE_DIR, dataset_name, 'image_info.csv'), index=False)


In [4]:
# Build and save image_info.csv for every dataset listed in DATASETs.
for dataset_name in DATASETs:
    save_image_info(dataset_name)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

# save_rank_queue

In [5]:
def get_job_uuid_pairs(df, pairs):
    """Translate positional index pairs into ordered job-uuid pairs.

    Args:
        df: DataFrame with a ``job_uuid`` column; the indices in ``pairs``
            are positions (``iloc``) into this frame.
        pairs: iterable of ``(index_1, index_2)`` tuples.

    Returns:
        A list of ``(job_uuid_a, job_uuid_b)`` tuples, one per input pair,
        each sorted so that ``job_uuid_a <= job_uuid_b``. This makes the same
        two images always produce the same tuple regardless of input order.
        An empty ``pairs`` yields an empty list.
    """
    pairs = list(pairs)
    if not pairs:
        # zip(*[]) cannot be unpacked into two sequences.
        return []

    indices_1, indices_2 = zip(*pairs)

    job_uuid_1s = df['job_uuid'].iloc[list(indices_1)]
    job_uuid_2s = df['job_uuid'].iloc[list(indices_2)]

    # Pair the first column with the SECOND column.
    return [
        (job_uuid_1, job_uuid_2) if job_uuid_1 < job_uuid_2 else (job_uuid_2, job_uuid_1)
        for job_uuid_1, job_uuid_2 in zip(job_uuid_1s, job_uuid_2s)
    ]

In [6]:
def _load_ranked_pairs(dataset_name, prompts):
    """Return the job-uuid pairs (in both orders) that already have a ranking file."""
    ranking_dir = os.path.join(ROOT, 'ranking', dataset_name)

    ranked_pairs = set()
    for fname in tqdm(os.listdir(ranking_dir), leave=False):
        with open(os.path.join(ranking_dir, fname)) as f:
            js = json.load(f)

        file_hash_1 = js['image_1_metadata']['file_hash']
        file_hash_2 = js['image_2_metadata']['file_hash']

        try:
            job_uuid_1 = prompts[file_hash_1]['job_uuid']
            job_uuid_2 = prompts[file_hash_2]['job_uuid']
        except KeyError:
            # The ranking refers to an image without a prompt entry; skip it.
            continue

        # Store both orders so membership tests do not depend on ordering.
        ranked_pairs.add((job_uuid_1, job_uuid_2))
        ranked_pairs.add((job_uuid_2, job_uuid_1))

    return ranked_pairs


def save_rank_queue(dataset_name):
    """Select images and candidate pairs to rank for one dataset and save them.

    Reads ``./data/<dataset_name>/prompt.json``, the ranking files under
    ``<ROOT>/ranking/<dataset_name>/`` and
    ``<SAVE_DIR>/<dataset_name>/image_info.csv``. Only rows without missing
    values and with ``sigma_score > 0.75`` are considered.

    Writes two files to ``<SAVE_DIR>/<dataset_name>/``:

    - ``images.csv``: job uuids with
      ``min_distance_to_representative_samples > 0.25``, policy
      ``far_distance_to_ranked_images``.
    - ``pairs.csv``: candidate pairs not already ranked, with columns
      ``job_uuid_1``, ``job_uuid_2`` and ``policy``. The policy is
      ``same_sigma_score_bin_<n_bins>`` or ``same_embedding_cluster_<n_clusters>``.
      When a pair is produced by several policies the first one is kept.

    Args:
        dataset_name: name of the dataset sub-directory.

    Returns:
        None. The results are written to disk.
    """
    pmt_path = os.path.join('./data', dataset_name, 'prompt.json')

    with open(pmt_path) as f:
        prompts = json.load(f)

    ranked_pairs = _load_ranked_pairs(dataset_name, prompts)

    # Candidate images: complete rows with a high enough standardised score.
    df = pd.read_csv(os.path.join(SAVE_DIR, dataset_name, 'image_info.csv')).dropna()
    df.query('sigma_score > .75', inplace=True)
    # Pair indices returned below are positional, so the index must be 0..n-1.
    df.reset_index(drop=True, inplace=True)

    # Single images far away from the representative samples.
    result = df.query('min_distance_to_representative_samples > 0.25')[['job_uuid']].copy()
    result['policy'] = 'far_distance_to_ranked_images'
    result.to_csv(os.path.join(SAVE_DIR, dataset_name, 'images.csv'), index=False)

    results = list()

    # Pairs with similar scores.
    for n_bins in [10, 100]:

        pairs = get_candidate_pairs_by_score(df['sigma_score'].values, max_pairs=1000, n_bins=n_bins, use_quantiles=True)
        if len(pairs) == 0:
            continue

        pairs = get_job_uuid_pairs(df, pairs)
        results.extend([pair + (f'same_sigma_score_bin_{n_bins}',) for pair in pairs if pair not in ranked_pairs])

    # Pairs within the same embedding cluster.
    for n_clusters in [10, 100]:

        # Cluster columns only exist for sufficiently large datasets.
        if f'category_{n_clusters}' not in df.columns:
            break

        pairs = get_candidate_pairs_within_category(df[f'category_{n_clusters}'].values, max_pairs=1000)
        if len(pairs) == 0:
            continue

        pairs = get_job_uuid_pairs(df, pairs)
        # Label with the cluster count of THIS iteration.
        results.extend([pair + (f'same_embedding_cluster_{n_clusters}',) for pair in pairs if pair not in ranked_pairs])

    results = pd.DataFrame(results, columns=['job_uuid_1', 'job_uuid_2', 'policy'])
    results.drop_duplicates(['job_uuid_1', 'job_uuid_2'], keep='first', inplace=True)
    results.to_csv(os.path.join(SAVE_DIR, dataset_name, 'pairs.csv'), index=False)

In [7]:
# Write images.csv and pairs.csv for every dataset listed in DATASETs.
for dataset_name in DATASETs:
    save_rank_queue(dataset_name)

  0%|          | 0/529 [00:00<?, ?it/s]

  0%|          | 0/798 [00:00<?, ?it/s]

# select images

In [100]:
import pandas as pd
import os

In [107]:
# One row per image: its file path and its cluster label.
# NOTE(review): `npz` and `labels` are not defined in any cell visible here;
# presumably they come from an earlier, deleted cell — confirm before re-running.
df = pd.DataFrame(zip(npz['file_paths'], labels), columns=['file_path', 'label'])

In [108]:
import shutil

# Copy 5 randomly chosen images of every cluster into
# <target_dir>/<label>/ so the clusters can be inspected by eye.
target_dir = './image_clustering_clip_vision_kmeans'
image_dir = '../kcg-ml-image-pipeline/output/dataset/image/'

for c, g in df.groupby('label'):

    cluster_dir = os.path.join(target_dir, f'{c}')
    os.makedirs(cluster_dir, exist_ok=True)

    # Clusters with fewer than 5 images cannot be sampled without replacement.
    if g.shape[0] < 5:
        continue

    selected = np.random.choice(g['file_path'], size=5, replace=False)

    for file_path in selected:
        file_path = os.path.join(image_dir, file_path.split('_')[0] + '.jpg')
        # shutil.copy avoids building a shell command from file names, so
        # paths with spaces or shell metacharacters are handled correctly.
        try:
            shutil.copy(file_path, cluster_dir)
        except OSError as err:
            # Best effort, as with the former `cp`: report and carry on.
            print(f'Could not copy {file_path}: {err}')

In [35]:
import requests
import json

from minio import Minio

import os

from tqdm.auto import tqdm

In [20]:
class API:
    """Small HTTP client for the ranking API.

    Attributes:
        api_url: base URL; request paths are appended as ``{api_url}/{url}``.
        display: stored flag; not read by any method defined here.
        kwargs: default keyword arguments forwarded to every ``requests.get``.
    """

    def __init__(self, api_url: str, display: bool = False, **kwargs):
        self.api_url = api_url
        self.display = display
        self.kwargs = kwargs

    def get(self, url: str, data: dict = None, return_content: bool = True, **kwargs):
        """Send a GET request to ``{api_url}/{url}``.

        Args:
            url: path relative to ``api_url``.
            data: optional query parameters, sent as ``params``.
            return_content: when True, return the response body and raise on
                a non-200 status; when False, return the response object
                without checking its status.
            **kwargs: extra ``requests.get`` arguments; they override the
                defaults given to the constructor.

        Returns:
            The response body (``bytes``) or the response object.

        Raises:
            Exception: if ``return_content`` is True and the status is not 200.
        """
        # Per-call arguments take precedence over the constructor defaults.
        request_kwargs = {**self.kwargs, **kwargs}
        if data is not None:
            request_kwargs['params'] = data

        response = requests.get(f'{self.api_url}/{url}', **request_kwargs)

        if not return_content:
            return response

        if response.status_code != 200:
            raise Exception(f'{url} responded status_code {response.status_code}')
        return response.content

    def post(self, data, **kwargs):
        """Placeholder; not implemented yet and always returns None."""
        pass

    def get_rank_list(self, dataset: str):
        """Return the parsed JSON response of ``datasets/rank/list`` for ``dataset``."""
        content = self.get('datasets/rank/list', data={'dataset': dataset})

        return json.loads(content)

    # def get_rank_infos(self, dataset: str, filenames: list):

    #     infos = list()

    #     for filename in tqdm(filenames, leave=False, disable=not self.display):

    #         info = None
            
    #         try:
                
    #             content = self.get(
    #                 'datasets/rank/read', 
    #                 data={
    #                     'dataset': dataset,
    #                     'filename': filename
    #                 },
    #                 return_content=False
    #             )

    #             if content.status_code == 200:
    #                 info = json.loads(content)
            
    #         except KeyboardInterrupt:
    #             break
    #         except:
    #             pass

    #         infos.append(info)

    #     return infos
    

In [13]:
# Base URL passed to the API client, and the dataset whose rankings are listed.
api_url = 'http://123.176.98.90:8764'
dataset = 'mech'

In [25]:
# `display` is only stored on the client; no active API method reads it.
api = API(api_url, display=True)

In [7]:
# Issues a GET to 'datasets/rank/list' and parses the JSON response.
rank_result_file_paths = api.get_rank_list(dataset)

In [17]:
# Keep only the final path component of every ranking file path.
rank_result_file_names = [os.path.basename(path) for path in rank_result_file_paths]

In [None]:
# Fetch every ranking file from MinIO, one request per path, with a progress bar.
# Each entry is whatever get_file_from_minio returns: the object returned by
# client.get_object, or None when the request failed.
# NOTE(review): the returned objects are presumably open HTTP responses that
# still need .read() and .close() — confirm against the minio documentation.
# infos = api.get_rank_infos(dataset, rank_result_file_names)
infos = list(tqdm(
    map(lambda file_name: get_file_from_minio(client, bucket_name, file_name), rank_result_file_paths), 
    leave=False, total=len(rank_result_file_paths)
))

In [42]:
rank_result_file_paths[0]

'mech/data/ranking/aggregate/2023-10-23-15-50-38-mert.json'

In [33]:
# MinIO connection settings.
MINIO_ADDRESS = "123.176.98.90:9000"

# SECURITY: credentials must not be committed with the notebook. They are read
# from the environment instead; the previously hard-coded keys should be
# treated as leaked and rotated. When a variable is unset the value is None.
access_key = os.environ.get("MINIO_ACCESS_KEY")
secret_key = os.environ.get("MINIO_SECRET_KEY")

bucket_name = 'datasets'

In [37]:
def connect_to_minio(minio_addr, access_key, secret_key):
    """Create a MinIO client and verify that the server is reachable.

    Args:
        minio_addr: server address as ``host:port`` (no scheme).
        access_key: MinIO access key.
        secret_key: MinIO secret key.

    Returns:
        The ``Minio`` client when ``http://<minio_addr>/minio/health/live``
        answers with status 200 within 5 seconds, otherwise ``None``. The
        reason for a failure is printed so a ``None`` result can be diagnosed.
    """
    # Initialize the MinIO client (plain HTTP, matching the health check below).
    client = Minio(minio_addr, access_key=access_key, secret_key=secret_key, secure=False)

    # Check server status
    health_url = "http://" + minio_addr + "/minio/health/live"
    try:
        response = requests.get(health_url, timeout=5)
    except requests.RequestException as err:
        print(f"Could not reach MinIO server at {minio_addr}: {err}")
        return None

    if response.status_code != 200:
        print(f"MinIO health check at {minio_addr} returned status {response.status_code}.")
        return None

    print("Connected to MinIO server.")
    return client

def get_file_from_minio(client, bucket_name, file_name):
    """Fetch one object from a bucket.

    Returns whatever ``client.get_object(bucket_name, file_name)`` returns.
    If that call raises, the error is printed and ``None`` is returned.
    """
    try:
        return client.get_object(bucket_name, file_name)
    except Exception as error:
        print(f"Error: {error}")
        return None

In [36]:
# `client` is None when the MinIO health check fails.
client = connect_to_minio(MINIO_ADDRESS, access_key, secret_key)

Connected to MinIO server.


In [None]:
def get_obj

In [30]:
client.get_object

NameError: name 'client' is not defined

In [8]:
# List every object in the bucket under the given prefix.
# NOTE(review): the prefix is `dataset_name`, which is not assigned in this
# cell — presumably the variable left over from the `for dataset_name in
# DATASETs` loops. The `dataset` variable ('mech') may have been intended; confirm.
objects = client.list_objects(bucket_name, dataset_name, recursive=True)

In [9]:
# Collect the listing into a list, showing a running count while it is consumed.
objects = list(tqdm(objects))

0it [00:00, ?it/s]

In [None]:
# Download every '*_clip.msgpack' and '*_data.msgpack' object that is not yet
# present locally. Files go to <ROOT>/clip/ and <ROOT>/data/ respectively,
# keeping the object name as the relative path.
#
# Earlier variants of this cell downloaded a single type only:
#   images:  names ending in ('.jpg', '.png', '.jpeg') -> os.path.join(ROOT, 'image', object_name)
#   clip:    names ending in '_clip.msgpack'           -> os.path.join(ROOT, 'clip', object_name)
failed = []

for obj in tqdm(objects):

    object_name = obj.object_name
    lowered = object_name.lower()

    if lowered.endswith('_clip.msgpack'):
        out_path = os.path.join(ROOT, 'clip', object_name)
    elif lowered.endswith('_data.msgpack'):
        out_path = os.path.join(ROOT, 'data', object_name)
    else:
        continue

    # Already downloaded on a previous run.
    if os.path.exists(out_path):
        continue

    try:
        client.fget_object(bucket_name, object_name, out_path)
    except KeyboardInterrupt:
        # Allow the long-running download to be stopped from the notebook.
        break
    except Exception as err:
        # Best effort: remember the failure and continue with the next object.
        failed.append((object_name, err))
        continue

if failed:
    print(f'{len(failed)} object(s) could not be downloaded; first failure: {failed[0][0]}: {failed[0][1]}')

  0%|          | 0/1960419 [00:00<?, ?it/s]