# Criteria

1. Selected pairs of images with similar scores. 

    To get more detailed rank info. Because the scores are assigned by our scoring model, this might be accurate when predicting the selection of images with large score delta. But for images with small score delta, it performs badly.
2. Selected pairs within similar images.

    Currently,  annotator’s selection is strongly affected by the image topic or style. I.e., the annotator may alway choose Nintendo Mario style images, therefore, the scoring models will assign high scores to them. 
    This policy will force the annotator to focus on the image quality and may help us improve generation.

# Implement

1. Function: get_candidate_pairs_within_category

I will first provide a general function to get candidate pairs within category

Input:
- job_uuids: list[str], length of N
- categories: np.ndarray[int], shape is (N,)
- max_pairs: int, max selecting pairs. 
- max_pairs should 0 < max_pairs < (N / n_categories) ** 2.
    we will attempt to select (max_pairs / n_categories) pairs within each category.
    
Output:

pairs: list[(str, str)], seleted job_uuid pairs.


2. Function: get_candidate_pairs_by_score

I use 2 way to binning scores to categories:
By fixed step bins
By quantities
	
	I will provide a function to get candidate pairs with similar scores

Input:
- job_uuids: list[str], length of N
- scores: np.ndarray[float], shape is (N,)
- max_pairs: int, max selecting pairs. 
- n_bins: int, number of categories to be divided
- use_quantities: bool, to use quantities or fixed step bins

Output:

pairs: list[(str, str)], seleted job_uuid pairs.
	

3. Function: get_candidate_pairs_by_embedding
	
I use kmeans to divide images into categories of clusters.

Input:
- job_uuids: list[str], length of N
- embeddings: np.ndarray, shape is (N, 768)
- max_pairs: int, max selecting pairs. 
- n_clusters: int, number of categories to be divided

Output:

pairs: list[(str, str)], seleted job_uuid pairs.

These 2 criteria can be used with existing filters, we can filter images with score / variance / date, and pass the uuids and corresponding scores or embeddings to the function, and get candidate pairs.


In [1]:
import pandas as pd

import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans

# general

In [2]:
def get_candidate_pairs_within_category(job_uuids: list, categories: np.ndarray, max_pairs: int):
    
    '''
    
    Input:
        - job_uuids: list[str], length of N
        - categories: np.ndarray[int], shape is (N,)
        - max_pairs: int, max selecting pairs. 
            max_pairs should 0 < max_pairs < (N / n_categories) ** 2.
            we will attempt to select (max_pairs / n_categories) pairs within each category.
            
    Output:
        - pairs: list[(str, str)], seleted job_uuid pairs.
    
    '''
    
    df = pd.DataFrame(np.stack([job_uuids, categories], axis=-1), columns=['job_uuid', 'category'])
    
    n_bins = len(np.unique(categories))
    
    max_pairs_within_bins = max_pairs // n_bins
    
    pairs = list()
    
    for c, g in df.groupby('category'):
        
        if g.shape[0] <= 1:
            continue
            
        sub_uuids = list(g['job_uuid'])
        np.random.shuffle(sub_uuids)
        
        cn2 = len(sub_uuids) * (len(sub_uuids) - 1) / 2
        
        step = max(1, int(cn2 // max_pairs_within_bins))
        
        sub_pairs = list()
        
        for i, uuid_i in enumerate(sub_uuids[:-1]):

            for uuid_j in sub_uuids[i+1::step]:

                sub_pairs.append((uuid_i, uuid_j))
                
            if len(sub_pairs) > max_pairs_within_bins:
                break
                
        if len(sub_pairs) == 0:
            sub_pairs.append((sub_uuids[0], sub_uuids[1]))
            
        pairs += sub_pairs[:max_pairs_within_bins]
        
    return pairs

# get pairs with similar score

In [3]:
def get_bins(min_value: float, max_value: float, n_bins: int):
    
    bins = np.linspace(min_value, max_value, n_bins - 1)
    
    return bins


def score_to_category_with_bins(scores: np.ndarray, bins: np.ndarray):
    
    return np.digitize(scores, bins)


def score_to_category_with_quantities(scores: np.ndarray, n_categories: int):
    
    rank = np.argsort(scores)
    
    n_samples = len(rank)
    
    step = int(np.ceil(n_samples / n_categories))
    
    return (rank + ((step - n_samples % step) // 2)) // step


def get_candidate_pairs_by_score(job_uuids: list, scores: np.ndarray, max_pairs: int, n_bins: int, use_quantities: bool = False):
    
    '''
    
    Input:
        - job_uuids: list[str], length of N
        - scores: np.ndarray[float], shape is (N,)
        - max_pairs: int, max selecting pairs. 
            max_pairs should 0 < max_pairs < (N / n_bins) ** 2.
            we will attempt to select (max_pairs / n_bins) pairs within each category.
        - n_bins: int, number of categories to be divided
        - use_quantities: bool, to use quantities or fixed step bins
            
    Output:
        - pairs: list[(str, str)], seleted job_uuid pairs.
    
    '''
    
    if use_quantities:
        
        categories = score_to_category_with_quantities(scores=scores, n_categories=n_bins)
    
    else:

        bins = get_bins(min_value=min(scores), max_value=max(scores), n_bins=n_bins)

        categories = score_to_category_with_bins(scores=scores, bins=bins)
    
    return get_candidate_pairs_within_category(
        job_uuids=job_uuids, 
        categories=categories, 
        max_pairs=max_pairs
    )

In [4]:
job_uuids = list(map(str, range(100)))
scores = np.random.randn(100)

In [5]:
len(get_candidate_pairs_by_score(job_uuids, scores, max_pairs=100, n_bins=10, use_quantities=True))

99

# sampling images by clustering

In [12]:
def embedding_to_category(embeddings: np.ndarray, n_clusters: int):
    
    model = MiniBatchKMeans(n_clusters=n_clusters, max_iter=100, n_init=3)
    
    labels = model.fit_predict(embeddings)
    
    return labels


def get_candidate_pairs_by_embedding(job_uuids: list, embeddings: np.ndarray, max_pairs: int, n_clusters: int):
    
    '''
    
    Input:
        - job_uuids: list[str], length of N
        - embeddings: np.ndarray, shape is (N, 768)
        - max_pairs: int, max selecting pairs. 
            max_pairs should 0 < max_pairs < (N / n_clusters) ** 2.
            we will attempt to select (max_pairs / n_clusters) pairs within each category.
        - n_clusters: int, number of categories to be divided
            
    Output:
        - pairs: list[(str, str)], seleted job_uuid pairs.
    
    '''
    
    categories = embedding_to_category(embeddings=embeddings, n_clusters=n_clusters)

    return get_candidate_pairs_within_category(
        job_uuids=job_uuids, 
        categories=categories, 
        max_pairs=max_pairs
    )

In [13]:
embeddings = np.random.rand(100, 768)

In [14]:
len(get_candidate_pairs_by_embedding(job_uuids, embeddings, max_pairs=100, n_clusters=10))

70

# select images

In [100]:
import pandas as pd
import os

In [107]:
df = pd.DataFrame(zip(npz['file_paths'], labels), columns=['file_path', 'label'])

In [108]:
target_dir = './image_clustering_clip_vision_kmeans'

for c, g in df.groupby('label'):
    
    os.makedirs(os.path.join(target_dir, f'{c}'), exist_ok=True)
    
    if g.shape[0] < 5:
        continue
    
    selected = np.random.choice(g['file_path'], 5, False)
    
    for file_path in selected:
        file_path = os.path.join('../kcg-ml-image-pipeline/output/dataset/image/', file_path.split('_')[0] + '.jpg')
        os.system(f'cp {file_path} {target_dir}/{c}/')