# minHash Image Clustering (MHIC) algorithm (Seed Generation Only)

Implementation based on 
* [Large-Scale Discovery of Spatially Related Images](https://ieeexplore.ieee.org/iel5/34/4359286/05235143.pdf) by Ondrej Chum and Jiri Matas
* [Scalable Near Identical Image and Shot Detection - Microsoft](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/civr2007.pdf) by Ondrej Chum, James Philbin, Michael Isard, Andrew Zisserman

## Purpose

If we view a cluster of similar images as a connected component of a graph, the images are its vertices.
To recover the image clusters we have to find the edges; minHash can be used to find a subset of those edges quickly.

Afterward, you may use an image retrieval system to complete the connected components.


## Requirements

* A list of visual word indices for each image

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits under `src` take effect without a kernel restart.
%load_ext autoreload
%autoreload 2
from src import mhic_seed_generation

# Run the module's entry point.
# NOTE(review): presumably this produces the pickle files under ./output that
# a later cell loads — confirm against src/mhic_seed_generation.
mhic_seed_generation.main()

In [None]:
import os
import pickle


# Directory and file names of the pickled results read by this notebook.
work_dir = './output'

output_similar_pair_result = 'similar_pair.pkl'
output_ransac_result = 'similar_pair_ransac.pkl'

# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project itself.
# Context managers close each file handle as soon as its content is loaded.
with open(os.path.join(work_dir, output_similar_pair_result), 'rb') as f:
    similar_pairs = pickle.load(f)
# The name `rasac_result` (sic) is kept unchanged in case other cells refer to it.
with open(os.path.join(work_dir, output_ransac_result), 'rb') as f:
    rasac_result = pickle.load(f)

In [None]:
# IMAGE_DIR = "./data/oxford5k_images"
IMAGE_DIR = "./data/oxford/oxford5k/images"

# Show every pair, largest third field first (the field unpacked as
# `num_inlier` below). The sort is in place, so it also reorders
# `similar_pairs` for any code that runs afterwards.
similar_pairs.sort(key=lambda x: x[2], reverse=True)
for image_cluster, score, num_inlier in similar_pairs:
    # Print num_inlier as well: it is the sort key, and a template with only
    # two fields would silently ignore the third format() argument.
    print("pair: {}, score: {}, num_inlier: {}".format(image_cluster, score, num_inlier))
    show_image_cluster(IMAGE_DIR, image_cluster)
    print('\n')

In [None]:
import random
def show_image_cluster(image_dir, image_names, cols=5):
    """
    Show the first images of a cluster side by side (Oxford 5k dataset).

    Each name is turned into a file name by removing the "oxc1_" substring
    and appending ".jpg"; the file is then read from `image_dir`.

    Args:
        image_dir: directory that contains the .jpg files.
        image_names: iterable of image names; only the first `cols` are shown.
        cols: number of subplot columns, i.e. the maximum number of images
            displayed in the single row. Defaults to 5.
    """
    # Imported here so the plotting dependencies are only needed when the
    # function is actually called.
    from PIL import Image
    import matplotlib.pyplot as plt

    # Truncate before opening so that no file handle is created for an image
    # that would never be displayed.
    shown_names = list(image_names)[:cols]

    imgs = []
    try:
        for image_name in shown_names:
            image_name = image_name.replace("oxc1_", "") + ".jpg"
            image_path = os.path.join(image_dir, image_name)
            imgs.append(Image.open(image_path))

        plt.figure(figsize=(20, 5))
        for i, img in enumerate(imgs):
            plt.subplot(1, cols, i + 1)
            plt.imshow(img)
        plt.show()
    finally:
        # Release the underlying files even if opening or plotting failed.
        for img in imgs:
            img.close()

    
# IMAGE_DIR = "./data/oxford5k_images"
IMAGE_DIR = "./data/oxford/oxford5k/images"
sample_count = 10
print("Sampling from irrelevant images.")
# NOTE(review): `count_irr` is not defined in the cells visible here; it must
# already exist in the kernel for this slice to work — confirm where it is set.
target_seq = similar_pairs[:count_irr]
# Never ask random.sample for more items than the slice holds.
k = min(sample_count, len(target_seq))
# `*_` absorbs any fields after the score: another cell unpacks three values
# (cluster, score, num_inlier) per entry, which a two-name unpack would reject.
for image_cluster, score, *_ in random.sample(target_seq, k):
    print("pair: {}, score: {}".format(image_cluster, score))
    show_image_cluster(IMAGE_DIR, image_cluster)
    print('\n')
    

In [None]:

print("Sampling from similar images.")
# NOTE(review): `count_irr` and `count_sim` are not defined in the cells
# visible here — confirm where they are set.
target_seq = similar_pairs[count_irr:count_irr+count_sim]
# Despite the message, every pair of the slice is shown (no random sampling),
# highest score first. The slice is a copy, so `similar_pairs` is not reordered.
target_seq.sort(key=lambda x: x[1], reverse=True)
# `*_` absorbs any fields after the score: another cell unpacks three values
# (cluster, score, num_inlier) per entry, which a two-name unpack would reject.
for image_cluster, score, *_ in target_seq:
    print("pair: {}, score: {}".format(image_cluster, score))
    show_image_cluster(IMAGE_DIR, image_cluster)
    print('\n')
    
print("Sampling from near-duplicates images.")
# NOTE(review): `count_irr` and `count_sim` are not defined in the cells
# visible here — confirm where they are set.
target_seq = similar_pairs[count_irr+count_sim:]
# Never ask random.sample for more items than the slice holds.
k = min(sample_count, len(target_seq))
# Sort the drawn sample, not the population: random.sample returns items in
# selection order, so sorting beforehand has no effect on what is displayed.
sampled_pairs = sorted(random.sample(target_seq, k), key=lambda x: x[1], reverse=True)
# `*_` absorbs any fields after the score: another cell unpacks three values
# (cluster, score, num_inlier) per entry, which a two-name unpack would reject.
for image_cluster, score, *_ in sampled_pairs:
    print("pair: {}, score: {}".format(image_cluster, score))
    show_image_cluster(IMAGE_DIR, image_cluster)
    print('\n')