# Make Bag-of-visual-words

* Load the descriptors of every image.
* Find the corresponding visual word index for each descriptor.
* Save the list of visual word indices.

# For the Oxford 5k dataset, you can use the already provided visual words

In [None]:
%%time
# The Oxford 5k dataset provides already converted visual words, so we can use them directly.
import os 
import pickle

# Build a bag-of-visual-words dictionary from the visual word files that ship
# with the Oxford 5k dataset, and pickle it into ``work_dir``.
oxf5k_visualword_dir = './data/word_oxc1_hesaff_sift_16M_1M'
work_dir = "./oxfk5_provided"
# makedirs(exist_ok=True) avoids the check-then-create race of exists() + mkdir().
os.makedirs(work_dir, exist_ok=True)

# Sorted so the files are always processed in the same order.
filelist = sorted(os.listdir(oxf5k_visualword_dir))

bow_dict = {}  # image name -> sorted list of zero-based visual word ids (duplicates kept)
count_descriptor = 0  # total number of descriptor rows read over all files
image_feature_count_info = []  # one (image name, number of descriptors) tuple per file
for filename in filelist:
    filepath = os.path.join(oxf5k_visualword_dir, filename)
    # Strip only a trailing ".txt"; str.replace would also remove it mid-name.
    if filename.endswith(".txt"):
        image_name = filename[:-len(".txt")]
    else:
        image_name = filename
    visual_words = []
    with open(filepath) as f:
        for line_number, line in enumerate(f):
            if line_number < 2:
                continue  # the first two lines of each file are ignored
            line = line.strip()
            if not line:
                continue  # a blank/trailing line would otherwise crash int('')
            # The first field is the visual word id. This data uses 1 to
            # 1,000,000; convert to zero-based, so 0 to 999,999.
            visual_words.append(int(line.split()[0]) - 1)
    count_descriptor += len(visual_words)
    image_feature_count_info.append((image_name, len(visual_words)))
    bow_dict[image_name] = sorted(visual_words)
print('count_descriptor:', count_descriptor)

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(work_dir, 'bow_dict_word_oxc1_hesaff_sift_16M_1M_pretrained.pkl'), 'wb') as f:
    pickle.dump(bow_dict, f)


# Otherwise, you have to build it from scratch
Once you have prepared everything, run the code below.

## Preparation list

* (image, descriptor_list) tuple
* centroids


## Requirements

* Image names and their associated 128-d descriptors
* Visual word assigner
    * For a given 128-d descriptor, it returns the index of the visual word.
    * After you have run k-means clustering, you can use it to get the nearest centroid's id.



TODO: decide on the design when we use an encoding such as Product Quantization.
1. Keep the centroids as PQ codes. This means we always have to keep the encoder around.
2. Keep the centroids as original vectors. This leaves room for additional error when we assign each descriptor to the nearest centroid.

In [None]:
# Load image descriptor dictionary, and assign each descriptor to visual words
import pickle
import os
import numpy as np
import pqkmeans
from tqdm import tqdm
from multiprocessing import Pool, TimeoutError

# --- Inputs and outputs -----------------------------------------------------
# All artifacts of this run (encoder, centroids, resulting BoW dict) are kept
# together under one working directory.
work_dir = "./output_oxf5k_extracted_13M_rootsift_1M_vocab_pqkmeans_1M_codebook_train"
image_descriptor_dict_path = 'image_descriptor_dict_oxc5k_extracted_hesaff_rootsift_13M.pkl'
output_bow_dict_save_path = os.path.join(work_dir, 'bow_dict.pkl')

# NOTE: the encoder must be the very one that was used for the PQk-means
# clustering that produced the centroids loaded below.
encoder_save_path = os.path.join(work_dir, 'pqencoder.pkl')
cluster_center_save_path = os.path.join(work_dir, 'centroids_in_pqcodes.npy')

# Encoder/centroid pairs from other runs, kept for reference:
#   encoder_save_path = 'pqencoder_100k_random_sample_from_16M.pkl'
#   cluster_center_save_path = 'clustering_centers_in_pqcode_numpy.npy'
#
#   encoder_save_path = 'pqencoder_1000k_random_sample_from_16M.pkl'
#   cluster_center_save_path = 'clustering_centers_numpy_16M_feature_1000k_coodebook_131k_cluster.npy'
#
#   encoder_save_path = 'encoder.pkl'
#   cluster_center_save_path = 'clustering_centers_numpy.npy'

# --- Load the trained pieces ------------------------------------------------
# With PQ-kmeans, descriptors are converted to PQ codes before assignment,
# so the encoder is needed alongside the centroids.
with open(encoder_save_path, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
clustering_centers_in_pqcode_numpy = np.load(cluster_center_save_path)

# One row per cluster center.
k = clustering_centers_in_pqcode_numpy.shape[0]
print('number of clusters:', k)

# Engine seeded with the already-trained centers; used by run() for assignment.
engine = pqkmeans.clustering.PQKMeans(
    encoder=encoder,
    k=k,
    iteration=1,
    verbose=False,
    init_centers=clustering_centers_in_pqcode_numpy,
)

# image name -> list of visual word ids, filled in by the main section.
bow_dict = dict()


def run(val):
    """Assign one image's descriptors to visual words.

    Args:
        val: An ``(image_name, entry)`` pair; ``entry[1]`` is passed to the
            module-level ``encoder`` as the image's descriptors.

    Returns:
        A tuple ``(image_name, words)`` where ``words`` is a list of the
        distinct values predicted by the module-level ``engine`` (duplicates
        removed through a ``set``, so the order is not meaningful).
    """
    image_name, entry = val
    # Descriptors are encoded to PQ codes first, then assigned to centers.
    pq_codes = encoder.transform(entry[1])
    assigned_words = engine.predict(pq_codes)
    distinct_words = set(assigned_words)
    return (image_name, list(distinct_words))
    
if __name__ == "__main__":
    # Entry point: load every image's descriptors, assign them to visual words
    # in parallel with run(), and pickle the resulting image -> words mapping.

    with open(image_descriptor_dict_path, 'rb') as f:
        # key: image_name, value: tuple of (keypoint_nparray, descriptor_nparray)
        # descriptor_nparray: 2d numpy array of shape (num_descriptor, dim_descriptor)
        image_descriptor_dict = pickle.load(f)
    print('num images:', len(image_descriptor_dict))

    # Run the pool as a context manager so the 20 worker processes are shut
    # down when the block exits, including when a task raises. All results
    # are consumed inside the block, before the pool is torn down.
    with Pool(processes=20) as pool:
        # Results arrive in arbitrary order; each one carries its image name.
        results = pool.imap_unordered(run, image_descriptor_dict.items())
        for image_name, bow in tqdm(results, total=len(image_descriptor_dict)):
            bow_dict[image_name] = bow

    print('done')
    # Close the output file deterministically so the pickle is fully flushed.
    with open(output_bow_dict_save_path, 'wb') as f:
        pickle.dump(bow_dict, f)
    
# Timing: 24 min for 5062 images with 16M features. 100k learned codebook with 4 subspaces. 2^17 clusters.
# Timing: 38 min for 5062 images with 16M features. 1M learned codebook with 8 subspaces. 2^17 clusters.