In [1]:
# Generate a dataset with transformations agreed upon after survey
# 1. Rotate (25 degrees) (25 degrees selected to keep in line with the image-tagging project)
# 2. Horizontal flip
# 3. Resize to (300, 300) [This modifies the aspect ratio to 1:1]
# 4. Randomly crop images to a fixed size (all crops have the same size, but each crop covers a different part of
# the image). Selected size = (560, 420), which preserves the 4:3 aspect ratio and keeps 87.5% of each side (same as image-tagging)
# 5. Vertical flip

# Naming: Take each image, apply transformations and name: <original_name>_<transformation_name>.jpg

In [1]:
# paths

import os
import pdb
import pickle
import itertools
import numpy as np

from pathlib import Path
from PIL import Image
from shutil import copyfile

# Directory holding the source images; its sorted file listing is split into groups further down.
path_source = Path('ukbench/full/')

# Transformed Dataset

In [2]:
# Output locations of the transformed dataset: query images are written to Query/,
# their transformed variants to Retrieval/.
path_target_query = Path('Transformed_dataset/Query')
path_target_retrieval = Path('Transformed_dataset/Retrieval')

# exist_ok=True creates the directories (and missing parents) idempotently, without the
# racy "check, then create" sequence, so re-running this cell is harmless.
os.makedirs(path_target_query, exist_ok=True)
os.makedirs(path_target_retrieval, exist_ok=True)

In [3]:
# Ground truth of the transformed dataset: query file name -> list of transformed file names.
dict_ground_truth = {}

In [4]:
# File names of one group are consecutive, so a sorted listing keeps every group contiguous.
all_near_dups = sorted(os.listdir(path_source))
images_per_group = 4  # the complete dataset has 2550 groups for a total of 10,200 images
if not all_near_dups or len(all_near_dups) % images_per_group:
    # A stray or missing file would shift every following group; fail loudly instead of
    # silently producing misaligned groups.
    raise ValueError(
        'expected a non-empty multiple of {} files in {}, found {}'.format(
            images_per_group, path_source, len(all_near_dups)))
# Derive the group count from the listing instead of hard-coding 2550, so a partial
# copy of the dataset is still split into aligned groups of 4.
n_groups = len(all_near_dups) // images_per_group
splitted_groups = np.array_split(all_near_dups, n_groups)

In [5]:
def save_func(transformed_image, target_dir, orig_name, transform_name):
    """Save an image as ``<orig_name>_<transform_name>.jpg`` inside ``target_dir``.

    Args:
        transformed_image: object exposing ``save(path)`` (a PIL image in this notebook).
        target_dir: directory the file is written to.
        orig_name: name of the source image without extension.
        transform_name: label of the applied transformation.

    Returns:
        The full path the image was saved under.
    """
    file_name = ''.join((orig_name, '_', transform_name, '.jpg'))
    destination = os.path.join(target_dir, file_name)
    transformed_image.save(destination)
    return destination

In [6]:
def rot_func(opened_image, image_name, theta=25):
    """Rotate an image and save it to the retrieval directory as ``<name>_rotation.jpg``.

    Args:
        opened_image: opened PIL image to transform.
        image_name: path of the source image; only its file name is used.
        theta: rotation angle in degrees, passed to ``Image.rotate``.

    Returns:
        File name (without directory) of the saved rotated image.
    """
    rot_im = opened_image.rotate(theta)
    # Use os.path.basename instead of a fixed '/'-split index, so the result does not
    # depend on how many directory levels the source path has.
    orig_name = os.path.basename(image_name).split('.')[0]
    save_name = save_func(rot_im, path_target_retrieval.absolute(), orig_name, 'rotation')
    return os.path.basename(save_name)

In [7]:
def hflip_func(opened_image, image_name):
    """Mirror an image left-to-right and save it to the retrieval directory as ``<name>_hflip.jpg``.

    Args:
        opened_image: opened PIL image to transform.
        image_name: path of the source image; only its file name is used.

    Returns:
        File name (without directory) of the saved flipped image.
    """
    hflip_im = opened_image.transpose(method=Image.FLIP_LEFT_RIGHT)
    # Use os.path.basename instead of a fixed '/'-split index, so the result does not
    # depend on how many directory levels the source path has.
    orig_name = os.path.basename(image_name).split('.')[0]
    save_name = save_func(hflip_im, path_target_retrieval.absolute(), orig_name, 'hflip')
    return os.path.basename(save_name)

In [8]:
def resize_func(opened_image, image_name, target_size=(300,300)):
    """Resize an image to ``target_size`` and save it to the retrieval directory as ``<name>_resize.jpg``.

    Args:
        opened_image: opened PIL image to transform.
        image_name: path of the source image; only its file name is used.
        target_size: size passed to ``Image.resize``; the aspect ratio is not preserved.

    Returns:
        File name (without directory) of the saved resized image.
    """
    res_im = opened_image.resize(target_size)
    # Use os.path.basename instead of a fixed '/'-split index, so the result does not
    # depend on how many directory levels the source path has.
    orig_name = os.path.basename(image_name).split('.')[0]
    save_name = save_func(res_im, path_target_retrieval.absolute(), orig_name, 'resize')
    return os.path.basename(save_name)

In [9]:
def vflip_func(opened_image, image_name):
    """Mirror an image top-to-bottom and save it to the retrieval directory as ``<name>_vflip.jpg``.

    Args:
        opened_image: opened PIL image to transform.
        image_name: path of the source image; only its file name is used.

    Returns:
        File name (without directory) of the saved flipped image.
    """
    vflip_im = opened_image.transpose(method=Image.FLIP_TOP_BOTTOM)
    # Use os.path.basename instead of a fixed '/'-split index, so the result does not
    # depend on how many directory levels the source path has.
    orig_name = os.path.basename(image_name).split('.')[0]
    save_name = save_func(vflip_im, path_target_retrieval.absolute(), orig_name, 'vflip')
    return os.path.basename(save_name)

In [10]:
def random_crop_func(opened_image, image_name, crop_dims=(560, 420)):
    """Cut a randomly positioned crop and save it to the retrieval directory as ``<name>_cropped.jpg``.

    Args:
        opened_image: opened PIL image to transform.
        image_name: path of the source image; only its file name is used.
        crop_dims: ``(width, height)`` of the crop window.

    Returns:
        File name (without directory) of the saved cropped image.

    Raises:
        ValueError: if the crop window is larger than the image.
    """
    h, w = opened_image.height, opened_image.width
    cw, ch = crop_dims[0], crop_dims[1]
    if cw > w or ch > h:
        raise ValueError(
            'crop size {}x{} exceeds image size {}x{}'.format(cw, ch, w, h))

    # np.random.randint excludes its upper bound, so "+ 1" makes the last valid offset
    # reachable and lets an image exactly as large as the crop pass (offset 0).
    left = np.random.randint(0, w - cw + 1)
    upper = np.random.randint(0, h - ch + 1)
    box = (left, upper, left + cw, upper + ch)

    cropped_im = opened_image.crop(box)
    # Use os.path.basename instead of a fixed '/'-split index, so the result does not
    # depend on how many directory levels the source path has.
    orig_name = os.path.basename(image_name).split('.')[0]
    save_name = save_func(cropped_im, path_target_retrieval.absolute(), orig_name, 'cropped')
    return os.path.basename(save_name)

In [11]:
# get one candidate from each group

In [12]:
for j in splitted_groups:
    if len(j) == 0:
        # nothing to pick a query from
        continue
    # Pick one image of the group uniformly at random as the query image. The bound comes
    # from the group itself, so a group shorter than 4 cannot be indexed out of range.
    index_to_select = np.random.randint(0, len(j))
    i = j[index_to_select]

    image_name = os.path.join(path_source, i)
    # The context manager releases the image file handle once the group is processed.
    with Image.open(image_name) as im:
        # Save the transformed images. The call order is kept fixed because
        # random_crop_func draws from the shared numpy random stream.
        save_name_rot = rot_func(im, image_name)
        save_name_hflip = hflip_func(im, image_name)
        save_name_res = resize_func(im, image_name)
        save_name_vflip = vflip_func(im, image_name)
        save_name_crop = random_crop_func(im, image_name)
        key_name = i
        dict_ground_truth[key_name] = [save_name_rot, save_name_hflip, save_name_res, save_name_crop, save_name_vflip]

        # Save the query image. NOTE: im.save writes the image through PIL rather than
        # copying the source file.
        im.save(os.path.join(path_target_query, i))

In [13]:
# Inspect the mapping: query file name -> [rotation, hflip, resize, cropped, vflip] file names.
dict_ground_truth

{'ukbench00002.jpg': ['ukbench00002_rotation.jpg',
  'ukbench00002_hflip.jpg',
  'ukbench00002_resize.jpg',
  'ukbench00002_cropped.jpg',
  'ukbench00002_vflip.jpg'],
 'ukbench00005.jpg': ['ukbench00005_rotation.jpg',
  'ukbench00005_hflip.jpg',
  'ukbench00005_resize.jpg',
  'ukbench00005_cropped.jpg',
  'ukbench00005_vflip.jpg'],
 'ukbench00009.jpg': ['ukbench00009_rotation.jpg',
  'ukbench00009_hflip.jpg',
  'ukbench00009_resize.jpg',
  'ukbench00009_cropped.jpg',
  'ukbench00009_vflip.jpg'],
 'ukbench00013.jpg': ['ukbench00013_rotation.jpg',
  'ukbench00013_hflip.jpg',
  'ukbench00013_resize.jpg',
  'ukbench00013_cropped.jpg',
  'ukbench00013_vflip.jpg'],
 'ukbench00017.jpg': ['ukbench00017_rotation.jpg',
  'ukbench00017_hflip.jpg',
  'ukbench00017_resize.jpg',
  'ukbench00017_cropped.jpg',
  'ukbench00017_vflip.jpg'],
 'ukbench00023.jpg': ['ukbench00023_rotation.jpg',
  'ukbench00023_hflip.jpg',
  'ukbench00023_resize.jpg',
  'ukbench00023_cropped.jpg',
  'ukbench00023_vflip.jpg']

In [14]:
# Write the transformed-dataset ground truth dictionary to disk as a pickle file.
with open('ground_truth_transformed.pkl', mode='wb') as pickle_file:
    pickle.dump(dict_ground_truth, pickle_file)

# Ground truth for near-duplicate dataset

In [15]:
# The near-duplicate dataset is built from the same source images.
path_near_duplicate = path_source
path_target_ND = Path('Near_duplicate_dataset/')

# exist_ok=True creates the directory idempotently, without the racy
# "check, then create" sequence.
os.makedirs(path_target_ND, exist_ok=True)

In [16]:
# file names of all images in a group are consecutive
# get a list of filenames through os.listdir and sort them in a serial fashion
# Pick groups of 4 files and put them in a single group
# Create 4 sets of ground truths such that each image in the group gets to be the query image (we don't know which image
# should be the representative image for the group)

In [17]:
# File names of one group are consecutive, so a sorted listing keeps every group contiguous.
all_near_dups = sorted(os.listdir(path_near_duplicate))
images_per_group = 4  # the complete dataset has 2550 groups for a total of 10,200 images
if not all_near_dups or len(all_near_dups) % images_per_group:
    # A stray or missing file would shift every following group; fail loudly instead of
    # silently producing misaligned groups.
    raise ValueError(
        'expected a non-empty multiple of {} files in {}, found {}'.format(
            images_per_group, path_near_duplicate, len(all_near_dups)))
# Derive the group count from the listing instead of hard-coding 2550, so a partial
# copy of the dataset is still split into aligned groups of 4.
n_groups = len(all_near_dups) // images_per_group
splitted_groups = np.array_split(all_near_dups, n_groups)

In [18]:
# iterate over the splitted_groups and generate 4 ground truth dictionaries

In [19]:
# Number of near-duplicate images in one group.
group_size = 4
# One empty ground-truth dictionary per position inside a group, keyed 'dict_0' ... 'dict_3'.
dicts = dict(('dict_{}'.format(position), {}) for position in range(group_size))

In [20]:
def switcher(case):
    """Return the ground-truth dictionary key ``'dict_<case>'`` for the given case."""
    return 'dict_{}'.format(case)

In [21]:
for gr in splitted_groups:
    group_files = list(gr)
    # Every image takes a turn as the query; the image at position k of a group is
    # recorded in dictionary 'dict_k'.
    for key_el, query_file in enumerate(group_files):
        # Retrievals are the other members of the group, in file order. Building them from
        # the group itself avoids relying on set iteration order and cannot index past a
        # group that is shorter than group_size.
        retrieval_files = [other for pos, other in enumerate(group_files) if pos != key_el]
        dict_to_populate = switcher(key_el)
        dicts[dict_to_populate][query_file] = retrieval_files

In [25]:
# save the groups

# Write each of the ground-truth dictionaries to its own pickle file.
for dict_index in range(group_size):
    print(f'saving dictionary: {dict_index}')
    with open('ground_truth_ND_' + str(dict_index) + '.pkl', 'wb') as pickle_file:
        pickle.dump(dicts['dict_{}'.format(dict_index)], pickle_file)

saving dictionary: 0
saving dictionary: 1
saving dictionary: 2
saving dictionary: 3


In [26]:
# Iterate over dictionaries and save query and retrievals

for iteration_name, cur_dict in dicts.items():
    # cur_dict maps every query file name to the list of its retrieval file names
    list_query_filenames = list(cur_dict)
    list_retrievals = list(itertools.chain.from_iterable(cur_dict.values()))

    # One output directory per ground-truth dictionary, named after its index, with
    # Query/ and Retrieval/ sub-directories.
    target_save_directory = os.path.join(path_target_ND, iteration_name.split('_')[1])
    query_path = os.path.join(target_save_directory, 'Query')
    retrieval_path = os.path.join(target_save_directory, 'Retrieval')
    os.makedirs(query_path, exist_ok=True)
    os.makedirs(retrieval_path, exist_ok=True)

    # Copy only the files that are still missing. Unlike guarding the whole copy behind
    # "directory does not exist", this lets an interrupted run be completed by re-running
    # the cell, while a finished run is not copied again.
    for destination_dir, file_names in ((query_path, list_query_filenames),
                                        (retrieval_path, list_retrievals)):
        for f in file_names:
            destination = os.path.join(destination_dir, f)
            if not os.path.exists(destination):
                copyfile(os.path.join(path_source, f), destination)

# Exact duplicates

In [55]:
# Root directory of the exact-duplicate dataset (gets Query/ and Retrieval/ sub-directories).
path_target_exact = Path('exact_dataset/')

In [45]:
# Exact-duplicate bookkeeping: query file name -> list of file names of its group.
dict_exact = {}

In [46]:
for gr in splitted_groups:
    group_files = list(gr)
    # Draw the query position from the actual group length rather than a hard-coded 4,
    # so a shorter group cannot be indexed out of range.
    index_to_select = np.random.randint(0, len(group_files))
    query = group_files[index_to_select]
    # All files of the group, the query included, are recorded as its retrievals.
    dict_exact[query] = group_files

In [50]:
# move the files into correct locations

# Queries are the dictionary keys; retrievals are all value lists flattened into one list.
list_query_exact = [query_name for query_name in dict_exact]
list_retrievals_exact = [file_name for group in dict_exact.values() for file_name in group]

In [52]:
# Number of query images (one was picked per group).
len(list_query_exact)

2550

In [53]:
# Number of retrieval images (every file of every group).
len(list_retrievals_exact)

10200

In [57]:
# Query/ and Retrieval/ sub-directories of the exact-duplicate dataset; exist_ok=True
# makes their creation idempotent.
query_path_exact = os.path.join(path_target_exact, 'Query')
retrieval_path_exact = os.path.join(path_target_exact, 'Retrieval')
os.makedirs(query_path_exact, exist_ok=True)
os.makedirs(retrieval_path_exact, exist_ok=True)

# Copy only the files that are still missing. Unlike guarding the whole copy behind
# "directory does not exist", this lets an interrupted run be completed by re-running
# the cell, while a finished run is not copied again.
for destination_dir, file_names in ((query_path_exact, list_query_exact),
                                    (retrieval_path_exact, list_retrievals_exact)):
    for f in file_names:
        destination = os.path.join(destination_dir, f)
        if not os.path.exists(destination):
            copyfile(os.path.join(path_source, f), destination)

In [58]:
# modify dictionary such that key = value

# Overwrite every value in place so that each query maps to a one-element list holding itself.
for query_name in dict_exact:
    dict_exact[query_name] = [query_name]

In [59]:
# Inspect the exact-duplicate ground truth: each query file name maps to a list containing only itself.
dict_exact

{'ukbench00000.jpg': ['ukbench00000.jpg'],
 'ukbench00005.jpg': ['ukbench00005.jpg'],
 'ukbench00010.jpg': ['ukbench00010.jpg'],
 'ukbench00013.jpg': ['ukbench00013.jpg'],
 'ukbench00019.jpg': ['ukbench00019.jpg'],
 'ukbench00023.jpg': ['ukbench00023.jpg'],
 'ukbench00026.jpg': ['ukbench00026.jpg'],
 'ukbench00031.jpg': ['ukbench00031.jpg'],
 'ukbench00034.jpg': ['ukbench00034.jpg'],
 'ukbench00038.jpg': ['ukbench00038.jpg'],
 'ukbench00042.jpg': ['ukbench00042.jpg'],
 'ukbench00046.jpg': ['ukbench00046.jpg'],
 'ukbench00048.jpg': ['ukbench00048.jpg'],
 'ukbench00054.jpg': ['ukbench00054.jpg'],
 'ukbench00057.jpg': ['ukbench00057.jpg'],
 'ukbench00061.jpg': ['ukbench00061.jpg'],
 'ukbench00064.jpg': ['ukbench00064.jpg'],
 'ukbench00070.jpg': ['ukbench00070.jpg'],
 'ukbench00072.jpg': ['ukbench00072.jpg'],
 'ukbench00078.jpg': ['ukbench00078.jpg'],
 'ukbench00081.jpg': ['ukbench00081.jpg'],
 'ukbench00084.jpg': ['ukbench00084.jpg'],
 'ukbench00089.jpg': ['ukbench00089.jpg'],
 'ukbench00

In [60]:
# Write the exact-duplicate ground truth dictionary to disk as a pickle file.
with open('ground_truth_exact.pkl', mode='wb') as pickle_file:
    pickle.dump(dict_exact, pickle_file)