In [1]:
from imagehash import phash, dhash, average_hash
import numpy as np

In [2]:
from types import FunctionType
import os
import random
from PIL import Image
from copy import deepcopy
from numpy import array

class Dataset:
    """
    Two collections of image files used to evaluate search and retrieval:
    a set of test images and a set of query images.

    Attributes:
        query_docs: mapping of file name -> file path for the query images.
        test_docs: mapping of file name -> file path for the test images.
    """
    def __init__(self, path_to_queries: str, path_to_test: str) -> None:
        """
        Index the '.jpg' files found directly inside the two directories.

        Args:
            path_to_queries: directory holding the query images.
            path_to_test: directory holding the test images.
        """
        # Echo the two source directories to stdout.
        print(path_to_queries, path_to_test)
        self.query_docs = self.load_image_set(path_to_queries)
        self.test_docs = self.load_image_set(path_to_test)

    @staticmethod
    def load_image_set(path: str) -> dict:
        """
        Map every directory entry whose name ends in '.jpg' to its joined path.

        Args:
            path: directory to list (not searched recursively).

        Returns:
            dict of entry name -> os.path.join(path, entry name).
        """
        image_paths = {}
        for filename in os.listdir(path):
            # The suffix check is case-sensitive: 'x.JPG' is skipped.
            if not filename.endswith('.jpg'):
                continue
            image_paths[filename] = os.path.join(path, filename)
        return image_paths

    
class HashedDataset(Dataset):
    """
    Dataset whose query and test images are fingerprinted with a
    user-supplied hashing function.

    Attributes:
        hasher: callable applied to each opened PIL image; its result is
            converted with str() before being stored.
        test_hashes: file name -> hash string for the test images.
        query_hashes: file name -> hash string for the query images.
        doc2hash: test_hashes overlaid with query_hashes (a file name present
            in both sets keeps the query hash).
        hash2doc: inverse of doc2hash. When several files share a hash string
            only the last one iterated is kept.
    """
    def __init__(self, hashing_function: FunctionType, *args, **kwargs) -> None:
        """
        Args:
            hashing_function: callable taking a PIL image and returning an
                object whose str() is the fingerprint.
            *args, **kwargs: forwarded unchanged to Dataset.__init__.
        """
        super(HashedDataset, self).__init__(*args, **kwargs)
        self.hasher = hashing_function
        self.fingerprint()

    def _hash_image_set(self, docs: dict) -> dict:
        """Return file name -> hash string for every (name, path) pair in docs."""
        hashes = {}
        for doc, path in docs.items():
            # Close each image as soon as it has been hashed so file handles
            # are not left for the garbage collector to release.
            with Image.open(path) as image:
                hashes[doc] = str(self.hasher(image))
        return hashes

    def fingerprint(self) -> None:
        """
        (Re)compute all hash mappings using the current self.hasher.

        The derived lookups are rebuilt together with the per-set hashes so
        the four mappings never disagree after a re-hash.
        """
        self.test_hashes = self._hash_image_set(self.test_docs)
        self.query_hashes = self._hash_image_set(self.query_docs)
        # Values are plain strings, so a shallow copy is sufficient.
        self.doc2hash = dict(self.test_hashes)
        self.doc2hash.update(self.query_hashes)
        self.hash2doc = {digest: doc for doc, digest in self.doc2hash.items()}

In [3]:
# Build the hashed dataset: dhash fingerprints for the query and retrieval folders.
hull = HashedDataset(
    hashing_function=dhash,
    path_to_queries='/Users/zubin.john/forge/image-dedup/Transformed_dataset/Query/',
    path_to_test='/Users/zubin.john/forge/image-dedup/Transformed_dataset/Retrieval/',
)

/Users/zubin.john/forge/image-dedup/Transformed_dataset/Query/ /Users/zubin.john/forge/image-dedup/Transformed_dataset/Retrieval/


In [28]:
## Unit test
# dhash must give the same result for an image opened directly and for the
# same image round-tripped through a numpy array.

from PIL import Image

y = Image.open('/Users/zubin.john/forge/image-dedup/Transformed_dataset/Retrieval/ukbench04754_vflip.jpg')
x = np.array(Image.open('/Users/zubin.john/forge/image-dedup/Transformed_dataset/Retrieval/ukbench04754_vflip.jpg'))

assert dhash(y) == dhash(Image.fromarray(x))

In [7]:
# Show the combined file name -> hash string mapping (test hashes overlaid with query hashes).
hull.doc2hash

{'ukbench05517_resize.jpg': 'f0c0e0c85aca84f8',
 'ukbench08043_resize.jpg': 'f0c9d2f2d3466064',
 'ukbench07494_cropped.jpg': 'f0c2323b8bc7e2f2',
 'ukbench00017_resize.jpg': '90202424346622c0',
 'ukbench05731_hflip.jpg': '02b08e8381014b09',
 'ukbench05008_resize.jpg': '0000008040406024',
 'ukbench04471_rotation.jpg': '74feb2c3869fbcc8',
 'ukbench09716_vflip.jpg': '4365f8e9e1939fb7',
 'ukbench04885_rotation.jpg': 'fef26467e6fcecb0',
 'ukbench09293_cropped.jpg': 'f22455727960f2e0',
 'ukbench09729_hflip.jpg': 'f8e4c68697162470',
 'ukbench05124_rotation.jpg': '7ce6cc9e1c9e8c90',
 'ukbench02404_vflip.jpg': 'f1e0c4c0c0c0e0f0',
 'ukbench05507_hflip.jpg': '31717131317175ff',
 'ukbench06695_rotation.jpg': '70f8b0f0f0da8090',
 'ukbench00925_hflip.jpg': 'a2585cb4bcb4b859',
 'ukbench01569_resize.jpg': 'e0e0e0980e9890e0',
 'ukbench01310_cropped.jpg': '3e333727627e7e7e',
 'ukbench07819_hflip.jpg': 'e0f8dcf0d038b8ac',
 'ukbench00114_hflip.jpg': 'f0e089717163732a',
 'ukbench09035_hflip.jpg': 'e0ccf2c2e