In [69]:
from PIL import Image
from imagehash import average_hash
import os
import random
from pybktree import BKTree, hamming_distance

In [70]:
def load_image_set(path):
    return load_image_sample(path, sample=1.0)

def load_image_sample(path, sample): 
    _docs = {doc: os.path.join(path, doc) for doc in os.listdir(path) if doc.endswith('.jpg')}
    return {doc: _docs[doc] for doc in random.sample(_docs.keys(), int(len(_docs)*sample))}

def fingerprint_images(images):
    return {img: str(average_hash(Image.open(images[img]))) for img in images}

In [71]:
query_set = load_image_set('/Users/zubin.john/forge/image-dedup/Transformed_dataset/Query/')
test_set = load_image_set('/Users/zubin.john/forge/image-dedup/Transformed_dataset/Retrieval/')

In [4]:
%timeit fingerprint_images(query_set)

25.5 s ± 987 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%time test_hashes = fingerprint_images(test_set)
query_hashes = fingerprint_images(query_set)

CPU times: user 1min 55s, sys: 3.88 s, total: 1min 59s
Wall time: 2min 7s


In [44]:
def decode_hash(seq):
    return '{0:x}'.format(int(seq))

def encode_hash(seq):
    return hash(seq)

In [61]:
search_space = (
    [encode_hash(query) for query in query_hashes.values()] + [encode_hash(test) for test in test_hashes.values()]
)
%time index = BKTree(hamming_distance, search_space)

CPU times: user 76.2 ms, sys: 4.54 ms, total: 80.7 ms
Wall time: 78.6 ms


In [62]:
K, R = 15, {}

for image in query_hashes.values():
    res = index.find(encode_hash(image), 15)[:K]
    R[image] = [(each[1], decode_hash(each[1])) for each in res]
    break

In [63]:
R

{'fcf8e0e0e0e0fcfc': [(-5867088785018804111, '-516c162a30b9578f'),
  (-5867088785018804111, '-516c162a30b9578f')]}

In [64]:
res

[(0, -5867088785018804111), (0, -5867088785018804111)]

In [68]:
index.find(encode_hash(image), 5)

[(0, -5867088785018804111), (0, -5867088785018804111)]

In [66]:
encode_hash('fcf8e0e0e0e0fcfc')

-5867088785018804111

In [67]:
search_space

[-5867088785018804111,
 -6528560538849511402,
 -7746929009297931641,
 1051685256279499934,
 -1891440469104949485,
 5842147318468418364,
 -1926300134930477142,
 -3814399399233168106,
 -1304893310402441357,
 -4845106135236479656,
 4096347854494100159,
 5549606135367084388,
 5327281421255726700,
 -166582754196152679,
 -703647982427275693,
 -4327576837025754628,
 -3688034017671805497,
 8957884931512852017,
 4353043702099349122,
 7684503417335733301,
 -2699569555728908688,
 -7336027191305592598,
 -4664287749151465040,
 3414381053153550994,
 -2274534395360291781,
 3198216226225386744,
 -3055602417381626488,
 -1819909513889641963,
 -3518348536563112498,
 -5195215328341337957,
 4485947331263571857,
 -5526326311404612403,
 2494613194861429552,
 6749430472379289500,
 5193591187208284904,
 8209214323576331467,
 -2256035642685577164,
 1120963983050825472,
 2008317461274601477,
 6110368817876897884,
 -6631903944334116331,
 -6997791607457023302,
 -6178484833284045349,
 80755530580824343,
 2418178103

In [73]:
query_hashes

{'ukbench09550.jpg': 'fcf8e0e0e0e0fcfc',
 'ukbench01789.jpg': '0008fcfe7e7c3000',
 'ukbench02474.jpg': 'ff81819dd9837e3c',
 'ukbench06858.jpg': 'fffb4f1f3f1e1e0c',
 'ukbench04699.jpg': '3f3f7f7c0c020100',
 'ukbench03691.jpg': '40c0e0e0f2f0fcfc',
 'ukbench08865.jpg': '7f7f3380117f3f1f',
 'ukbench01857.jpg': 'cfcfe7e7e2e47800',
 'ukbench04115.jpg': '0028ff28082bffff',
 'ukbench03682.jpg': '000677fffefc0c00',
 'ukbench02622.jpg': '7cae83e1e0d04c66',
 'ukbench02385.jpg': '60785c4cf8f8f060',
 'ukbench02144.jpg': 'cec0d8c0c0c0c0ce',
 'ukbench06452.jpg': '387c3c7c7c7c7c70',
 'ukbench03476.jpg': 'fef0f8e0b8c8c0e0',
 'ukbench02732.jpg': '000c3c3e3e3e1f1c',
 'ukbench01003.jpg': 'ffc38381c1c1c7ef',
 'ukbench01999.jpg': '3f38000000e0ffff',
 'ukbench00184.jpg': 'f9d9918999c0e0f8',
 'ukbench10188.jpg': 'ffc7e10c80c1c181',
 'ukbench03783.jpg': 'c7c3d1d1d9d9c3e7',
 'ukbench02934.jpg': '7e7e7e3a393f3e1c',
 'ukbench03120.jpg': 'ffe7e7e3e1e0c080',
 'ukbench02093.jpg': '20003870dcfcfcfc',
 'ukbench03576.j