In [19]:
from pathlib import Path
from PIL import Image
from PIL import ImageFile

import imagehash
import os

In [20]:
ImageFile.LOAD_TRUNCATED_IMAGES = True

folder = Path(r"D:\images\dream_market")
duplicated_ones = []

### Average hashing
Average hashing is the simplest algorithm which uses only a few transformation.
Scale the image, convert to greyscale, calculate the mean and binarize the greyscale based on the mean.
Now convert the binary image into the integer.

In [21]:
average = []
a_img_hashes = {}
error_files = []
duplicated_ones = []

for image in os.listdir(folder):
    image_path = Path(f"{folder}\{image}")
    
    try:
        hash = imagehash.average_hash(Image.open(image_path))

        if hash in a_img_hashes:
             duplicated_ones.append(str(image))
#             average.append('{} duplicate of {}'.format(image, a_img_hashes[hash]))
             os.remove(image_path)
        else:
            a_img_hashes[hash] = image
    except:
        error_files.append(image)

In [23]:
print(duplicated_ones)

['11191.jpg', '12174.jpg', '128.jpg', '13010.jpg', '13544.jpg', '136.jpg', '14254.jpg', '14761.jpg', '14811.jpg', '1515.jpg', '15188.jpg', '15277.jpg', '15377.jpg', '15397.jpg', '1593.jpg', '15931.jpg', '1636.jpg', '16432.jpg', '1645.jpg', '16750.jpg', '16924.jpg', '16963.jpg', '17219.jpg', '17263.jpg', '17443.jpg', '17957.jpg', '17964.jpg', '17966.jpg', '17991.jpg', '18005.jpg', '18080.jpg', '18103.jpg', '18110.jpg', '18153.jpg', '18237.jpg', '18269.jpg', '18338.jpg', '18345.jpg', '18349.jpg', '18441.jpg', '18527.jpg', '18552.jpg', '19360.jpg', '19677.jpg', '19903.jpg', '20150.jpg', '20448.jpg', '21126.jpg', '21276.jpg', '21396.jpg', '21402.jpg', '21481.jpg', '21695.jpg', '21817.jpg', '22003.jpg', '22316.jpg', '22602.jpg', '22635.jpg', '23135.jpg', '23185.jpg', '23259.jpg', '24659.jpg', '24754.jpg', '24903.jpg', '25093.jpg', '25288.jpg', '2586.jpg', '2594.jpg', '27059.jpg', '27635.jpg', '27957.jpg', '28309.jpg', '28615.jpg', '28669.jpg', '28924.jpg', '29045.jpg', '29094.jpg', '2929.jp

### P-Hash
Perceptual hash uses similar approach but instead of averaging relies on discrete cosine transformation.

In [6]:
phash = []
p_img_hashes = {}
error_files = []

for image in os.listdir(folder):
    image_path = Path(f"{folder}\{image}")
    
    try:
        hash = imagehash.phash(Image.open(image_path))

        if hash in p_img_hashes:
            phash.append('{} duplicate of {}'.format(image, p_img_hashes[hash]))
    #         os.remove(image_path)
        else:
            p_img_hashes[hash] = image
    except:
        error_files.append(image)

In [7]:
print(len(p_img_hashes))

2995


### D-Hash
 Difference hash uses the same approach as a-hash, but instead of using information about average values, it uses gradients (difference between adjacent pixels).

In [8]:
dhash = []
d_img_hashes = {}
error_files = []

for image in os.listdir(folder):
    image_path = Path(f"{folder}\{image}")
    
    try:
        hash = imagehash.dhash(Image.open(image_path))

        if hash in d_img_hashes:
    #         duplicated_ones.append(str(image))
            dhash.append('{} duplicate of {}'.format(image, d_img_hashes[hash]))
    #         os.remove(image_path)
        else:
            d_img_hashes[hash] = image
    except:
        error_files.append(image)

In [9]:
print(len(d_img_hashes))

2986


### Wavelet hashing
Discrete Wavelet Transformation (DWT) is another form of frequency representation.

In [10]:
colorhash = []
color_img_hashes = {}
error_files = []

for image in os.listdir(folder):
    image_path = Path(f"{folder}\{image}")
    
    try:
        hash = imagehash.colorhash(Image.open(image_path))

        if hash in color_img_hashes:
    #         duplicated_ones.append(str(image))
            colorhash.append('{} duplicate of {}'.format(image, color_img_hashes[hash]))
    #         os.remove(image_path)
        else:
            color_img_hashes[hash] = image
    except:
        error_files.append(image)

In [11]:
print(len(color_img_hashes))

1234


## Comparison between image hashing techniques

In [None]:
print(f"Number of repeated images")
print(f"Average Hashing: {len(average)}")
print(f"P-Hash: {len(phash)}")
print(f"D-Hash: {len(dhash)}")
print(f"Wavelet Hashing: {len(colorhash)}")

In [13]:
list(set(average) - set(dhash))
# list(set(dhash) - set(average))
# list(set(colorhash) - set(average))

# list(set(phash) - set(dhash))
# list(set(phash) - set(colorhash))

# list(set(dhash) - set(colorhash))

for i in dhash:
    print(i)

12174.jpg duplicate of 12074.jpg
13010.jpg duplicate of 12074.jpg
136.jpg duplicate of 11425.jpg
1515.jpg duplicate of 10182.jpg
15377.jpg duplicate of 12074.jpg
1593.jpg duplicate of 1524.jpg
15931.jpg duplicate of 10003.jpg
16432.jpg duplicate of 16416.jpg
16750.jpg duplicate of 11087.jpg
16924.jpg duplicate of 16630.jpg
17219.jpg duplicate of 17016.jpg
17957.jpg duplicate of 14729.jpg
17964.jpg duplicate of 14874.jpg
18005.jpg duplicate of 15299.jpg
18103.jpg duplicate of 15258.jpg
18237.jpg duplicate of 15351.jpg
18338.jpg duplicate of 14737.jpg
18349.jpg duplicate of 15351.jpg
18441.jpg duplicate of 18158.jpg
18527.jpg duplicate of 15188.jpg
19360.jpg duplicate of 18116.jpg
20150.jpg duplicate of 19333.jpg
21276.jpg duplicate of 11087.jpg
21396.jpg duplicate of 11087.jpg
21481.jpg duplicate of 21126.jpg
23135.jpg duplicate of 21817.jpg
23259.jpg duplicate of 22559.jpg
24553.jpg duplicate of 23908.jpg
24754.jpg duplicate of 23106.jpg
24903.jpg duplicate of 23038.jpg
25288.jpg dupli