In [1]:
# !pip install simplejson

In [2]:
from pymongo import MongoClient
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import simplejson as json
import itertools

from IPython.display import display, Image, JSON
from ipywidgets import widgets, Image, HBox, VBox, Button, ButtonStyle

from lib.image_dedup import make_hashes, calculate_distance, hashes_diff

In [4]:
# Close every ipywidgets widget known to this kernel
# (presumably to clear widgets left over from an earlier run — confirm).
import ipywidgets
ipywidgets.Widget.close_all()

In [None]:
# Image directory (not referenced again in the cells shown here).
images_dir = Path('./images')
# Directory holding the hand-made label files (JSON); created if it does not exist yet.
handmade_dir = Path('./handmade')
handmade_dir.mkdir(exist_ok=True)

In [None]:
# NOTE(review): host and port are hard-coded; 172.17.0.1 looks like a Docker bridge
# gateway address — confirm it is reachable from the environment running this notebook.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
# Collections read by the cells below: post documents and per-image metadata documents.
posts = db['posts']
imagemeta = db['imagemeta']

# Load image metadata

In [None]:
# Pull every image metadata document into memory, ordered by image_id.
# Positions in this list are used as row/column indices of the distance matrix below.
imageMetas = sorted(imagemeta.find(), key=lambda meta: meta['image_id'])

In [None]:
# Map each phash to the list of positions in `imageMetas` that carry it
# (more than one image may share the same phash).
phash_to_idx_mapping = {}
for i, meta in enumerate(imageMetas):
    # Key by the hash value itself (not the literal string 'phash') so that
    # lookups by an image's phash actually find its positions.
    phash_to_idx_mapping.setdefault(meta['phash'], []).append(i)


In [None]:
def phash_to_idx (phash):
    """Return the list of `imageMetas` positions stored for `phash`, or None if it is unknown."""
    return phash_to_idx_mapping.get(phash)

# Calculate distance

## Hash distance

In [None]:
# One make_hashes() result per image, in the same order as `imageMetas`.
image_hashes = [make_hashes(m) for m in imageMetas]

In [None]:
# Distance matrix over all images; the cells below index it as distance[i, j]
# where i and j are positions in `imageMetas`.
distance = calculate_distance(image_hashes)

In [None]:
# distance2 = np.ndarray([len(image_hashes), len(image_hashes)])
# for i in tqdm(range(len(image_hashes))):
#     for j in range(i+1):
#         diff = hashes_diff(image_hashes[i], image_hashes[j])
#         distance2[i, j] = diff
#         distance2[j, i] = diff
# np.array_equal(distance, distance2)

In [None]:
# Same call with hash_type='phash'.
# NOTE(review): `pdistance` is not read by any other cell shown here — confirm it is still needed.
pdistance = calculate_distance(image_hashes, hash_type='phash')

In [None]:
# Collect, as unordered phash pairs, every pair of images from the lower
# triangle of `distance` that is close enough to be treated as a duplicate.
duplicated_image_phash_pairs_auto = {
    frozenset([imageMetas[row]['phash'], imageMetas[col]['phash']])
    for row in tqdm(range(distance.shape[0]))
    for col in range(row)
    if distance[row, col] <= 1  # checked, all distance <= 1 are duplicated
}

In [None]:
# NOTE(review): this cell repeats the computation of the previous cell and produces
# the same set again. `pdistance` (computed above) is never read in the cells shown
# here — confirm whether this cell was meant to use it, or whether it can be removed.
duplicated_image_phash_pairs_auto = set()
for i in tqdm(range(distance.shape[0])):
    for j in range(i):
        if distance[i, j] <= 1: # checked, all distance <= 1 are duplicated
            duplicated_image_phash_pairs_auto.add(frozenset([imageMetas[i]['phash'], imageMetas[j]['phash']]))

In [None]:
def set_distance (phash_x, phash_y, value, mat=distance):
    """Write `value` symmetrically into `mat` for every image pair matching two phashes.

    Each phash is resolved to its list of `imageMetas` positions via
    phash_to_idx(); if either phash resolves to nothing, `mat` is left untouched.
    Both mat[i, j] and mat[j, i] are assigned for every combination of positions.
    """
    rows = phash_to_idx(phash_x)
    cols = phash_to_idx(phash_y)
    if rows and cols:
        for i, j in itertools.product(rows, cols):
            mat[i, j] = value
            mat[j, i] = value

def set_distance_pairs (phash_pairs, value, mat=distance):
    """Apply set_distance() with `value` to every pair in `phash_pairs`.

    Entries with fewer than two members (e.g. a frozenset built from two equal
    phashes) are skipped.
    """
    for pair in phash_pairs:
        if len(pair) <= 1:
            continue
        set_distance(*pair, value, mat=mat)

## Apply information from metadata

In [None]:
# Unordered phash pairs linking the images of a post to the images of the
# posts it lists under 'duplicated_posts'.
duplicated_post_image_phash_pairs = set()

for p in posts.find():
    if 'duplicated_posts' not in p or len(p['duplicated_posts']) == 0:
        continue

    # phashes of every image that belongs to one of the duplicated posts
    dp_phashes = []
    for dp_id in p['duplicated_posts']:
        dp_phashes.extend(meta['phash'] for meta in imagemeta.find({'post_id': dp_id}))
    if len(dp_phashes) > 1:
        print(f"More than 1 image {p['post_id']} {dp_phashes}")

    # phashes of this post's own images
    phashes = [meta['phash'] for meta in imagemeta.find({'post_id': p['post_id']})]
    duplicated_post_image_phash_pairs.update(
        pair
        for pair in map(frozenset, itertools.product(dp_phashes, phashes))
        if len(pair) > 1  # drop pairs whose two phashes are equal
    )

# Pairs known to be duplicates from the post data get distance 0.
set_distance_pairs(duplicated_post_image_phash_pairs, 0)

In [None]:
# Unordered phash pairs of images that share a non-empty 'album' value.
related_album_image_phash_pairs = set()

for album in {i['album'] for i in imagemeta.find({'album': {'$exists': True, '$ne': ''}})}:
    ra_phashes = [i['phash'] for i in imagemeta.find({'album': album})]
    if len(ra_phashes) <= 1:
        # Report only the album and its hashes: no post document is in scope in this cell.
        print(f"Only 1 or less image {album} {ra_phashes}")

    # Every unordered pair of album images; combinations() yields the same set of
    # frozensets as the full cartesian product without visiting each pair twice.
    for s in itertools.combinations(ra_phashes, 2):
        fs = frozenset(s)
        if len(fs) > 1:  # skip pairs whose two phashes are equal
            related_album_image_phash_pairs.add(fs)

# Same-album pairs are written as 60 into `distance`, the same value this notebook
# uses for pairs labelled "not duplicated".
set_distance_pairs(related_album_image_phash_pairs, 60)

## Apply manually labeled data

In [None]:
class PersistentSet (set):
    """A `set` that can mirror its contents to a JSON file.

    Use load_set() to create an instance bound to a file. persist_add() and
    persist_remove() change the set and then rewrite the whole file. The plain
    `set` methods (add, remove, ...) do not touch the file.
    """

    @staticmethod
    def load_set (file):
        """Return a PersistentSet bound to `file`, pre-filled from it when it exists.

        JSON arrays found at the top level of the stored list are turned into
        frozensets so that they are hashable; every other item is kept as
        loaded. A missing file yields an empty set (the file is not created
        until the first persist_* call).
        """
        if Path(file).exists():
            # Context manager guarantees the handle is closed after reading.
            with open(file) as fh:
                stored_items = json.load(fh)
            s = PersistentSet(frozenset(a) if isinstance(a, list) else a for a in stored_items)
        else:
            s = PersistentSet()
        s.set_file(file)
        return s

    def set_file (self, file):
        """Remember `file` as the path that the persist_* methods write to."""
        self.file = file

    def _save (self):
        """Overwrite the backing file with the current contents of the set."""
        # Closing the handle via `with` ensures the data is flushed to disk
        # before the call returns.
        with open(self.file, 'w') as fh:
            json.dump(list(self), fh, iterable_as_array=True)

    def persist_add (self, item):
        """Add `item` to the set and rewrite the backing file."""
        self.add(item)
        self._save()

    def persist_remove (self, item):
        """Remove `item` (KeyError if absent, as with set.remove) and rewrite the backing file."""
        self.remove(item)
        self._save()

In [None]:
# Hand-made labels, each bound to a JSON file under `handmade_dir`.
# A missing file yields an empty set; the file is written by later persist_add / persist_remove calls.
# The labelling widget below stores frozenset pairs of phashes in the first three sets
# and single phashes in `invalid_image_phashes`.
duplicated_image_phash_pairs = PersistentSet.load_set(handmade_dir/'duplicated_image_phash_pairs.json')
not_duplicated_image_phash_pairs = PersistentSet.load_set(handmade_dir/'not_duplicated_image_phash_pairs.json')
related_image_phash_pairs = PersistentSet.load_set(handmade_dir/'related_image_phash_pairs.json')
invalid_image_phashes = PersistentSet.load_set(handmade_dir/'invalid_image_phashes.json')

In [None]:
# Overrides are applied in this order, so for a pair that appears in several sets
# the last call wins: duplicates (from post data, then hand-labelled) are set to 0,
# then hand-labelled non-duplicates are set to 60.
set_distance_pairs(duplicated_post_image_phash_pairs, 0)
set_distance_pairs(duplicated_image_phash_pairs, 0)
set_distance_pairs(not_duplicated_image_phash_pairs, 60)

# Separate matrix for "related" pairs: every entry starts at 60 and pairs known
# to be related (same album, or hand-labelled) are set to 0.
related_distance = np.full(distance.shape, 60)
set_distance_pairs(related_album_image_phash_pairs, 0, mat=related_distance)
set_distance_pairs(related_image_phash_pairs, 0, mat=related_distance)

In [None]:
def make_dedup_box (idx_x, idx_y, default=None):
    """Build a labelling widget for the image pair at positions `idx_x` / `idx_y`.

    The widget shows a column of buttons next to the two images. Each button
    records a decision in one of the persistent sets
    (`duplicated_image_phash_pairs`, `not_duplicated_image_phash_pairs`,
    `related_image_phash_pairs`, `invalid_image_phashes`) and prints a short
    confirmation into the widget's output area.

    Parameters
    ----------
    idx_x, idx_y
        Positions in `imageMetas` of the two images to compare.
    default
        When 'no', the pair is recorded as "not duplicated" as soon as the box
        is built; the "Duplicated" and "Related" buttons then withdraw that
        record before adding their own.

    Returns
    -------
    HBox
        The buttons/output column followed by the two image widgets.
    """
    image_x = imageMetas[idx_x]
    phash_x = image_x['phash']
    image_y = imageMetas[idx_y]
    phash_y = image_y['phash']
    # Unordered pair, so (x, y) and (y, x) map to the same label entry.
    hash_pair = frozenset([phash_x, phash_y])

    yes_btn = widgets.Button(description="Duplicated", button_style='success')
    no_btn = widgets.Button(description="Not", button_style='info')
    related_btn = widgets.Button(description="Related", button_style='warning')
    invalid_x_btn = widgets.Button(description="X Invalid")
    invalid_y_btn = widgets.Button(description="Y Invalid")
    reset_btn = widgets.Button(description="Reset")
    output = widgets.Output()

    def read_image_bytes (meta):
        # Path.read_bytes() opens, reads and closes the file, so no handle is left open.
        return Path(meta['file_path']).read_bytes()

    def on_yes (btn):
        with output:
            # Withdraw the automatic "not duplicated" record made for default == 'no'.
            if default == 'no' and hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            duplicated_image_phash_pairs.persist_add(hash_pair)
            print('Duplicated')

    def on_no (btn):
        not_duplicated_image_phash_pairs.persist_add(hash_pair)
        with output:
            print('Not')

    def on_related (btn):
        with output:
            # Withdraw the automatic "not duplicated" record made for default == 'no'.
            if default == 'no' and hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            related_image_phash_pairs.persist_add(hash_pair)
            print('Related')

    def on_invalid_x (btn):
        invalid_image_phashes.persist_add(phash_x)
        with output:
            print('Invalid X')

    def on_invalid_y (btn):
        invalid_image_phashes.persist_add(phash_y)
        with output:
            print('Invalid Y')

    def on_reset (btn):
        # Remove every label that currently involves this pair or either image.
        with output:
            if hash_pair in duplicated_image_phash_pairs:
                duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Duplicated')
            if hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            if hash_pair in related_image_phash_pairs:
                related_image_phash_pairs.persist_remove(hash_pair)
                print('-Related')
            if phash_x in invalid_image_phashes:
                invalid_image_phashes.persist_remove(phash_x)
                print('-Invalid X')
            if phash_y in invalid_image_phashes:
                invalid_image_phashes.persist_remove(phash_y)
                print('-Invalid Y')
            print('Reset')

    yes_btn.on_click(on_yes)
    no_btn.on_click(on_no)
    related_btn.on_click(on_related)
    invalid_x_btn.on_click(on_invalid_x)
    invalid_y_btn.on_click(on_invalid_y)
    reset_btn.on_click(on_reset)

    # Pre-label the pair as "not duplicated"; the user can still override it with the buttons.
    if default == 'no':
        on_no(None)

    return HBox([VBox([yes_btn, no_btn, related_btn, invalid_x_btn, invalid_y_btn, reset_btn, output]),
                 widgets.Image(value=read_image_bytes(image_x), width=250, height=150),
                 widgets.Image(value=read_image_bytes(image_y), width=250, height=150)])

In [None]:
# interested_phashes = set()

In [None]:
# def potential_duplicates_high (threshold):
#     for i in range(distance.shape[0]):
#         for j in range(i):
#             if distance[i, j] >= threshold:
#                 phash_pair = frozenset([imageMetas[i]['phash'], imageMetas[j]['phash']])
#                 if (phash_pair in duplicated_image_phash_pairs):
#                     interested_phashes.add(imageMetas[i]['phash'])
#                     interested_phashes.add(imageMetas[j]['phash'])
#                     yield (i, j)

In [None]:
# pduph = potential_duplicates_high(10)

In [None]:
# for i in range(100):
#     try:
#         next_pdup = next(pduph)
#     except StopIteration:
#         print('StopIteration')
#         break

#     idx_x, idx_y = next_pdup
#     image_x = imageMetas[idx_x]
#     image_y = imageMetas[idx_y]
#     print(f"{idx_x} {idx_y} {distance[idx_x, idx_y]} {image_x['phash']} {image_y['phash']} {image_x['width']} {image_y['width']} {image_x['image_id']} {image_y['image_id']}")
# #     display(make_dedup_box(idx_x, idx_y))

In [None]:
# interested_phashes.remove('bf1ed0e0c49cb16a')

In [None]:
# def potential_duplicates_interested (threshold):
#     for i in range(distance.shape[0]):
#         for j in range(i):
#             if distance[i, j] <= threshold:
#                 phash_pair = frozenset([imageMetas[i]['phash'], imageMetas[j]['phash']])
#                 if (imageMetas[i]['phash'] in interested_phashes or
#                     imageMetas[j]['phash'] in interested_phashes) and (
#                     phash_pair not in duplicated_image_phash_pairs_auto and
#                     phash_pair not in duplicated_post_image_phash_pairs and
#                     phash_pair not in duplicated_image_phash_pairs and
#                     phash_pair not in not_duplicated_image_phash_pairs and
#                     phash_pair not in related_album_image_phash_pairs and
#                     phash_pair not in related_image_phash_pairs):
#                     yield (i, j)

In [None]:
# pdupi = potential_duplicates_interested(18)

In [None]:
# for i in range(10):
#     try:
#         next_pdup = next(pdupi)
#     except StopIteration:
#         print('StopIteration')
#         break

#     idx_x, idx_y = next_pdup
#     image_x = imageMetas[idx_x]
#     image_y = imageMetas[idx_y]
#     print(f"{idx_x} {idx_y} {distance[idx_x, idx_y]} {image_x['phash']} {image_y['phash']} {image_x['width']} {image_y['width']} {image_x['image_id']} {image_y['image_id']}")
#     display(make_dedup_box(idx_x, idx_y))

In [None]:
def potential_duplicates (threshold):
    """Yield (i, j) index pairs (j < i) that are within `threshold` and not yet decided.

    A pair is skipped when its unordered phash pair is already present in any
    of the automatic, post-derived, album-derived or hand-made pair sets.
    The sets are consulted at the moment each pair is examined, so labels added
    while iterating are taken into account.
    """
    for row in range(distance.shape[0]):
        for col in range(row):
            if not (distance[row, col] <= threshold):
                continue
            candidate = frozenset([imageMetas[row]['phash'], imageMetas[col]['phash']])
            already_decided = any(
                candidate in decided_pairs
                for decided_pairs in (
                    duplicated_image_phash_pairs_auto,
                    duplicated_post_image_phash_pairs,
                    duplicated_image_phash_pairs,
                    not_duplicated_image_phash_pairs,
                    related_album_image_phash_pairs,
                    related_image_phash_pairs,
                )
            )
            if not already_decided:
                yield (row, col)

In [None]:
# Largest distance[i, j] for which a pair is still offered as a potential duplicate.
distance_threshold = 12

In [None]:
# Generator of candidate (i, j) pairs; nothing is computed until it is advanced with next(pdup).
pdup = potential_duplicates(distance_threshold)

In [None]:
# for i in range(10):
#     try:
#         next_pdup = next(pdup)
#     except StopIteration:
#         print('StopIteration')
#         break

#     idx_x, idx_y = next_pdup
#     image_x = imageMetas[idx_x]
#     image_y = imageMetas[idx_y]
#     print(f"{idx_x} {idx_y} {distance[idx_x, idx_y]} {image_x['phash']} {image_y['phash']} {image_x['width']} {image_y['width']} {image_x['image_id']} {image_y['image_id']}")
#     display(make_dedup_box(idx_x, idx_y, default=None if distance[idx_x, idx_y] < 9 else 'no'))