In [None]:
# !pip install simplejson

In [2]:
from pymongo import MongoClient
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import simplejson as json
import itertools
from functools import cmp_to_key
import networkx as nx

from IPython.display import display, Image, JSON
from ipywidgets import widgets, Image, HBox, VBox, Button, ButtonStyle, Layout, Box

from lib.image_dedup import make_hashes, calculate_distance, hashes_diff
from lib.PersistentSet import PersistentSet
from lib.sort_things import post_score, sort_posts, sort_images
from lib.parallel import parallel

In [3]:
# Locations on disk, relative to the notebook's working directory.
images_dir = Path('../images')
# Folder that holds the JSON-backed label sets used throughout this notebook;
# it is created here if it does not exist yet.
handmade_dir = Path('./handmade')
handmade_dir.mkdir(exist_ok=True)

In [4]:
# Read the connection URI from the local credentials file. The context manager
# closes the file handle right away instead of leaving it open for the kernel's lifetime.
with open('./credentials/mongodb_credentials.json') as credentials_file:
    mongo_uri = json.load(credentials_file)['uri']
mongo = MongoClient(mongo_uri)
db = mongo['bad-vis']
# Collections used below: `imagemeta` is the input, `imagededup` is the output
# that this notebook rebuilds.
posts = db['posts']
imagefiles = db['imagefiles']
imagemeta = db['imagemeta']
imagededup = db['imagededup']

In [5]:
# Rebuild the working collection from scratch: drop `imagededup`, then copy
# every `imagemeta` document into it one at a time.
# NOTE: destructive -- any previous content of `imagededup` is discarded.
imagededup.drop()
for i in imagemeta.find():
    imagededup.insert_one(i)

# Load image metadata

In [6]:
# In-memory snapshot of all imagemeta documents ordered by image_id; the integer
# indices used in the rest of the notebook are positions in this list.
imageDedup = sorted(imagemeta.find(), key=lambda meta: meta['image_id'])

In [7]:
# phash -> list of positions in imageDedup that carry this phash
# (several images may share one phash).
phash_to_idx_mapping = {}
for idx, meta in enumerate(imageDedup):
    phash_to_idx_mapping.setdefault(meta['phash'], []).append(idx)

def phash_to_idx (phash):
    """Return the list of imageDedup positions for `phash`, or None if it is unknown."""
    return phash_to_idx_mapping.get(phash)

In [8]:
# image_id -> position of that image in imageDedup
image_id_to_idx_mapping = {meta['image_id']: idx for idx, meta in enumerate(imageDedup)}

def image_id_to_idx (image_id):
    """Return the imageDedup position of `image_id`, or None if it is unknown."""
    return image_id_to_idx_mapping.get(image_id)

# Calculate distance

## Hash distance

In [9]:
# One make_hashes() result per metadata record, in the same order as imageDedup.
image_hashes = [make_hashes(m) for m in imageDedup]

In [10]:
# distance = calculate_distance(image_hashes)
# Distance computed from the phash only (the commented-out call above omits hash_type).
# The result is indexed below as distance[i, j] with i, j being positions in imageDedup.
distance = calculate_distance(image_hashes, hash_type='phash')

HBox(children=(FloatProgress(value=0.0, max=2601.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10449.0), HTML(value='')))




In [11]:
# distance2 = np.ndarray([len(image_hashes), len(image_hashes)])
# for i in tqdm(range(len(image_hashes))):
#     for j in range(i+1):
#         diff = hashes_diff(image_hashes[i], image_hashes[j])
#         distance2[i, j] = diff
#         distance2[j, i] = diff
# np.array_equal(distance, distance2)

In [12]:
# pdistance = calculate_distance(image_hashes, hash_type='phash')

## Find duplicated pairs from distance matrix

In [13]:
def set_distance (hashes, value, mat=distance):
    """Overwrite the distance between all images that carry the given phash(es).

    Parameters
    ----------
    hashes : sequence of one or two phash strings. A single phash means
        "this phash against itself".
    value : number written into the matrix.
    mat : 2-D array indexed by imageDedup position, modified in place.
        The default is the `distance` object that existed when this function
        was defined.

    Every (i, j) combination of images carrying the two phashes is updated
    symmetrically. Nothing happens if either phash is not present in the
    current metadata.
    """
    phash_x = hashes[0]
    phash_y = phash_x if len(hashes) == 1 else hashes[1]
    idx_x = phash_to_idx(phash_x)
    idx_y = phash_to_idx(phash_y)
    # Identity test against None: a label file may mention phashes that are no
    # longer part of the loaded metadata.
    if idx_x is None or idx_y is None:
        return
    for i, j in itertools.product(idx_x, idx_y):
        mat[i, j] = value
        mat[j, i] = value

def set_distance_pairs (phash_pairs, value, mat=distance):
    """Apply set_distance() with `value` to every phash pair of `phash_pairs`.

    Each pair is an iterable of one or two phashes (the sets used in this
    notebook hold frozensets); `mat` is forwarded unchanged.
    """
    for pair in phash_pairs:
        set_distance([*pair], value, mat=mat)

In [14]:
# Phash pairs detected automatically from the distance matrix; the set is
# associated with a JSON file inside handmade_dir.
auto_duplicated_image_phash_pairs = PersistentSet()
auto_duplicated_image_phash_pairs.set_file(handmade_dir/'auto_duplicated_image_phash_pairs.json')

In [15]:
# Walk the lower triangle of the distance matrix and record every pair whose
# distance is at most 1 as an automatic duplicate.
for row in tqdm(range(distance.shape[0])):
    for col in range(row):
        if distance[row, col] <= 1: # checked, all distance <= 1 are duplicated
            pair = frozenset((imageDedup[row]['phash'], imageDedup[col]['phash']))
            auto_duplicated_image_phash_pairs.add(pair)

HBox(children=(FloatProgress(value=0.0, max=10449.0), HTML(value='')))




In [16]:
# for i in tqdm(range(pdistance.shape[0])):
#     for j in range(i):
#         if pdistance[i, j] <= 1: # checked, all distance <= 1 are duplicated
#             auto_duplicated_image_phash_pairs.add(frozenset([imageDedup[i]['phash'], imageDedup[j]['phash']]))

In [17]:
# Persist the automatically detected pairs (presumably to the JSON file set above -- see lib.PersistentSet).
auto_duplicated_image_phash_pairs.save()

## Apply information from metadata

In [18]:
# Derive duplicate image pairs from posts that are flagged as duplicates of
# each other. Only the unambiguous case is used: the post has at most one
# image and all its duplicated posts together have at most one distinct phash.
duplicated_post_image_phash_pairs = PersistentSet()
duplicated_post_image_phash_pairs.set_file(handmade_dir/'duplicated_post_image_phash_pairs.json')

for post in tqdm(posts.find()):
    dup_post_ids = post.get('duplicated_posts', [])
    if len(dup_post_ids) == 0:
        continue

    # Distinct phashes of all images attached to any of the duplicated posts.
    dup_phashes = set()
    for dup_post_id in dup_post_ids:
        for meta in imagemeta.find({'post_id': dup_post_id}):
            dup_phashes.add(meta['phash'])
    if len(dup_phashes) > 1:
        # Ambiguous: the duplicated posts carry more than one distinct image.
        continue

    own_phashes = [meta['phash'] for meta in imagemeta.find({'post_id': post['post_id']})]
    if len(own_phashes) > 1:
        # Ambiguous: this post itself has more than one image.
        continue

    for dup_phash in dup_phashes:
        for own_phash in own_phashes:
            # Identical phashes would collapse to a one-element set; skip them.
            if dup_phash != own_phash:
                duplicated_post_image_phash_pairs.add(frozenset((dup_phash, own_phash)))

duplicated_post_image_phash_pairs.save()

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [19]:
# Images that belong to the same album are recorded as related
# (every unordered pair of distinct phashes within an album).
related_album_image_phash_pairs = PersistentSet()
related_album_image_phash_pairs.set_file(handmade_dir/'related_album_image_phash_pairs.json')

for album in tqdm({i['album'] for i in imagemeta.find({'album': {'$exists': True, '$ne': ''}})}):
    ra_phashes = [i['phash'] for i in imagemeta.find({'album': album})]
    if len(ra_phashes) <= 1:
        print(f"Only 1 or less image {album} {ra_phashes}")
        # No pair can be formed from fewer than two images.
        continue

    # combinations() visits each unordered pair once, whereas product(x, x)
    # visits it twice and also yields every self-pair; the set of recorded
    # pairs is the same.
    for s in itertools.combinations(ra_phashes, 2):
        fs = frozenset(s)
        # Two images of an album may share a phash; such a pair collapses to one element.
        if len(fs) > 1:
            related_album_image_phash_pairs.add(fs)

related_album_image_phash_pairs.save()

HBox(children=(FloatProgress(value=0.0, max=105.0), HTML(value='')))

Only 1 or less image 2ophbe ['8c7233d364cc6673']



## Apply manually labeled data

In [20]:
# Label sets loaded from JSON files in handmade_dir. The review widget defined
# further down adds to and removes from these sets via persist_add / persist_remove.
duplicated_image_phash_pairs = PersistentSet.load_set(handmade_dir/'duplicated_image_phash_pairs.json')
not_duplicated_image_phash_pairs = PersistentSet.load_set(handmade_dir/'not_duplicated_image_phash_pairs.json')
related_image_phash_pairs = PersistentSet.load_set(handmade_dir/'related_image_phash_pairs.json')
invalid_image_phashes = PersistentSet.load_set(handmade_dir/'invalid_image_phashes.json')

In [21]:
# Fold all labels into the distance matrix. The order of the calls matters:
# each call overwrites the cells of its pairs, so for a pair present in several
# sets the LAST call wins (a "not duplicated" / "related" label overrides a
# "duplicated" one).
# 0 makes a pair pass the `distance <= 1` test used for consolidation below;
# 60 makes it fail that test (presumably larger than any real hash distance -- TODO confirm).
set_distance_pairs(auto_duplicated_image_phash_pairs, 0)
set_distance_pairs(duplicated_post_image_phash_pairs, 0)
set_distance_pairs(duplicated_image_phash_pairs, 0)
set_distance_pairs(not_duplicated_image_phash_pairs, 60)
set_distance_pairs(related_album_image_phash_pairs, 60)
set_distance_pairs(related_image_phash_pairs, 60)

# Separate matrix for the "related" relation: every cell starts at 60 and only
# the related pairs are set to 0.
related_distance = np.full(distance.shape, 60)
set_distance_pairs(related_album_image_phash_pairs, 0, mat=related_distance)
set_distance_pairs(related_image_phash_pairs, 0, mat=related_distance)

# Human in the Loop

In [22]:
def make_dedup_box (idx_x, idx_y, default=None):
    """Build the review widget for one candidate pair of images.

    Parameters
    ----------
    idx_x, idx_y : positions of the two images in imageDedup.
    default : None, 'yes' or 'no'. 'yes' / 'no' applies the corresponding
        label immediately (same effect as clicking the button) before the
        widget is returned; any other value applies nothing.

    Returns
    -------
    An HBox with the label buttons plus an output area on the left and the
    two images next to it.

    Button effects (all changes are written through persist_add /
    persist_remove of the module-level label sets):
      Duplicated - removes a "not duplicated" label if present, adds "duplicated".
      Not        - removes a "duplicated" label if present, adds "not duplicated".
      Related    - removes a "not duplicated" label if present, adds "related".
      X/Y Invalid- adds the phash of the respective image to the invalid set.
      Reset      - removes the pair / phashes from every manual set.
    """
    image_x = imageDedup[idx_x]
    phash_x = image_x['phash']
    image_y = imageDedup[idx_y]
    phash_y = image_y['phash']
    hash_pair = frozenset([phash_x, phash_y])

    yes_btn = widgets.Button(description="Duplicated", button_style='success')
    no_btn = widgets.Button(description="Not", button_style='info')
    related_btn = widgets.Button(description="Related", button_style='warning')
    invalid_x_btn = widgets.Button(description="X Invalid")
    invalid_y_btn = widgets.Button(description="Y Invalid")
    reset_btn = widgets.Button(description="Reset")
    output = widgets.Output()

    def read_image_bytes (path):
        # Read the whole file and close the handle right away; a bare
        # open(...).read() would leave one open file per displayed image.
        with open(path, 'rb') as image_file:
            return image_file.read()

    def on_yes (btn):
        with output:
            if hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            duplicated_image_phash_pairs.persist_add(hash_pair)
            print('Duplicated')

    def on_no (btn):
        with output:
            if hash_pair in duplicated_image_phash_pairs:
                duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Duplicated')
            not_duplicated_image_phash_pairs.persist_add(hash_pair)
            print('Not')

    def on_related (btn):
        with output:
            if hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            related_image_phash_pairs.persist_add(hash_pair)
            print('Related')

    def on_invalid_x (btn):
        invalid_image_phashes.persist_add(phash_x)
        with output:
            print('Invalid X')

    def on_invalid_y (btn):
        invalid_image_phashes.persist_add(phash_y)
        with output:
            print('Invalid Y')

    def on_reset (btn):
        with output:
            if hash_pair in duplicated_image_phash_pairs:
                duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Duplicated')
            if hash_pair in not_duplicated_image_phash_pairs:
                not_duplicated_image_phash_pairs.persist_remove(hash_pair)
                print('-Not')
            if hash_pair in related_image_phash_pairs:
                related_image_phash_pairs.persist_remove(hash_pair)
                print('-Related')
            if phash_x in invalid_image_phashes:
                invalid_image_phashes.persist_remove(phash_x)
                print('-Invalid X')
            if phash_y in invalid_image_phashes:
                invalid_image_phashes.persist_remove(phash_y)
                print('-Invalid Y')
            print('Reset')

    yes_btn.on_click(on_yes)
    no_btn.on_click(on_no)
    related_btn.on_click(on_related)
    invalid_x_btn.on_click(on_invalid_x)
    invalid_y_btn.on_click(on_invalid_y)
    reset_btn.on_click(on_reset)

    # Pre-apply a label without waiting for a click; the handlers ignore their argument.
    if default == 'no':
        on_no(None)
    elif default == 'yes':
        on_yes(None)

    return HBox([VBox([yes_btn, no_btn, related_btn, invalid_x_btn, invalid_y_btn, reset_btn, output]),
                 widgets.Image(value=read_image_bytes(image_x['file_path']), width=250, height=150),
                 widgets.Image(value=read_image_bytes(image_y['file_path']), width=250, height=150)])

In [23]:
def potential_duplicates (threshold):
    """Yield (i, j) index pairs, j < i, that still need a manual decision.

    A pair qualifies when its entry in `distance` is at most `threshold` and
    its phash pair is in none of the label sets (automatic, post-based,
    manual duplicated / not duplicated, album-related, manually related).
    """
    n_images = distance.shape[0]
    for row in range(n_images):
        for col in range(row):
            if not (distance[row, col] <= threshold):
                continue
            candidate = frozenset([imageDedup[row]['phash'], imageDedup[col]['phash']])
            already_labelled = (
                candidate in auto_duplicated_image_phash_pairs
                or candidate in duplicated_post_image_phash_pairs
                or candidate in duplicated_image_phash_pairs
                or candidate in not_duplicated_image_phash_pairs
                or candidate in related_album_image_phash_pairs
                or candidate in related_image_phash_pairs
            )
            if not already_labelled:
                yield (row, col)

In [24]:
# Largest distance at which an unlabelled pair is still queued for manual review.
distance_threshold = 10

In [25]:
# Generator over the pairs awaiting review; re-running this cell starts a new scan from the beginning.
pdup = potential_duplicates(distance_threshold)

In [26]:
# Show up to 10 review widgets per run of this cell; re-run it to get the next batch.
for i in range(10):
    try:
        next_pdup = next(pdup)
    except StopIteration:
        # Nothing left to review at the current threshold.
        print('StopIteration')
        break

    idx_x, idx_y = next_pdup
    image_x = imageDedup[idx_x]
    image_y = imageDedup[idx_y]
    print(f"{idx_x} {idx_y} {distance[idx_x, idx_y]} {image_x['phash']} {image_y['phash']} {image_x['width']} {image_y['width']} {image_x['image_id']} {image_y['image_id']}")
    # Pairs at distance >= 6 are pre-labelled "Not" as soon as the widget is built
    # (make_dedup_box applies the default immediately); closer pairs get no default.
    display(make_dedup_box(idx_x, idx_y, default=None if distance[idx_x, idx_y] < 6 else 'no'))
    # display(make_dedup_box(idx_x, idx_y, default='yes' if distance[idx_x, idx_y] < 9 else 'no'))

StopIteration


# Visually check images

## Images with high variability

In [27]:
# interested_phashes = set()

In [28]:
# def potential_duplicates_high (threshold):
#     for i in range(distance.shape[0]):
#         for j in range(i):
#             if distance[i, j] >= threshold:
#                 phash_pair = frozenset([imageDedup[i]['phash'], imageDedup[j]['phash']])
#                 if (phash_pair in duplicated_image_phash_pairs):
#                     interested_phashes.add(imageDedup[i]['phash'])
#                     interested_phashes.add(imageDedup[j]['phash'])
#                     yield (i, j)

In [29]:
# pduph = potential_duplicates_high(13)

In [30]:
# for i in range(100):
#     try:
#         next_pdup = next(pduph)
#     except StopIteration:
#         print('StopIteration')
#         break

#     idx_x, idx_y = next_pdup
#     image_x = imageDedup[idx_x]
#     image_y = imageDedup[idx_y]
#     print(f"{idx_x} {idx_y} {distance[idx_x, idx_y]} {image_x['phash']} {image_y['phash']} {image_x['width']} {image_y['width']} {image_x['image_id']} {image_y['image_id']}")
#     display(make_dedup_box(idx_x, idx_y))

In [31]:
# invalid_image_phashes = set(json.load(open('handmade/invalid_image_phashes.json')))

In [32]:
# examined_images = [
#     'reddit/dataisugly/2o08rl_0', # manually downloaded
#     'reddit/dataisugly/2nwubr_0', # manually downloaded
#     'reddit/dataisugly/beivt8_0', # manually downloaded
#     'reddit/dataisugly/683b4i_0', # manually downloaded
#     'reddit/dataisugly/3zcw30_0', # manually downloaded
#     'reddit/dataisugly/1oxrh5_0', # manually downloaded a higher resolution image
#     'reddit/dataisugly/3or2g0_0', # manually downloaded
#     'reddit/dataisugly/5iobqn_0', # manually downloaded
#     'reddit/dataisugly/29fpuo_0', # manually downloaded
#     'reddit/dataisugly/5xux1f_0', # manually downloaded
#     'reddit/dataisugly/35lrw1_0', # manually downloaded
#     'reddit/dataisugly/1bxhv2_0', # manually downloaded a higher resolution image
#     'reddit/dataisugly/3peais_0', # manually downloaded
#     'reddit/dataisugly/2vdk71_0', # manually downloaded
#     'reddit/dataisugly/6b8w73_0', # manually downloaded
#     'reddit/dataisugly/2w8pnr_0', # manually downloaded an image with more context
#     'reddit/dataisugly/2dt19h_0', # manually downloaded
#     'reddit/dataisugly/31tj8a_0', # manually downloaded
#     'reddit/dataisugly/30smxr_0', # manually downloaded
#     'reddit/dataisugly/30dbx6_0', # manually downloaded
#     'reddit/dataisugly/561ytm_0', # manually downloaded
#     'reddit/dataisugly/6q4tre_0', # manually downloaded
#     'reddit/dataisugly/3icm4g_0', # manually downloaded
#     'reddit/dataisugly/6z5v98_0', # manually downloaded
#     'reddit/dataisugly/5fucjm_0', # manually downloaded
#     'reddit/dataisugly/99bczz_0', # manually downloaded
#     'reddit/dataisugly/2662wv_0', # manually downloaded
#     'reddit/dataisugly/26otpi_0', # manually downloaded a higher resolution image
#     'reddit/dataisugly/68scgb_0', # manually downloaded
#     'reddit/dataisugly/et75qp_0', # manually downloaded
#     'reddit/dataisugly/4c9zc1_0', # manually downloaded an image with more context
#     'reddit/dataisugly/2525a5_0', # manually downloaded more images, but does not matched with the one with more context
#     'reddit/dataisugly/2la7zt_0', # thumbnail alt
# ]

## Invalid images

In [33]:
# invalids = []
# for h in invalid_image_phashes:
#     invalid_images = [f for f in imagefiles.find({'phash': h})]
#     if len(invalid_images) > 0:
#         invalids.append(invalid_images[0])

# display(Box([widgets.Image(value=open(i['file_path'], 'rb').read(), width=100, height=100) for i in invalids],
#             layout=Layout(display='flex', flex_flow='row wrap')))

# Consolidate

## Related images

In [34]:
# Groups of mutually related images: connected components (with more than one
# member) of the graph whose edges are the cells of related_distance that are <= 1.
related_images = []
for component in nx.components.connected_components(nx.Graph(related_distance <= 1)):
    if len(component) > 1:
        related_images.append([imageDedup[idx]['image_id'] for idx in component])
len(related_images)

139

In [35]:
# For every image of a related group, store the ids of the other group members
# (merged with any related ids the metadata record already carries).
for ids in related_images:
    for i in ids:
        imageMeta = imageDedup[image_id_to_idx(i)]
        # sorted() gives the stored list a stable order; plain set iteration order is not stable across runs.
        ri = sorted(r for r in set(imageMeta.get('related_images', []) + ids) if r != i)
        # Keep the in-memory record in sync with the database. ImageDedup.related_images
        # reads the imageDedup records, and dedup_image() later replaces the stored
        # document with ImageDedup.digest(), so a database-only update would be lost.
        imageMeta['related_images'] = ri
        imagededup.update_one({'image_id': i}, {'$set': {'related_images': ri}})

## Duplicated images

In [36]:
# Phashes that ImageDedup.main_image must not pick as the representative of a duplicate group.
excluding_image_phashes = PersistentSet.load_set(handmade_dir/'excluding_image_phashes.json')

In [37]:
# Register the phashes that are excluded from main-image selection.
for excluded_phash in (
    'c13e3ae10e70fd86',
    'fe81837a94e3807e',
    'af9da24292fae149',
    'ad87d2696738ca4c',
    'd25264dfa9659392',
    '964e3b3160e14f8f',
):
    excluding_image_phashes.persist_add(excluded_phash)

In [38]:
class ImageDedup ():
    """Consolidated record for one group of duplicated images.

    The instance is built from the metadata documents of all images in the
    group. It looks up the posts those images belong to (plus the posts listed
    in their 'duplicated_posts'), picks a main image and a main post, and
    copies their fields onto itself as attributes. Post fields named
    'preview', 'thumbnail', 'external_link', 'archive' and 'manual' are stored
    as '<name>_url'. Fields of the main post overwrite same-named fields of the
    main image because they are copied second.

    Relies on module-level names: `posts` (collection), `sort_images`,
    `sort_posts`, `post_score` and `excluding_image_phashes`.
    """

    # Attribute names exported by digest(), in output order (each name once).
    _attrs = [
        'id',
        'post_id',
        'datetime',
        'url',
        'title',
        'content',
        'author',
        'removed',
        'ups',
        'num_comments',
        'external_link',
        'source',
        'source_platform',
        'source_url',
        'tags',
        'labels',
        'media_type',
        'thumbnail_url',
        'preview_url',
        'external_link_url',
        'archive_url',

        'thumbnail',
        'preview',
        'archive',
        'manual',

        'image_id',
        'short_image_id',
        'album',
        'index_in_album',
        'image_type',
        'file_path',
        'ext',
        'animated',
        'size',
        'width',
        'height',
        'pixels',
        'image_order',
        'ahash',
        'phash',
        'pshash',
        'dhash',
        'whash',

        'duplicated_posts',
        'related_images',
        'duplicated_images',
        'popularity_score'
    ]

    # Names implemented as read-only properties below. A same-named key in a
    # metadata or post document is never copied onto the instance, because
    # setattr() on a property without a setter raises AttributeError.
    _derived_attrs = (
        'duplicated_posts',
        'related_images',
        'duplicated_images',
        'popularity_score',
        'main_post',
        'main_image',
    )

    # Post fields that are stored on the instance under '<name>_url'.
    _post_url_fields = ('preview', 'thumbnail', 'external_link', 'archive', 'manual')

    def __init__ (self, imageMetas=None):
        """Build the record from the metadata documents of one duplicate group.

        Raises Exception when `imageMetas` is missing or empty.
        """
        # print(imageMetas)
        if imageMetas is None or len(imageMetas) == 0:
            raise Exception('Empty imageFiles array.')

        self._imageMetas = imageMetas
        self._image_ids = [i['image_id'] for i in imageMetas]
        self._image_order = sort_images(self._imageMetas)

        self._post_ids = {i['post_id'] for i in imageMetas}
        self._posts = [posts.find_one({'post_id': i}) for i in self._post_ids]

        # Add the posts referenced through 'duplicated_posts'. Each post id is
        # fetched at most once so that no post is counted twice in popularity_score.
        known_post_ids = set(self._post_ids)
        dpost = []
        for p in self._posts:
            if p is None:
                # The post of an image was not found; reported below.
                continue
            for i in p.get('duplicated_posts', []):
                if i not in known_post_ids:
                    known_post_ids.add(i)
                    dpost.append(posts.find_one({'post_id': i}))
        self._posts += dpost
        if None in self._posts:
            # Diagnostic for posts missing from the collection; they are dropped
            # because every later step subscripts the post documents.
            print(self._post_ids)
            self._posts = [p for p in self._posts if p is not None]
        self._post_order = sort_posts(self._posts)

        for k, v in self.main_image.items():
            if k in ImageDedup._derived_attrs:
                continue
            setattr(self, k, v)

        for k, v in self.main_post.items():
            if k in ImageDedup._derived_attrs:
                continue
            if k in ImageDedup._post_url_fields:
                setattr(self, f"{k}_url", v)
            else:
                setattr(self, k, v)

    def digest (self):
        """Return a dict of every name in _attrs that this instance provides."""
        return {a:getattr(self, a) for a in ImageDedup._attrs if hasattr(self, a)}

    @property
    def duplicated_posts (self):
        """Ids of all posts of the group and their duplicated posts, except the main post's id."""
        post_ids = self._post_ids.union(*[set(p.get('duplicated_posts', [])) for p in self._posts])
        return [i for i in post_ids if i != self.post_id]

    @property
    def duplicated_images (self):
        """Image ids of the group other than the main image's id."""
        return [i for i in self._image_ids if i != self.image_id]

    @property
    def related_images (self):
        """'related_images' entries of all group members, except the main image's id."""
        return [ri for i in self._imageMetas for ri in i.get('related_images', []) if ri != self.image_id]

    @property
    def main_post (self):
        """First post in the order produced by sort_posts()."""
#         if len(self._post_order) > 1 and self._post_order[0]['source_platform'] != 'reddit':
#             print(f"main post warning: {[p['post_id'] for p in self._post_order]}")
        return self._post_order[0]

    @property
    def popularity_score (self):
        """Sum of post_score() over the posts whose 'source' is 'dataisugly'."""
        return sum(post_score(p) for p in self._posts if p['source'] == 'dataisugly')

    @property
    def main_image (self):
        """First image in sort_images() order whose phash is not in excluding_image_phashes.

        Raises IndexError when every image of the group is excluded.
        """
#         if len(self._image_order) > 1 and self._image_order[0]['source_platform'] != 'reddit':
#             print(f"main image warning: {[i['image_id'] for i in self._image_order]}")
        for i in self._image_order:
            if i['phash'] not in excluding_image_phashes:
                return i
        raise IndexError(f"All images of the group are excluded: {self._image_ids}")

In [39]:
# One list of image ids per connected component of the graph whose edges are
# the cells of `distance` that are <= 1. No size filter is applied here.
duplicated_images = [list({imageDedup[idx]['image_id'] for idx in component})
                     for component in nx.components.connected_components(nx.Graph(distance <= 1))]

In [40]:
# imageDedup[image_id_to_idx('reddit/AusFinance/fman6b_0')]

In [41]:
def dedup_image (ids):
    """Collapse one duplicate group into a single `imagededup` document.

    `ids` are the image ids of the group. The documents of all non-main images
    are deleted and the main image's document is replaced (or inserted) with
    the group's digest. Returns the ImageDedup instance that was built.
    """
    group_metas = [imageDedup[image_id_to_idx(image_id)] for image_id in set(ids)]
    merged = ImageDedup(group_metas)
    for duplicate_id in merged.duplicated_images:
        imagededup.delete_one({'image_id': duplicate_id})
    imagededup.replace_one({'image_id': merged.image_id}, merged.digest(), upsert=True)
    return merged

In [42]:
# Run dedup_image over every duplicate group via lib.parallel
# (n_jobs=-1 presumably means "use all available workers" -- confirm in lib.parallel).
imagedds = parallel(dedup_image, duplicated_images, n_jobs=-1)

HBox(children=(FloatProgress(value=0.0, max=8894.0), HTML(value='')))




In [43]:
# duplicated_image_ids = [c
#                      for c in nx.components.connected_components(nx.Graph(distance <= 1))
#                      if len(c) > 1]
# start = 0

In [44]:
# # len(duplicated_image_ids)
# cnt = 0
# end = start + 50
# for idxs in duplicated_image_ids:
# #     print(f"{[imageDedup[i]['image_id'] for i in idxs]}")
# #     if len(idxs) == 2:
#     if len(idxs) >= 4:
#         if cnt >= start:
#             print(*[imageDedup[i]['image_id'] for i in idxs])
#             print(*[imageDedup[i]['phash'] for i in idxs])
#             display(HBox([
#                 widgets.Image(value=open(imageDedup[i]['file_path'], 'rb').read(), width=100, height=100)
#                 for i in idxs]))
#         cnt += 1
#         if cnt >= end:
#             print(end)
#             start = end
#             break