In [1]:
# !pip install networkx

In [2]:
from pymongo import MongoClient
from functools import cmp_to_key
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import simplejson as json
import networkx as nx

from lib.image_dedup import make_hashes, calculate_distance

In [3]:
# Connect to MongoDB (host and port are hard-coded here) and bind the three
# collections of the 'bad-vis' database that the rest of the notebook uses.
mongo = MongoClient(host='172.17.0.1', port=27017)
db = mongo['bad-vis']
posts, imagemeta, imagededup = (
    db[name] for name in ('posts', 'imagemeta', 'imagededup')
)

In [4]:
# Directories used by the notebook, relative to the current working directory.
images_dir, handmade_dir = Path('./images'), Path('./handmade')
# Make sure ./handmade exists; exist_ok keeps re-runs of this cell from failing.
handmade_dir.mkdir(exist_ok=True)

In [5]:
# imagededup.drop()
# imagededup.insert_many([i for i in imagemeta.find()])

In [6]:
# Load every document of the `imagededup` collection, ordered by ascending image_id.
imageMetas = sorted(imagededup.find(), key=lambda meta: meta['image_id'])

In [7]:
# Lookup table: image_id -> position of that image in `imageMetas`.
image_id_to_idx_mapping = {meta['image_id']: idx for idx, meta in enumerate(imageMetas)}

def image_id_to_idx (image_id):
    """Return the position of `image_id` in `imageMetas`, or None if the id is unknown."""
    return image_id_to_idx_mapping.get(image_id)

In [8]:
# One `make_hashes` result per image, in the same order as `imageMetas`.
image_hashes = list(map(make_hashes, imageMetas))

In [9]:
# Distances between images, computed by lib.image_dedup.calculate_distance from the
# hashes above. Later cells index the result as `distance[idx_x, idx_y]` with
# positions in `imageMetas`, so it is presumably a square matrix -- TODO confirm.
# NOTE(review): the saved output of this cell is
# "ValueError: range() arg 3 must not be zero", raised somewhere inside this call.
# The cause is not visible from this notebook; check calculate_distance (an empty
# `image_hashes` list is one candidate to rule out).
distance = calculate_distance(image_hashes)

ValueError: range() arg 3 must not be zero

In [None]:
def set_distance (image_id_x, image_id_y, value, mat=distance):
    """Symmetrically store `value` as the distance between two images.

    Parameters:
        image_id_x, image_id_y: image ids, resolved to matrix indices with
            `image_id_to_idx`.
        value: value written to both `mat[x, y]` and `mat[y, x]`.
        mat: matrix updated in place. The default is bound when this cell is
            executed, i.e. it is the `distance` object that existed at that time.

    Returns None. Nothing is written when either image id is unknown.
    """
    idx_x = image_id_to_idx(image_id_x)
    idx_y = image_id_to_idx(image_id_y)
    # Compare with None explicitly: index 0 is a valid position, and a plain
    # truthiness test would wrongly treat the first image as "not found".
    if idx_x is None or idx_y is None:
        return
    mat[idx_x, idx_y] = value
    mat[idx_y, idx_x] = value

def set_distance_pairs (id_pairs, value, mat=distance):
    """Call `set_distance` with the same `value` for every entry of `id_pairs`.

    Each entry is an iterable of image ids; its items are unpacked into the
    leading positional arguments of `set_distance`.
    """
    for pair in id_pairs:
        set_distance(*pair, value, mat=mat)

In [28]:
# Treat the boolean matrix `distance <= 1` as a graph adjacency matrix and keep
# every connected component that contains more than one image; each component
# is reported as a list of image ids.
duplicated_images = [
    [imageMetas[idx]['image_id'] for idx in component]
    for component in nx.connected_components(nx.Graph(distance <= 1))
    if len(component) > 1
]
len(duplicated_images)

738

In [30]:
class ImageDedup ():
    """Collapse a group of duplicated images, and the posts they belong to, into one record.

    The "main" image and "main" post of the group are the first entries returned by
    `sort_images` and `sort_posts`. Their fields are copied onto the instance as
    attributes, and `digest` exports the attributes named in `_attrs` as a dict.

    Post documents are looked up through the module-level `posts` collection.
    """

    # Attribute names exported by `digest` (only those actually present on the instance).
    _attrs = [
        'id',
        'post_id',
        'datetime',
        'url',
        'title',
        'content',
        'author',
        'removed',
        'ups',
        'num_comments',
        'external_link',
        'source',
        'source_platform',
        'source_url',
        'tags',
        'labels',
        'media_type',

        'thumbnail',
        'preview',
        'external_link',
        'archive',

        'image_id',
        'short_image_id',
        'album',
        'index_in_album',
        'file_path',
        'ext',
        'animated',
        'size',
        'width',
        'height',
        'ahash',
        'phash',
        'pshash',
        'dhash',
        'whash',

        'duplicated_posts',
        'related_images',
        'duplicated_images'
    ]

    # Source preference used when sorting: a higher value is preferred, and
    # sources missing from this table count as 0 (see `source_rank`).
    rank = {
        'dataisugly': 3,
        'wtf-viz': 2,
        'badvisualisations': 1
    }

    # Document keys that collide with read-only properties of this class.
    # Assigning them with setattr would raise AttributeError, so they are
    # skipped when copying document fields onto the instance.
    _computed = ('duplicated_posts', 'duplicated_images')

    def __init__ (self, imageMetas=[]):
        """Build the merged record for one group of duplicated images.

        Parameters:
            imageMetas: non-empty list of image documents (dicts). Each must
                provide at least 'image_id' and 'post_id'; sorting more than
                one image also reads 'animated', 'width', 'height', 'ext',
                'size' and 'source'.

        Raises:
            Exception: if `imageMetas` is empty.
        """
        if len(imageMetas) == 0:
            raise Exception('Empty imageFiles array.')
        self._imageMetas = imageMetas
        self._image_ids = [i['image_id'] for i in imageMetas]
        self._image_order = self.sort_images()
        self._post_ids = {i['post_id'] for i in imageMetas}
        self._posts = [posts.find_one({'post_id': i}) for i in self._post_ids]
        self._post_order = self.sort_posts()
        # Copy the main image's fields first ...
        for k, v in self.main_image.items():
            if k in ImageDedup._computed:
                continue
            setattr(self, k, v)
        # ... then the main post's fields, which overwrite image fields that share
        # a key. The post's 'preview' / 'thumbnail' are stored under a '_url'
        # suffix so they do not replace the image's own values.
        for k, v in self.main_post.items():
            if k in ImageDedup._computed:
                continue
            if k in ['preview', 'thumbnail']:
                setattr(self, f"{k}_url", v)
            else:
                setattr(self, k, v)

    def digest (self):
        """Return a dict of every attribute listed in `_attrs` that exists on this instance."""
        return {a:getattr(self, a) for a in ImageDedup._attrs if hasattr(self, a)}

    @property
    def duplicated_posts (self):
        """Ids of all posts in this group except the main post.

        Combines the post ids of the grouped images with the 'duplicated_posts'
        already recorded on each post document. The order of the returned list
        is unspecified.
        """
        post_ids = set(self._post_ids)
        for p in self._posts:
            # Look for the key on the post document itself (not in the id list),
            # otherwise previously recorded duplicates are never merged in.
            post_ids.update(p.get('duplicated_posts') or ())
        post_ids.discard(self.post_id)
        return list(post_ids)

    @property
    def duplicated_images (self):
        """Image ids of the group, excluding the main image."""
        return [i for i in self._image_ids if i != self.image_id]

    @property
    def main_post (self):
        """The most preferred post of the group (first entry of `sort_posts`)."""
        return self._post_order[0]

    def post_score (self, post):
        """Return ups + 5 * num_comments for `post`; a missing field counts as 0."""
        return post.get('ups', 0) + 5 * post.get('num_comments', 0)

    def source_rank (self, source):
        """Return the preference rank of `source`, or 0 when it is not in `rank`."""
        return ImageDedup.rank.get(source, 0)

    def sort_posts (self):
        """Return the group's posts ordered from most to least preferred.

        Posts are compared by source rank first (higher first), then by
        `post_score` (higher first).
        """
        def preferred (post_x, post_y):
            rank_x = self.source_rank(post_x['source'])
            rank_y = self.source_rank(post_y['source'])
            if rank_x != rank_y:
                return rank_y - rank_x
            post_score_x = self.post_score(post_x)
            post_score_y = self.post_score(post_y)
            return post_score_y - post_score_x

        return sorted(self._posts, key=cmp_to_key(preferred))

    @property
    def main_image (self):
        """The most preferred image of the group (first entry of `sort_images`)."""
        return self._image_order[0]

    def sort_images (self):
        """Return the group's images ordered from most to least preferred.

        Preference order: animated images first, then the larger pixel count
        (width * height); for equal pixel counts a '.png' file wins over other
        extensions, then the larger 'size', then the higher source rank.
        """
        def preferred (image_x, image_y):
            if image_x['animated'] != image_y['animated']:
                if image_x['animated']:
                    return -1
                if image_y['animated']:
                    return 1

            pixels_x = image_x['width'] * image_x['height']
            pixels_y = image_y['width'] * image_y['height']

            if pixels_x == pixels_y:

                if image_x['ext'] != image_y['ext']:
                    if image_x['ext'] == '.png':
                        return -1
                    if image_y['ext'] == '.png':
                        return 1

                if image_x['size'] != image_y['size']:
                    return image_y['size'] - image_x['size']

                return self.source_rank(image_y['source']) - self.source_rank(image_x['source'])
            else:
                return pixels_y - pixels_x

        return sorted(self._imageMetas, key=cmp_to_key(preferred))

In [34]:
# Collapse each group of duplicated images into a single document in `imagededup`.
for ids in duplicated_images:
    # Build the merged record from the documents stored in `imagemeta` for this group.
    # NOTE(review): find_one presumably yields None for an id missing from `imagemeta`,
    # which ImageDedup does not handle -- confirm every id exists there.
    imagedd = ImageDedup([imagemeta.find_one({'image_id': i}) for i in ids])
    # Delete the documents of the non-main images of the group ...
    for i in imagedd.duplicated_images:
        imagededup.delete_one({'image_id': i})
    # ... then replace the main image's document with the digest (inserted if absent).
    imagededup.replace_one({'image_id': imagedd.image_id}, imagedd.digest(), upsert=True)

In [None]:
# One boolean mask per integer i in 0..19: True where i-1 < distance <= i.
similarities = [
    (distance > (i - 1)) & (distance <= i)
    for i in range(20)
]

In [None]:
# For every mask, the connected components (sets of indices into `imageMetas`)
# that contain more than one image.
similar_image_idxs = [
    [group for group in nx.connected_components(nx.Graph(mask)) if len(group) > 1]
    for mask in similarities
]

In [None]:
# Number of multi-image groups found in each of the 20 distance bands.
dict((i, len(similar_image_idxs[i])) for i in range(20))

In [None]:
# Inspect the groups found in one distance band: print ids and file paths, and
# show thumbnails side by side for groups of at most four images.
threshold = 7
print(f"distance <= {threshold} & > {threshold-1} total: {len(similar_image_idxs[threshold])}")
for idxs in similar_image_idxs[threshold]:
    print(f"{[imageMetas[i]['image_id'] for i in idxs]}")
    print(f"file_path: {[imageMetas[i]['file_path'] for i in idxs]}")
    if len(idxs) > 4:
        continue
    # NOTE(review): `HBox` and `widgets` are not imported in the visible import
    # cell -- presumably they come from ipywidgets; confirm and add the import
    # so this cell works on a fresh kernel.
    # Path.read_bytes() closes the file after reading, unlike a bare open().read().
    display(HBox([
        widgets.Image(value=Path(imageMetas[i]['file_path']).read_bytes(), width=150, height=150)
        for i in idxs
    ]))