In [1]:
# !pip install pydash

In [2]:
from pymongo import MongoClient
from functools import cmp_to_key
from pathlib import Path
from PIL import Image
import imagehash
from tqdm.notebook import tqdm
import pydash as _

from IPython.display import display, Image

In [3]:
# Relative path `images`, resolved against the notebook's working directory.
images_dir = Path('.') / 'images'

In [4]:
# Connection to the MongoDB instance and handles to the collections used below.
mongo = MongoClient(host='172.17.0.1', port=27017)
db = mongo.get_database('bad-vis')
posts = db.get_collection('posts')
imagefiles = db.get_collection('imagefiles')
imagemeta = db.get_collection('imagemeta')

# Group image files into images

In [5]:
# Distinct image ids found in `imagefiles`.
# Only the `image_id` field is fetched (projection) instead of every full
# document, and the ids are sorted so the processing order is the same on every
# kernel run (iteration order of a set of strings is not stable across runs).
image_id_docs = imagefiles.find({}, {'image_id': 1, '_id': 0})
image_ids = sorted({doc['image_id'] for doc in image_id_docs})
len(image_ids)

8100

In [13]:
class ImageMeta:
    """Metadata record for one image, built from all of its stored file variants.

    Each element of ``imageFiles`` is a mapping describing one stored file of
    the image.  It must carry an ``image_type`` key (``thumbnail``, ``preview``,
    ``external_link`` or ``archive``) plus the keys read by the properties and
    by :meth:`sort_images` (``ext``, ``width``, ``size``, the hash fields, ...).

    The variants are ranked by :meth:`sort_images`; the best-ranked one is the
    ``main_image`` and supplies the identifying fields and hashes.

    ``duplicated_posts`` and ``related_images`` query the module-level ``posts``
    and ``imagefiles`` collections, so they (and :meth:`digest`) need those
    names to be defined.
    """

    # Attribute names collected by digest(), in output order.
    _attrs = [
        'id',
        'post_id',
        'image_id',
        'album',
        'index_in_album',
        'main_image',
        'thumbnail',
        'preview',
        'external_link',
        'archive',
        'ahash',
        'phash',
        'pshash',
        'dhash',
        'whash',
        'duplicated_posts',
        'related_images',
        'image_order'
    ]

    # Image types a file can have, in the order they are probed.
    _IMAGE_TYPES = ['thumbnail', 'preview', 'external_link', 'archive']

    # A variant is consistent with the main image when its hash distance is
    # strictly below this value.  Magic number threshold, by experiment when
    # trying out the imagehash library.
    HASH_DISTANCE_THRESHOLD = 5

    def __init__(self, imageFiles=None):
        """Store the file variants and rank them.

        Args:
            imageFiles: non-empty sequence of file mappings of one image.

        Raises:
            ValueError: if ``imageFiles`` is missing or empty.
        """
        # ``None`` replaces the former shared mutable ``[]`` default.
        if imageFiles is None or len(imageFiles) == 0:
            raise ValueError('Empty imageFiles array.')
        self._imageFiles = imageFiles
        self.image_order = self.sort_images()

    def digest(self):
        """Return a dict with every attribute named in ``_attrs``."""
        return {name: getattr(self, name) for name in ImageMeta._attrs}

    # --- fields copied from the main image ---------------------------------

    @property
    def id(self):
        return self.main_image['id']

    @property
    def post_id(self):
        return self.main_image['post_id']

    @property
    def image_id(self):
        return self.main_image['image_id']

    @property
    def album(self):
        return self.main_image['album']

    @property
    def index_in_album(self):
        return self.main_image['index_in_album']

    @property
    def ahash(self):
        return self.main_image['ahash']

    @property
    def phash(self):
        return self.main_image['phash']

    @property
    def pshash(self):
        return self.main_image['pshash']

    @property
    def dhash(self):
        return self.main_image['dhash']

    @property
    def whash(self):
        return self.main_image['whash']

    # --- hash consistency ----------------------------------------------------

    @property
    def is_hash_consistent(self):
        """True when every checked hash type is consistent across variants.

        Only ``phash`` is checked; extend the list to include ``ahash``,
        ``pshash``, ``dhash`` or ``whash`` if a stricter check is wanted.
        """
        return all(self.hash_consistent(hash_type) for hash_type in ['phash'])

    def hash_distance(self, hash_type):
        """Distances between the main image's hash and each non-thumbnail variant.

        Returns one value per entry of ``image_order`` except ``thumbnail``,
        in that order.  The main image itself is included unless it is the
        thumbnail.
        """
        main_hash = imagehash.hex_to_hash(self.main_image[hash_type])
        image_types = [t for t in self.image_order if t != 'thumbnail']
        return [
            imagehash.hex_to_hash(self.find_image_type(t)[hash_type]) - main_hash
            for t in image_types
        ]

    def hash_consistent(self, hash_type):
        """True when every distance from ``hash_distance`` is below the threshold."""
        return all(
            abs(distance) < self.HASH_DISTANCE_THRESHOLD
            for distance in self.hash_distance(hash_type)
        )

    # --- derived fields --------------------------------------------------------

    @property
    def main_image(self):
        """The best-ranked file variant (first entry of ``image_order``)."""
        return getattr(self, self.image_order[0])

    @property
    def duplicated_posts(self):
        """``duplicated_posts`` of the owning post, or ``[]`` when unavailable."""
        post = posts.find_one({'post_id': self.post_id})
        # find_one returns None when no post matches; treat it like a post
        # without the field instead of failing on the membership test.
        if post is None:
            return []
        return post.get('duplicated_posts', [])

    @property
    def related_images(self):
        """Image ids of the files stored for the same post.

        Returns ``[]`` when ``index_in_album`` is 0.
        """
        if self.index_in_album == 0:
            return []
        # NOTE(review): one id is returned per stored file of the post, so ids
        # may repeat and may include this image's own id - confirm intended.
        return [f['image_id'] for f in imagefiles.find({'post_id': self.post_id})]

    # --- access to individual variants ----------------------------------------

    @property
    def thumbnail(self):
        return self.find_image_type('thumbnail')

    @property
    def preview(self):
        return self.find_image_type('preview')

    @property
    def external_link(self):
        return self.find_image_type('external_link')

    @property
    def archive(self):
        return self.find_image_type('archive')

    def find_image_type(self, image_type):
        """Return the first file whose ``image_type`` matches, or ``None``."""
        return next(
            (f for f in self._imageFiles if f['image_type'] == image_type),
            None,
        )

    @property
    def available_image_types(self):
        """Known image types for which a file is present, in probing order."""
        return [
            image_type
            for image_type in self._IMAGE_TYPES
            if getattr(self, image_type)
        ]

    def sort_images(self):
        """Rank the available image types, most preferred first.

        Ranking rules, applied in order:
          1. ``thumbnail`` always comes last;
          2. if the extensions differ, a ``.png`` file wins;
          3. the wider file wins;
          4. at equal width, the file with the larger ``size`` wins.
        """
        def preferred(x, y):
            # cmp-style comparator: negative means x ranks before y.
            if x == 'thumbnail':
                return 1
            if y == 'thumbnail':
                return -1

            image_x = self.find_image_type(x)
            image_y = self.find_image_type(y)

            if image_x['ext'] != image_y['ext']:
                if image_x['ext'] == '.png':
                    return -1
                if image_y['ext'] == '.png':
                    return 1
                # Neither is a png: fall through to the size rules.

            if image_x['width'] == image_y['width']:
                return image_y['size'] - image_x['size']
            return image_y['width'] - image_x['width']

        return sorted(self.available_image_types, key=cmp_to_key(preferred))

In [14]:
# Build the metadata document of every image and upsert it into `imagemeta`,
# keyed by `image_id`.
for image_id in tqdm(image_ids):
    imageFiles = list(imagefiles.find({'image_id': image_id}))
    for file_doc in imageFiles:
        # Strip the file document's own `_id` before it is embedded in the
        # metadata document produced by digest().
        file_doc.pop('_id')
    imageMeta = ImageMeta(imageFiles)
    selector = {'image_id': imageMeta.image_id}
    imagemeta.replace_one(selector, imageMeta.digest(), upsert=True)

HBox(children=(FloatProgress(value=0.0, max=8100.0), HTML(value='')))




# Find images with inconsistent hashes

In [8]:
# File paths of the image currently under review; the search loop below fills
# this in and the final cell displays the files.
images = []

In [9]:
# Image ids the search loop has already checked; kept in its own cell so that
# re-running the loop skips them instead of starting over.
processed_image_id = set()

In [10]:
# Image ids that were already inspected by hand; the search loop does not report
# them again. The trailing comment on each entry records what was found.
examed_images = [
    'reddit/dataisugly/2wkifl:0', # image edited, bottom right corner
    'reddit/dataisugly/4aujij:0', # wrong preview, thumbnail
    'reddit/dataisugly/5icrv4:0', # png over gif
    'reddit/dataisugly/b8jz25:0', # preview clipped
    'reddit/dataisugly/3cdyxc:0', # external_link tinypic.com expired
    'reddit/dataisbeautiful/bx1fzb:0', # image edited, color and others changed
    'reddit/dataisugly/2htbld:0', # wrong preview, thumbnail
    'reddit/dataisugly/blxz6t:0', # visually the same, maybe difference in color?
    'reddit/dataisugly/basasb:0', # transparent background in png
    'reddit/dataisugly/6aiz8g:0', # transparent background in png
    'reddit/dataisugly/1asr8h:0', # wrong preview, thumbnail
    'reddit/dataisugly/148fmk:0', # expired imgur
    'reddit/dataisugly/3asdm5:0', # visually the same
    'reddit/dataisugly/4eymb0:0', # wrong preview, thumbnail
    'reddit/dataisugly/4pp4ua:0', # visually the same
    'reddit/dataisugly/20mwkm:0', # png over gif
]

In [11]:
# Walk the image ids until one is found whose phash is inconsistent across its
# file variants and which has not been examined by hand yet. That image's
# details are printed, its file paths are stored in `images`, and the loop stops.
for image_id in tqdm(image_ids):
    if image_id in processed_image_id:
        continue

    imageFiles = list(imagefiles.find({'image_id': image_id}))
    for file_doc in imageFiles:
        file_doc.pop('_id')
    imageMeta = ImageMeta(imageFiles)

    if imageMeta.is_hash_consistent or image_id in examed_images:
        # Nothing to report: remember the id so a re-run skips it.
        processed_image_id.add(image_id)
#         imagemeta.replace_one({'image_id': imageMeta.image_id}, imageMeta.digest(), upsert=True)
        continue

    print(image_id)
    print(imageMeta.image_order)
    print(imageMeta.hash_distance('phash'))
    # One printed list per field, each in ranking order.
    ordered_files = [imageMeta.find_image_type(t) for t in imageMeta.image_order]
    for field in ('phash', 'ext', 'width', 'height', 'size'):
        print([file_doc[field] for file_doc in ordered_files])
    images = [file_doc['file_path'] for file_doc in ordered_files]
    # The reported id is deliberately not marked as processed.
    break

HBox(children=(FloatProgress(value=0.0, max=8100.0), HTML(value='')))




In [12]:
# Render the collected image files inline, in the order they were stored.
# `Image` is IPython.display.Image here: that import comes after, and therefore
# shadows, the earlier `from PIL import Image`.
if images:
    display(*(Image(filename=path) for path in images))