In [1]:
# !pip install pydash

In [2]:
from pymongo import MongoClient
from functools import cmp_to_key
from pathlib import Path
import imagehash
from tqdm.notebook import tqdm
import pydash as _
import numpy as np
import json

from IPython.display import display, Image

In [3]:
# Open the MongoDB connection and bind the three collections this notebook reads and writes.
mongo = MongoClient(host='172.17.0.1', port=27017)
db = mongo['bad-vis']
posts, imagefiles, imagemeta = (
    db[collection_name]
    for collection_name in ('posts', 'imagefiles', 'imagemeta')
)

In [4]:
# Directory (relative to the notebook's working directory) named 'images'.
images_dir = Path('images')

In [5]:
# Post ids to skip, read from a JSON array on disk. A set gives fast membership
# tests in the grouping loop. The `with` block closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('handmade/invalid_post_ids.json') as invalid_post_ids_file:
    invalid_post_ids = set(json.load(invalid_post_ids_file))

In [6]:
# Perceptual hashes of image files to discard, read from a JSON array on disk.
# The `with` block closes the file handle deterministically instead of leaving
# it to the garbage collector.
with open('handmade/invalid_image_phashes.json') as invalid_image_phashes_file:
    invalid_image_phashes = set(json.load(invalid_image_phashes_file))

# Group image files into images

In [7]:
# Distinct image ids across all image-file documents. Only `image_id` is read,
# so a projection asks the server for that single field instead of transferring
# every full document. The set removes duplicates; its iteration order is arbitrary.
image_ids = list({
    f['image_id']
    for f in imagefiles.find({}, {'image_id': 1, '_id': 0})
})
len(image_ids)

8066

In [8]:
# Accumulator for the ImageMeta objects built by the grouping loop; re-run this
# cell before re-running that loop, otherwise entries are appended twice.
imageMetas = []

In [9]:
class ImageMeta ():
    """Combine the downloaded files of one image into a single metadata record.

    An instance wraps a non-empty list of image-file dicts (one per
    ``image_type``: thumbnail, preview, external_link, archive). It ranks them
    from most to least preferred, and copies every key of the most preferred
    file (the "main image") onto the instance as a plain attribute.

    Each file dict is read for the keys ``image_type``, ``animated``,
    ``width``, ``height``, ``ext`` and ``size`` when sorting, and for the
    requested hash key (e.g. ``phash``) when comparing hashes.
    """

    # Attribute names exported by digest(), in output order.
    _attrs = [
        'id',
        'post_id',
        'image_id',
        'short_image_id',
        'album',
        'index_in_album',
        'file_path',
        'source_platform',
        'source',
        'ext',
        'animated',
        'size',
        'width',
        'height',
        'thumbnail',
        'preview',
        'external_link',
        'archive',
        'ahash',
        'phash',
        'pshash',
        'dhash',
        'whash',
        'duplicated_posts',
        'related_images',
        'image_order'
    ]

    # Tie-break priority per image type (higher is preferred); unknown types
    # rank 0. The key must match the `image_type` value stored in the file
    # dicts, which is 'external_link'. The original table only had 'external',
    # so external links silently ranked 0, below previews.
    rank = {
        'archive': 3,
        'external_link': 2,
        'external': 2,  # legacy spelling, kept so existing lookups still work
        'preview': 1
    }

    # Hash distances strictly below this value count as "consistent".
    # Magic number chosen by experiment when trying out the imagehash library.
    max_hash_distance = 5

    def __init__ (self, imageFiles=None):
        """Store the files, compute the preference order, adopt the main image's fields.

        Args:
            imageFiles: non-empty list of image-file dicts belonging to one image.

        Raises:
            ValueError: if ``imageFiles`` is missing or empty (ValueError is an
                ``Exception`` subclass, so existing ``except Exception`` handlers
                still catch it).
        """
        # `None` default instead of a shared mutable `[]` default.
        if not imageFiles:
            raise ValueError('Empty imageFiles array.')
        self._imageFiles = imageFiles
        self.image_order = self.sort_images()
        # Expose every field of the preferred file directly on the instance.
        for k, v in self.main_image.items():
            setattr(self, k, v)

    def digest (self):
        """Return a dict with one entry per name in ``_attrs``, read from this instance."""
        return {a: getattr(self, a) for a in ImageMeta._attrs}

    @property
    def is_hash_consistent (self):
        """True when every checked hash type is consistent; currently only 'phash' is checked."""
        return all(self.hash_consistent(h) for h in ['phash'])

    def hash_distance (self, hash_type):
        """Return the distance of each non-thumbnail file's hash to the main image's hash.

        The list follows ``image_order`` with 'thumbnail' left out; the main
        image itself is included unless it is the thumbnail.
        """
        main_hash = imagehash.hex_to_hash(self.main_image[hash_type])
        image_types = [t for t in self.image_order if t != 'thumbnail']
        return [imagehash.hex_to_hash(self.find_image_type(t)[hash_type]) - main_hash for t in image_types]

    def hash_consistent (self, hash_type):
        """True when every distance from hash_distance() is below ``max_hash_distance``."""
        return all(abs(d) < self.max_hash_distance for d in self.hash_distance(hash_type))

    @property
    def main_image (self):
        """The file dict of the most preferred image type (first in ``image_order``)."""
        return getattr(self, self.image_order[0])

    @property
    def duplicated_posts (self):
        """The ``duplicated_posts`` field of this image's post, or [] when absent.

        Looks the post up in the module-level ``posts`` collection by
        ``post_id``; a missing post also yields [] instead of raising.
        """
        post = posts.find_one({'post_id': self.post_id})
        if post is None:
            return []
        return post['duplicated_posts'] if 'duplicated_posts' in post else []

    @property
    def related_images (self):
        """Distinct image ids of all files in ``imagefiles`` sharing this image's ``post_id``."""
        return list({f['image_id'] for f in imagefiles.find({'post_id': self.post_id})})

    @property
    def thumbnail (self):
        """The 'thumbnail' file dict, or None when there is none."""
        return self.find_image_type('thumbnail')

    @property
    def preview (self):
        """The 'preview' file dict, or None when there is none."""
        return self.find_image_type('preview')

    @property
    def external_link (self):
        """The 'external_link' file dict, or None when there is none."""
        return self.find_image_type('external_link')

    @property
    def archive (self):
        """The 'archive' file dict, or None when there is none."""
        return self.find_image_type('archive')

    def find_image_type (self, image_type):
        """Return the first stored file whose ``image_type`` equals ``image_type``, else None."""
        return next((f for f in self._imageFiles if f['image_type'] == image_type), None)

    @property
    def available_image_types (self):
        """Image type names, in a fixed order, for which a file is present."""
        return [image_type
             for image_type in ['thumbnail', 'preview', 'external_link', 'archive']
             if getattr(self, image_type)]

    def image_type_rank (self, source):
        """Return the priority of an image type from ``rank`` (0 for unknown types)."""
        return ImageMeta.rank.get(source, 0)

    def sort_images (self):
        """Return the available image types ordered from most to least preferred.

        Criteria, applied in order until one differs: animated before
        non-animated, more pixels first, '.png' before other extensions,
        higher ``rank`` first, larger file size first.
        """
        def preferred (x, y):
            # cmp-style comparator: negative means x sorts before y.
            image_x = getattr(self, x)
            image_y = getattr(self, y)

            # animated over non-animated
            if image_x['animated'] != image_y['animated']:
                if image_x['animated']:
                    return -1
                if image_y['animated']:
                    return 1

            pixels_x = image_x['width'] * image_x['height']
            pixels_y = image_y['width'] * image_y['height']
            # high resolution over low
            if pixels_x != pixels_y:
                return pixels_y - pixels_x

            # png over others
            if image_x['ext'] != image_y['ext']:
                if image_x['ext'] == '.png':
                    return -1
                if image_y['ext'] == '.png':
                    return 1

            rank_x = self.image_type_rank(image_x['image_type'])
            rank_y = self.image_type_rank(image_y['image_type'])
            # archive over external link over preview
            if rank_x != rank_y:
                return rank_y - rank_x

            # larger file first as the final tie-break
            return image_y['size'] - image_x['size']

        return sorted(self.available_image_types, key=cmp_to_key(preferred))

In [10]:
# Build one ImageMeta per image id, store its digest in `imagemeta`, and keep
# the object in `imageMetas` for the consistency check further down.
for image_id in tqdm(image_ids):
    imageFiles = list(imagefiles.find({'image_id': image_id}))

    # The post id of the first file decides whether the whole image is skipped.
    if imageFiles[0]['post_id'] in invalid_post_ids:
        continue

    valid_imageFiles = [
        file_doc for file_doc in imageFiles
        if file_doc['phash'] not in invalid_image_phashes
    ]
    if not valid_imageFiles:
        print(f"All image files are invalid: {image_id}")
#         display(*[Image(filename=i['file_path']) for i in imageFiles])
        continue

    # Strip the `_id` field from each file document before building the metadata.
    for file_doc in valid_imageFiles:
        del file_doc['_id']

    imageMeta = ImageMeta(valid_imageFiles)
    imagemeta.replace_one(
        {'image_id': imageMeta.image_id},
        imageMeta.digest(),
        upsert=True,
    )
    imageMetas.append(imageMeta)

HBox(children=(FloatProgress(value=0.0, max=8066.0), HTML(value='')))

All image files are invalid: reddit/dataisugly/921umu:0
All image files are invalid: reddit/dataisugly/2hbbbr:0
All image files are invalid: reddit/dataisugly/2nwubr:0
All image files are invalid: reddit/dataisugly/2ajjde:0
All image files are invalid: reddit/dataisugly/40s58x:0
All image files are invalid: reddit/dataisugly/4n6jh9:0
All image files are invalid: reddit/dataisugly/2zrizi:0
All image files are invalid: reddit/dataisugly/1ktavo:0
All image files are invalid: reddit/dataisugly/148uv0:0
All image files are invalid: reddit/dataisugly/24ammj:0
All image files are invalid: reddit/dataisugly/3f3c4w:0
All image files are invalid: reddit/dataisugly/29fugw:0
All image files are invalid: reddit/dataisugly/1mbw5k:0
All image files are invalid: reddit/dataisugly/1nlcmx:0
All image files are invalid: reddit/dataisugly/23mkib:0
All image files are invalid: reddit/dataisugly/2p3g6k:0
All image files are invalid: reddit/dataisugly/54srjj:0
All image files are invalid: reddit/dataisugly/2

# Find images with inconsistent hashes

In [11]:
# File paths of the image files to display; filled by the scan loop when it
# stops at an inconsistent image, and left empty otherwise.
images = []

In [12]:
# Image ids the scan loop has already passed, so re-running that loop resumes
# after them instead of starting over.
processed_image_id = set()

In [13]:
# Image ids whose hash inconsistency has already been looked at by hand; the
# scan loop does not report these again. The trailing comment on each entry
# records what was found. Commented-out entries are not excluded from the scan.
examed_images = [
    'reddit/dataisugly/2wkifl:0', # image edited, bottom right corner
    'reddit/dataisbeautiful/bx1fzb:0', # image edited, color and others changed
    'reddit/dataisugly/basasb:0', # transparent background in png
    'reddit/dataisugly/6aiz8g:0', # transparent background in png
    'reddit/dataisugly/blxz6t:0', # visually the same
    'reddit/dataisugly/3asdm5:0', # visually the same
    'reddit/dataisugly/4pp4ua:0', # visually the same
    'reddit/dataisugly/8a69j1:1', # visually the same
    'reddit/dataisugly/20mwkm:0', # visually the same
    'reddit/dataisugly/5icrv4:0', # visually the same
    'reddit/dataisugly/b8jz25:0', # preview clipped
#     'reddit/dataisugly/148fmk:0', # expired imgur
#     'reddit/dataisugly/1asr8h:0', # wrong preview, thumbnail
#     'reddit/dataisugly/2htbld:0', # wrong preview, thumbnail
#     'reddit/dataisugly/4aujij:0', # wrong preview, thumbnail
#     'reddit/dataisugly/4eymb0:0', # wrong preview, thumbnail
#     'reddit/dataisugly/3cdyxc:0', # external_link tinypic.com expired
#     'reddit/dataisugly/4au6ms:0', # only archive image is valid
#     'reddit/dataisugly/3mu2oh:0', # only archive image is valid
#     'reddit/dataisugly/34g2jr:0', # only archive image is valid
#     'reddit/dataisugly/40hq6g:0', # only archive image is valid
#     'reddit/dataisugly/4a4j9m:0', # only archive image is valid
#     'reddit/dataisugly/3hg337:0', # only archive image is valid
]

In [14]:
# Walk the metadata objects and stop at the first one whose hashes disagree and
# that has not been examined yet; print its details and remember its file paths.
for imageMeta in tqdm(imageMetas):
    if imageMeta.image_id in processed_image_id:
        continue

    if not imageMeta.is_hash_consistent and imageMeta.image_id not in examed_images:
        print(imageMeta.image_id)
        print(imageMeta.image_order)
        print(imageMeta.hash_distance('phash'))
        # One file dict per image type, in preference order.
        files_in_order = [imageMeta.find_image_type(t) for t in imageMeta.image_order]
        for field in ('phash', 'ext', 'width', 'height', 'size'):
            print([file_doc[field] for file_doc in files_in_order])
        images = [file_doc['file_path'] for file_doc in files_in_order]
        break

    processed_image_id.add(imageMeta.image_id)

HBox(children=(FloatProgress(value=0.0, max=7999.0), HTML(value='')))




In [15]:
# Render the files collected by the scan, if it found anything.
if images:
    display(*(Image(filename=path) for path in images))