In [1]:
# !pip install imutils
# !pip install opencv-python

In [2]:
import json
from datetime import datetime
from itertools import chain
from pathlib import Path
from shutil import copyfile, rmtree

import cv2
import imutils
from IPython.display import JSON
from PIL import Image
from pydash import omit, pick
from pymongo import MongoClient
from tqdm.notebook import tqdm

from lib.parallel import parallel
from lib.sort_things import sort_posts
# from lib import images2gif

In [3]:
# Filesystem layout, relative to the notebook's working directory.
# gallery_dir is the root the published preview/thumbnail/meta/image_lists
# folders are created under; labelling_dir holds the labelling copies.
gallery_dir = Path('../')
images_dir = gallery_dir / 'images'
labelling_dir = gallery_dir / 'labelling'

In [4]:
# Switch for the expensive image work. When True, the preview/thumbnail
# folders are wiped and every preview and thumbnail is regenerated; when
# False only the metadata and image lists are rebuilt.
# rebuild_images = True
rebuild_images = False

In [5]:
# Start from a clean slate. rmtree(..., ignore_errors=True) replaces the
# former `!rm -r` shell calls: it does not depend on a POSIX shell, is not
# affected by spaces in the paths, and stays silent when a directory does not
# exist yet (e.g. on the very first run).
if rebuild_images:
    # Rendered images are only discarded when they are about to be rebuilt.
    for _stale_dir in (
        labelling_dir/'preview',
        labelling_dir/'thumbnail',
        gallery_dir/'preview',
        gallery_dir/'thumbnail',
    ):
        rmtree(_stale_dir, ignore_errors=True)

# Metadata and image lists are regenerated on every run.
for _stale_dir in (gallery_dir/'meta', gallery_dir/'image_lists'):
    rmtree(_stale_dir, ignore_errors=True)

In [6]:
# Output folders. The labelling tree holds a preview and a thumbnail for every
# image; the gallery tree holds the published copies plus the generated
# metadata and image lists.
labelling_preview_dir = labelling_dir/'preview'
labelling_thumbnail_dir = labelling_dir/'thumbnail'

preview_dir = gallery_dir/'preview'
thumbnail_dir = gallery_dir/'thumbnail'

meta_dir = gallery_dir/'meta'
imagelist_dir = gallery_dir/'image_lists'

# Parents first: mkdir() is called without parents=True.
for _output_dir in (
    labelling_dir,
    gallery_dir,
    labelling_preview_dir,
    labelling_thumbnail_dir,
    preview_dir,
    thumbnail_dir,
    meta_dir,
    imagelist_dir,
):
    _output_dir.mkdir(exist_ok=True)

In [7]:
# MongoDB handles used throughout the notebook.
# NOTE(review): the host address is hard-coded; adjust it when the database
# runs elsewhere.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']

# posts / imagededup / vislabels are read here; imagelabel and visimages are
# dropped and refilled by this notebook.
posts, imagededup, imagelabel, visimages, vislabels = (
    db[name]
    for name in ('posts', 'imagededup', 'imagelabel', 'visimages', 'vislabels')
)

In [8]:
# Both collections are refilled from scratch by VisImage.makeMetaData(),
# so whatever a previous run left behind is discarded first.
for _collection in (imagelabel, visimages):
    _collection.drop()

In [9]:
# preview_size is only referenced by the commented-out resize step in
# VisImage.makePreview; thumbnail_size is the bounding box (in pixels) that
# thumbnails are shrunk into.
preview_size, thumbnail_size = 2048, 140
# Bytes per megabyte, used when reporting unusually large previews.
size_MB = 1024 ** 2

In [10]:
class VisImage ():
    """One de-duplicated image record, prepared for the labelling tool and the gallery.

    Wraps a raw record (a dict; the notebook passes documents read from the
    ``imagededup`` collection) and derives from it the output file names, the
    label information stored in ``vislabels`` and the metadata digest that is
    written to MongoDB and to disk.

    Every key of the record is copied onto the instance as a plain attribute,
    except ``labels``, ``datetime`` and ``duplicated_posts``, which are served
    by the computed properties of the same name.
    """

    # Fields exported by digest(), in output order. Commented-out entries are
    # fields that are deliberately not exported.
    _attrs = [
        'id',
        'post_id',
        'image_name',
        'image_path',
        'thumbnail_path',

        'datetime',
        'url',
        'title',
        'author',
        # 'source',
        # 'source_url',
        # 'source_platform',
        'popularity_score',

    #     'size',
    #     'width',
    #     'height',
        'phash',
        'duplicated_images',
        'duplicated_posts',

        'labels',
        'remarks'
    ]

    def __init__ (self, image):
        """Keep the raw record and mirror its keys as instance attributes.

        Args:
            image: dict describing one image. The methods below read the keys
                'id', 'index_in_album', 'image_id', 'post_id', 'ext',
                'file_path', 'datetime', 'duplicated_images',
                'duplicated_posts' and (optionally) 'animated'.
        """
        self._image = image

        for k, v in image.items():
            # These names are read-only properties on the class; assigning
            # them would raise AttributeError, and the raw values stay
            # reachable through self._image.
            if k in ['labels', 'datetime', 'duplicated_posts']:
                continue
            setattr(self, k, v)

    def digest (self, full=False):
        """Build the metadata dict for this image.

        Args:
            full: when False, only the fields listed in ``_attrs`` that exist
                on the instance are included. When True, every public instance
                attribute is added as well, together with the labelling paths,
                ``exclude_from_list`` and the raw ``visLabel`` document.

        Returns:
            dict of field name to value. Every value whose key contains
            '_path' has its first three characters removed; for the paths
            built by this class that strips the leading '../' of the output
            directories.
        """
        # getattr with a sentinel evaluates each (possibly database-backed)
        # property once, where hasattr() followed by getattr() evaluated it
        # twice. Both forms skip attributes that raise AttributeError.
        _missing = object()
        candidates = ((a, getattr(self, a, _missing)) for a in VisImage._attrs)
        digest = {a: v for a, v in candidates if v is not _missing}
        if full:
            dict_digest = {k:v for k, v in self.__dict__.items() if not k.startswith('_')}
            digest = {**digest, **dict_digest}
            digest['labelling_path'] = self.labelling_path
            digest['labelling_thumbnail_path'] = self.labelling_thumbnail_path
            digest['exclude_from_list'] = self.exclude_from_list
            digest['visLabel'] = self.visLabel
        # NOTE(review): with full=True this also slices any '*_path' key that
        # came straight from the record (e.g. 'file_path') — assumes those
        # start with '../' as well; confirm.
        digest = {k:v if '_path' not in k else v[3:] for k, v in digest.items()}
        return digest

    @property
    def image_name (self):
        """File stem of the image: '<id>_<index_in_album>'."""
        return f"{self._image['id']}_{self._image['index_in_album']}"

    @property
    def datetime (self):
        """The record's ISO-format 'datetime' string as a POSIX timestamp (float).

        NOTE(review): if the stored string carries no UTC offset, timestamp()
        interprets it in the local timezone — confirm what the records hold.
        """
        return datetime.fromisoformat(self._image['datetime']).timestamp()

    @property
    def visLabel (self):
        """The ``vislabels`` document that applies to this image, or None.

        Looks up labels stored for this image or for any of its duplicated
        images. When several are found, those carrying 'flag:duplicated' are
        discarded (a message is printed either way) and the first remaining
        document wins. The result, including "no label", is cached on the
        instance so the database is queried at most once.
        """
        if not hasattr(self, '_labels'):
            image_ids = [self._image['image_id']] + list(self._image['duplicated_images'])
            visLabels = list(vislabels.find({'image_id': {'$in': image_ids}}))

            if len(visLabels) > 1:
                duplicated_labels = [l for l in visLabels if 'flag:duplicated' in l['labels']]
                if (len(duplicated_labels) > 0):
                    print(f"labels duplicated: more than 1 label {len(duplicated_labels)} {image_ids} {visLabels}")
                    visLabels = [l for l in visLabels if 'flag:duplicated' not in l['labels']]
                else:
                    print(f"labels warning: more than 1 label {image_ids} {visLabels}")

            # The list can be empty here either because nothing was found or
            # because every candidate was flagged as duplicated; indexing [0]
            # unconditionally raised IndexError in the latter case. Such an
            # image is treated as unlabelled.
            self._labels = visLabels[0] if visLabels else None
        return self._labels

    @property
    def labels (self):
        """Labels of the image without 'flag:starred'; [] when unlabelled.

        'layout:animated' is appended when the record is marked as animated.
        """
        if not self.visLabel:
            return []
        labels = [l for l in self.visLabel['labels'] if l != 'flag:starred']
        if self._image.get('animated'):
            labels.append('layout:animated')
        return labels

    @property
    def duplicated_posts (self):
        """Summaries of the other posts that contain this image.

        Post ids come from the record's 'duplicated_posts' plus the ids of its
        'duplicated_images' with the trailing '_<index>' part removed. The
        image's own post is left out.

        Returns:
            list of dicts with 'url', 'title', 'author', 'post_id' and a
            'datetime' timestamp, in the order produced by sort_posts().
        """
        image_post_ids = ['_'.join(image_id.split('_')[:-1]) for image_id in self._image['duplicated_images']]
        duplicated_post_ids = set(self._image['duplicated_posts'] + image_post_ids)

        duplicated_posts = []
        for post_id in duplicated_post_ids:
            if post_id == self.post_id:
                continue
            post = posts.find_one({'post_id': post_id})
            if post is None:
                # find_one() returns None for an unknown id; skip it instead
                # of failing later on p['datetime'].
                print(f"duplicated post missing: {post_id} (referenced by {self.image_name})")
                continue
            duplicated_posts.append(post)

        return [{
            **pick(p, ['url', 'title', 'author', 'post_id']),
            'datetime': datetime.fromisoformat(p['datetime']).timestamp()
        } for p in sort_posts(duplicated_posts)]

    @property
    def exclude_from_list (self):
        """True when the image must not be published in the gallery.

        That is the case when it has no labels at all, or carries any of the
        flags invalid / needreview / notbad / parody.
        """
        labels = self.labels
        blocking_flags = ('flag:invalid', 'flag:needreview', 'flag:notbad', 'flag:parody')
        return len(labels) == 0 or any(flag in labels for flag in blocking_flags)

    @property
    def remarks (self):
        """Free-text remarks of the label document; '' when unlabelled."""
        if not self.visLabel:
            return ''
        return self.visLabel['remarks']

    @property
    def filename (self):
        """Output file name: image_name followed by the record's 'ext'."""
        return f"{self.image_name}{self._image['ext']}"

    @property
    def labelling_path (self):
        """Path (str) of the preview inside the labelling tree."""
        return str(labelling_preview_dir/self.filename)

    @property
    def labelling_thumbnail_path (self):
        """Path (str) of the thumbnail inside the labelling tree."""
        return str(labelling_thumbnail_dir/self.filename)

    @property
    def image_path (self):
        """Path (str) of the published preview inside the gallery tree."""
        return str(preview_dir/self.filename)

    @property
    def thumbnail_path (self):
        """Path (str) of the published thumbnail; always named '<image_name>.jpg'."""
        return str(thumbnail_dir/f"{self.image_name}.jpg")

    def makePreview (self, direct_copy=False):
        """Write the preview image, and publish it when the image is listed.

        Animated images (or any image when ``direct_copy`` is True) are copied
        unchanged. Everything else is re-saved with ``optimize=True``; if that
        does not make the file smaller, the original is copied over it. A
        line is printed for re-encoded files larger than 3 MB.

        Args:
            direct_copy: copy the source file without re-encoding.
        """
        source_path = self._image['file_path']
        # .get() matches `labels`: a record without the key is not animated.
        if direct_copy or self._image.get('animated'):
            copyfile(source_path, self.labelling_path)
        else:
            # Context manager so the source file handle is released promptly.
            with Image.open(source_path) as im:
                # JPEG output needs RGB pixels.
                out = im.convert('RGB') if '.jpg' in self.labelling_path else im
                # Downscaling to preview_size is currently disabled; images
                # are kept at their original resolution.
                out.save(self.labelling_path, optimize=True)

            old_size = Path(source_path).stat().st_size
            new_size = Path(self.labelling_path).stat().st_size
            if new_size >= old_size:
                copyfile(source_path, self.labelling_path)
            if new_size > 3 * size_MB:
                print(new_size/size_MB, old_size/size_MB, new_size/old_size, source_path, self.labelling_path)

        if not self.exclude_from_list:
            copyfile(self.labelling_path, self.image_path)

    def makeThumbnail (self):
        """Write the thumbnail, and publish it when the image is listed.

        The image is converted to RGB and shrunk in place to fit a
        thumbnail_size x thumbnail_size box.
        """
        # convert() returns a fully loaded copy, so the source file can be
        # closed right away.
        with Image.open(self._image['file_path']) as source:
            im = source.convert('RGB')
        im.thumbnail((thumbnail_size, thumbnail_size), Image.LANCZOS)
        im.save(self.labelling_thumbnail_path, optimize=True)

        if not self.exclude_from_list:
            # NOTE(review): the labelling thumbnail keeps the source extension
            # while thumbnail_path always ends in '.jpg', so for non-JPEG
            # sources the published file's bytes do not match its extension —
            # confirm this is intended.
            copyfile(self.labelling_thumbnail_path, self.thumbnail_path)

    def makeMetaData (self):
        """Store the metadata of this image.

        The full digest always goes to the ``imagelabel`` collection. For
        listed images the short digest is additionally written to
        '<image_name>.json' in the meta directory and inserted into the
        ``visimages`` collection.
        """
        imagelabel.insert_one(self.digest(full=True))

        if not self.exclude_from_list:
            digest = self.digest()
            # Write the file before insert_one(), which adds an '_id' to the
            # dict it is given.
            with open(meta_dir/f"{self.image_name}.json", 'w') as f:
                json.dump(digest, f)
            visimages.insert_one(digest)

In [11]:
def finalize_image (image):
    """Turn one raw image record into a VisImage and persist its metadata.

    The preview and the thumbnail are only (re)generated when the notebook's
    ``rebuild_images`` switch is on; the metadata is always written.

    Args:
        image: raw image record (dict) handed to VisImage.

    Returns:
        The VisImage built from ``image``.
    """
    record = VisImage(image)
    if rebuild_images:
        for build in (record.makePreview, record.makeThumbnail):
            build()
    record.makeMetaData()
    return record

In [12]:
# Run finalize_image over every document of the imagededup collection.
# `total` is the collection's estimated size — presumably used by
# lib.parallel for the progress bar; confirm against that helper.
_records = imagededup.find()
_record_count = imagededup.estimated_document_count()
visImages = parallel(finalize_image, _records, total=_record_count)

HBox(children=(FloatProgress(value=0.0, max=5823.0), HTML(value='')))




In [13]:
# Keep only the images that are published in the gallery.
# NOTE: the next code cell rebinds visImages to the documents read back from
# the visimages collection.
visImages = [visImage for visImage in visImages if not visImage.exclude_from_list]

# Image Lists

In [14]:
# Read the published image digests back from MongoDB as plain dicts.
visImages = list(visimages.find())

In [15]:
# Order by popularity_score, ascending (stable sort); the following cell
# reverses the list so the most popular image comes first.
visImages.sort(key=lambda x: x['popularity_score'])

In [16]:
# Highest popularity_score first. Kept as a separate reverse() on purpose:
# unlike sort(reverse=True) it also reverses the relative order of images
# with equal scores.
visImages.reverse()

In [17]:
# Number of gallery images read back from the visimages collection.
len(visImages)

911

In [18]:
# Drop MongoDB's '_id' before the records are serialised with json.dump.
# pop() with a default (instead of `del`) keeps this cell re-runnable: on a
# second run the key is already gone and `del` would raise KeyError.
for visImage in visImages:
    visImage.pop('_id', None)
    # visImage.pop('duplicated_images', None)

In [19]:
# Complete image list in one compact JSON file. The `with` block closes the
# file deterministically; the bare open() left the handle to the garbage
# collector.
with open(imagelist_dir/'all.json', 'w') as f:
    json.dump({'images': visImages}, f, separators=(',', ':'))

In [20]:
# Paginated image lists: '<offset>.json' holds one batch plus the file name
# of the following batch ('next' is None on the last one).
# The first batch has 225 images (presumably the 15 x 15 grid of the sprite
# sheet sketched at the end of the notebook — confirm); later batches 400.
first_batch_size = 225
batch_size = 400

batch_cuts = [0] + list(range(first_batch_size, len(visImages), batch_size)) + [len(visImages)]
for batch_begin, batch_end in zip(batch_cuts[:-1], batch_cuts[1:]):
    batch = {
        'images': visImages[batch_begin:batch_end],
        'next': f"{batch_end}.json" if batch_end != len(visImages) else None
    }
    # `with` closes each file as soon as it is written.
    with open(imagelist_dir/f"{batch_begin}.json", 'w') as f:
        json.dump(batch, f, separators=(',', ':'))

# Label Options

In [21]:
# labelOptions = [{
#     'tag': 'fault',
#     'name': 'Issues',
#     'options': [
#         'percentage:sum', 'percentage:encoding', 'percentage:wholepart',
#         'label', 'description',
#         'axis', 'axis:label', 'axis:flipped', 'axis:truncated', "axis:missing", 'axis:double', # axis:dual
#         'legend',
#         'color', 'color:over12',
#         'scale', 'scale:log', 'scale:inconsistent', 'binning',
#         'area',
#         'picto:distortion', 'picto:area',
#         'position', 'itemorder',
#         'connection',
#         '3d', 'animation',
#         'cluttering', 'occulusion',
#         'data', 'data:selective', 'data:questionable', 'data:missingvalues', 'data:redundant', 'data:prediction',
#         'index:comparison', # index:crossbasiscomparison
#         'parody', 'faultylogic', 'missingcontext',
#         'confirmationbias', 'chartjunk', 'betteralternative',
#         'faultystatistics', 'invalidcomparison',
#         'map:population', 'invalidencoding',
#         'unreadable', 'visuallyawful',
#         'encoding', 'cannotaddup', 'legibility'
#     ]}, {
#     'tag': 'form',
#     'name': 'Chart Type',
#     'options': [
#         'barchart', 'linechart', 'scatterplot', 'dotplot', 'barcodechart', 'areachart', 'histogram',
#         'nodelink', 'radarchart', 'bubblechart', 'dumbbellplot',
#         'piechart', 'donutchart',
#         'pyramid', 'venn', 'choropleth', 'flowmap', 'map', 'chernoffface',
#         'table', 'heatmap', 'treemap', 'quadrant',
#         'boxplot', 'violinplot', 'parallelcoor', 'streamgraph',
#         'pictogram', 'guagechart', 'sankeydiagram', 'chorddiagram', 'sunbrust', 'voronoi',
#         'unknown'
#     ]}, {
#     'tag': 'data',
#     'name': 'Data Types',
#     'options': [
#         'categorical', 'quantitative', 'indexvalue', 'accumulated',
#         'percentage', 'probability',
#         'ordinal', 'ranking', 'sequential',
#         'timeseries', 'cyclic',
#         'geospatial', 'flow',
#         'network', 'tree',
#         'set', 'bitmap',
#         'text', 'multivariate',
#     ]}, {
#     'tag': 'media',
#     'name': 'Medium',
#     'options': [
#         'inreallife', 'printed', 'handdrawn', 'tv', 'ads', 'news', 'NSFW'
#     ]}, {
#     'tag': 'layout',
#     'name': 'Layout',
#     'options': [
#         'circular', 'infographics', 'stacked', 'map', 'juxtaposition', 'overlay', 'mixed'
#     ]}, {
#     'tag': 'metaphor',
#     'name': 'Metaphor',
#     'options': [
#         'pictograph', 'periodictable', 'gear', 'clock'
#     ]}, {
#     'tag': 'flag',
#     'name': 'Flag',
#     'options': [
#         'needreview', 'invalid', 'notbad', 'starred'
#     ]}
# ]

In [22]:
# dump all tags but flag
# json.dump(labelOptions[:-1], open(imagelist_dir/f"labelOptions.json", 'w'), separators=(',', ':'))

In [23]:
# Load the label options; `with` closes the file handle that the bare open()
# call used to leave open.
with open('tmp/labelOptions.json') as f:
    labelOptions = json.load(f)

In [24]:
# Display the loaded label options for inspection.
JSON(labelOptions)

<IPython.core.display.JSON object>

In [25]:
# Load the label tags; `with` closes the file handle that the bare open()
# call used to leave open.
with open('tmp/labelTags.json') as f:
    labelTags = json.load(f)

In [26]:
# Display the loaded label tags for inspection.
JSON(labelTags)

<IPython.core.display.JSON object>

In [27]:
# dump all tags but flag
# Assumes the flag group is the last entry of labelOptions — TODO confirm
# against tmp/labelOptions.json.
with open(imagelist_dir/'labelOptions.json', 'w') as f:
    json.dump(labelOptions[:-1], f, separators=(',', ':'))

In [28]:
# dump all tags but flag
# NOTE: the first statement replaces labelTags['categories'] with a slice of
# itself, so re-running this cell drops one more category each time; reload
# labelTags from tmp/labelTags.json before running it again.
# Assumes the flag category is the last entry of 'categories' — TODO confirm.
labelTags['categories'] = labelTags['categories'][:-1]
labelTags['tags'] = [t for t in labelTags['tags'] if t['category'] != 'flag']
# `with` closes the output file once it has been written.
with open(imagelist_dir/'labelTags.json', 'w') as f:
    json.dump(labelTags, f, separators=(',', ':'))

# Sprite

In [29]:
# ims = []
# sprites_path = str(gallery_dir/'0.jpg')
# for i in range(225):
#     visImage = visImages[i]
#     visImage['sprites_path'] = sprites_path
#     visImage['sprites_pos_x'] = (i % 15) * thumbnail_size
#     visImage['sprites_pos_y'] = (i // 15) * thumbnail_size
#     im = cv2.imread(visImage['thumbnail_path'])
#     w, h = im.shape[:2]
#     new_im = cv2.copyMakeBorder(im, 0, thumbnail_size-w, 0, thumbnail_size-h, cv2.BORDER_CONSTANT, value=[0, 0, 0])
#     ims.append(new_im)
# montages = imutils.build_montages(ims, (140, 140), (15, 15))
# cv2.imwrite(sprites_path, montages[0])